Diffstat (limited to 'src/freedreno')
24 files changed, 13731 insertions, 1 deletion
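The centerpiece of this import is the ir3 instruction-set definition (ir3/instr-a3xx.h) and disassembler (ir3/disasm-a3xx.c). Every a3xx+ instruction occupies two dwords, with the top three bits of the second dword selecting the instruction category, and disasm_a3xx() decodes a buffer two dwords at a time. A minimal sketch of driving the disassembler, assuming only the disasm_a3xx() prototype declared in this patch (dump_shader and its buffer are hypothetical):

#include <stdio.h>
#include <stdint.h>

#include "instr-a3xx.h"   /* declares disasm_a3xx() */

/* Dump raw shader dwords.  disasm_a3xx() consumes two dwords per
 * instruction (it asserts that sizedwords is even), and level selects
 * the indentation prefix from its internal levels[] table (0 = none).
 */
static void dump_shader(uint32_t *dwords, int sizedwords)
{
	disasm_a3xx(dwords, sizedwords, 0, stdout);
}
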
diff --git a/src/freedreno/Makefile.am b/src/freedreno/Makefile.am index 9ddc3c0ad35..8f027e34f8a 100644 --- a/src/freedreno/Makefile.am +++ b/src/freedreno/Makefile.am @@ -45,7 +45,8 @@ TESTS = BUILT_SOURCES = CLEANFILES = EXTRA_DIST = \ - drm/meson.build + drm/meson.build \ + ir3/meson.build MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D) PYTHON_GEN = $(AM_V_GEN)$(PYTHON) $(PYTHON_FLAGS) @@ -57,3 +58,19 @@ noinst_LTLIBRARIES += libfreedreno_drm.la libfreedreno_drm_la_SOURCES = $(drm_SOURCES) libfreedreno_drm_la_CFLAGS = $(VALGRIND_CFLAGS) $(LIBDRM_CFLAGS) +noinst_LTLIBRARIES += libfreedreno_ir3.la + +libfreedreno_ir3_la_SOURCES = $(ir3_SOURCES) $(ir3_GENERATED_FILES) +libfreedreno_ir3_la_CFLAGS = \ + -I$(top_srcdir)/src/freedreno/ir3 \ + -I$(top_builddir)/src/compiler/nir \ + -I$(top_srcdir)/src/compiler/nir +libfreedreno_ir3_la_LIBADD = \ + $(top_builddir)/src/compiler/nir/libnir.la \ + $(top_builddir)/src/util/libmesautil.la + +MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D) +ir3/ir3_nir_trig.c: ir3/ir3_nir_trig.py $(top_srcdir)/src/compiler/nir/nir_algebraic.py + $(MKDIR_GEN) + $(AM_V_GEN) $(PYTHON) $(PYTHON_FLAGS) $(srcdir)/ir3/ir3_nir_trig.py -p $(top_srcdir)/src/compiler/nir > $@ || ($(RM) $@; false) + diff --git a/src/freedreno/Makefile.sources b/src/freedreno/Makefile.sources index 06a1a99b9e2..1df5e6250b5 100644 --- a/src/freedreno/Makefile.sources +++ b/src/freedreno/Makefile.sources @@ -15,3 +15,27 @@ drm_SOURCES := \ drm/msm_drm.h \ drm/msm_ringbuffer.c +ir3_SOURCES := \ + ir3/disasm-a3xx.c \ + ir3/instr-a3xx.h \ + ir3/ir3.c \ + ir3/ir3_compiler.c \ + ir3/ir3_compiler.h \ + ir3/ir3_compiler_nir.c \ + ir3/ir3_cp.c \ + ir3/ir3_depth.c \ + ir3/ir3_group.c \ + ir3/ir3.h \ + ir3/ir3_legalize.c \ + ir3/ir3_nir.c \ + ir3/ir3_nir.h \ + ir3/ir3_nir_lower_tg4_to_tex.c \ + ir3/ir3_print.c \ + ir3/ir3_ra.c \ + ir3/ir3_sched.c \ + ir3/ir3_shader.c \ + ir3/ir3_shader.h + +ir3_GENERATED_FILES := \ + ir3/ir3_nir_trig.c + diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c new file mode 100644 index 00000000000..4cf45ce9227 --- /dev/null +++ b/src/freedreno/ir3/disasm-a3xx.c @@ -0,0 +1,1038 @@ +/* + * Copyright (c) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdbool.h> +#include <string.h> +#include <assert.h> + +#include <util/u_debug.h> + +#include "instr-a3xx.h" + +/* bitmask of debug flags */ +enum debug_t { + PRINT_RAW = 0x1, /* dump raw hexdump */ + PRINT_VERBOSE = 0x2, +}; + +static enum debug_t debug; + +#define printf debug_printf + +static const char *levels[] = { + "", + "\t", + "\t\t", + "\t\t\t", + "\t\t\t\t", + "\t\t\t\t\t", + "\t\t\t\t\t\t", + "\t\t\t\t\t\t\t", + "\t\t\t\t\t\t\t\t", + "\t\t\t\t\t\t\t\t\t", + "x", + "x", + "x", + "x", + "x", + "x", +}; + +static const char *component = "xyzw"; + +static const char *type[] = { + [TYPE_F16] = "f16", + [TYPE_F32] = "f32", + [TYPE_U16] = "u16", + [TYPE_U32] = "u32", + [TYPE_S16] = "s16", + [TYPE_S32] = "s32", + [TYPE_U8] = "u8", + [TYPE_S8] = "s8", +}; + +struct disasm_ctx { + FILE *out; + int level; + + /* current instruction repeat flag: */ + unsigned repeat; +}; + +static void print_reg(struct disasm_ctx *ctx, reg_t reg, bool full, bool r, + bool c, bool im, bool neg, bool abs, bool addr_rel) +{ + const char type = c ? 'c' : 'r'; + + // XXX I prefer - and || for neg/abs, but preserving format used + // by libllvm-a3xx for easy diffing.. + + if (abs && neg) + fprintf(ctx->out, "(absneg)"); + else if (neg) + fprintf(ctx->out, "(neg)"); + else if (abs) + fprintf(ctx->out, "(abs)"); + + if (r) + fprintf(ctx->out, "(r)"); + + if (im) { + fprintf(ctx->out, "%d", reg.iim_val); + } else if (addr_rel) { + /* I would just use %+d but trying to make it diff'able with + * libllvm-a3xx... + */ + if (reg.iim_val < 0) + fprintf(ctx->out, "%s%c<a0.x - %d>", full ? "" : "h", type, -reg.iim_val); + else if (reg.iim_val > 0) + fprintf(ctx->out, "%s%c<a0.x + %d>", full ? "" : "h", type, reg.iim_val); + else + fprintf(ctx->out, "%s%c<a0.x>", full ? "" : "h", type); + } else if ((reg.num == REG_A0) && !c) { + fprintf(ctx->out, "a0.%c", component[reg.comp]); + } else if ((reg.num == REG_P0) && !c) { + fprintf(ctx->out, "p0.%c", component[reg.comp]); + } else { + fprintf(ctx->out, "%s%c%d.%c", full ? "" : "h", type, reg.num & 0x3f, component[reg.comp]); + } +} + + +static void print_reg_dst(struct disasm_ctx *ctx, reg_t reg, bool full, bool addr_rel) +{ + print_reg(ctx, reg, full, false, false, false, false, false, addr_rel); +} + +static void print_reg_src(struct disasm_ctx *ctx, reg_t reg, bool full, bool r, + bool c, bool im, bool neg, bool abs, bool addr_rel) +{ + print_reg(ctx, reg, full, r, c, im, neg, abs, addr_rel); +} + +/* TODO switch to using reginfo struct everywhere, since more readable + * than passing a bunch of bools to print_reg_src + */ + +struct reginfo { + reg_t reg; + bool full; + bool r; + bool c; + bool im; + bool neg; + bool abs; + bool addr_rel; +}; + +static void print_src(struct disasm_ctx *ctx, struct reginfo *info) +{ + print_reg_src(ctx, info->reg, info->full, info->r, info->c, info->im, + info->neg, info->abs, info->addr_rel); +} + +//static void print_dst(struct disasm_ctx *ctx, struct reginfo *info) +//{ +// print_reg_dst(ctx, info->reg, info->full, info->addr_rel); +//} + +static void print_instr_cat0(struct disasm_ctx *ctx, instr_t *instr) +{ + instr_cat0_t *cat0 = &instr->cat0; + + switch (cat0->opc) { + case OPC_KILL: + fprintf(ctx->out, " %sp0.%c", cat0->inv ? "!" : "", + component[cat0->comp]); + break; + case OPC_BR: + fprintf(ctx->out, " %sp0.%c, #%d", cat0->inv ? "!" 
: "", + component[cat0->comp], cat0->a3xx.immed); + break; + case OPC_JUMP: + case OPC_CALL: + fprintf(ctx->out, " #%d", cat0->a3xx.immed); + break; + } + + if ((debug & PRINT_VERBOSE) && (cat0->dummy2|cat0->dummy3|cat0->dummy4)) + fprintf(ctx->out, "\t{0: %x,%x,%x}", cat0->dummy2, cat0->dummy3, cat0->dummy4); +} + +static void print_instr_cat1(struct disasm_ctx *ctx, instr_t *instr) +{ + instr_cat1_t *cat1 = &instr->cat1; + + if (cat1->ul) + fprintf(ctx->out, "(ul)"); + + if (cat1->src_type == cat1->dst_type) { + if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) { + /* special case (nmemonic?): */ + fprintf(ctx->out, "mova"); + } else { + fprintf(ctx->out, "mov.%s%s", type[cat1->src_type], type[cat1->dst_type]); + } + } else { + fprintf(ctx->out, "cov.%s%s", type[cat1->src_type], type[cat1->dst_type]); + } + + fprintf(ctx->out, " "); + + if (cat1->even) + fprintf(ctx->out, "(even)"); + + if (cat1->pos_inf) + fprintf(ctx->out, "(pos_infinity)"); + + print_reg_dst(ctx, (reg_t)(cat1->dst), type_size(cat1->dst_type) == 32, + cat1->dst_rel); + + fprintf(ctx->out, ", "); + + /* ugg, have to special case this.. vs print_reg().. */ + if (cat1->src_im) { + if (type_float(cat1->src_type)) + fprintf(ctx->out, "(%f)", cat1->fim_val); + else if (type_uint(cat1->src_type)) + fprintf(ctx->out, "0x%08x", cat1->uim_val); + else + fprintf(ctx->out, "%d", cat1->iim_val); + } else if (cat1->src_rel && !cat1->src_c) { + /* I would just use %+d but trying to make it diff'able with + * libllvm-a3xx... + */ + char type = cat1->src_rel_c ? 'c' : 'r'; + if (cat1->off < 0) + fprintf(ctx->out, "%c<a0.x - %d>", type, -cat1->off); + else if (cat1->off > 0) + fprintf(ctx->out, "%c<a0.x + %d>", type, cat1->off); + else + fprintf(ctx->out, "%c<a0.x>", type); + } else { + print_reg_src(ctx, (reg_t)(cat1->src), type_size(cat1->src_type) == 32, + cat1->src_r, cat1->src_c, cat1->src_im, false, false, false); + } + + if ((debug & PRINT_VERBOSE) && (cat1->must_be_0)) + fprintf(ctx->out, "\t{1: %x}", cat1->must_be_0); +} + +static void print_instr_cat2(struct disasm_ctx *ctx, instr_t *instr) +{ + instr_cat2_t *cat2 = &instr->cat2; + static const char *cond[] = { + "lt", + "le", + "gt", + "ge", + "eq", + "ne", + "?6?", + }; + + switch (_OPC(2, cat2->opc)) { + case OPC_CMPS_F: + case OPC_CMPS_U: + case OPC_CMPS_S: + case OPC_CMPV_F: + case OPC_CMPV_U: + case OPC_CMPV_S: + fprintf(ctx->out, ".%s", cond[cat2->cond]); + break; + } + + fprintf(ctx->out, " "); + if (cat2->ei) + fprintf(ctx->out, "(ei)"); + print_reg_dst(ctx, (reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false); + fprintf(ctx->out, ", "); + + if (cat2->c1.src1_c) { + print_reg_src(ctx, (reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r, + cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg, + cat2->src1_abs, false); + } else if (cat2->rel1.src1_rel) { + print_reg_src(ctx, (reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r, + cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg, + cat2->src1_abs, cat2->rel1.src1_rel); + } else { + print_reg_src(ctx, (reg_t)(cat2->src1), cat2->full, cat2->src1_r, + false, cat2->src1_im, cat2->src1_neg, + cat2->src1_abs, false); + } + + switch (_OPC(2, cat2->opc)) { + case OPC_ABSNEG_F: + case OPC_ABSNEG_S: + case OPC_CLZ_B: + case OPC_CLZ_S: + case OPC_SIGN_F: + case OPC_FLOOR_F: + case OPC_CEIL_F: + case OPC_RNDNE_F: + case OPC_RNDAZ_F: + case OPC_TRUNC_F: + case OPC_NOT_B: + case OPC_BFREV_B: + case OPC_SETRM: + case OPC_CBITS_B: + /* these only have one src reg */ + break; + default: + fprintf(ctx->out, ", "); + if 
(cat2->c2.src2_c) { + print_reg_src(ctx, (reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r, + cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg, + cat2->src2_abs, false); + } else if (cat2->rel2.src2_rel) { + print_reg_src(ctx, (reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r, + cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg, + cat2->src2_abs, cat2->rel2.src2_rel); + } else { + print_reg_src(ctx, (reg_t)(cat2->src2), cat2->full, cat2->src2_r, + false, cat2->src2_im, cat2->src2_neg, + cat2->src2_abs, false); + } + break; + } +} + +static void print_instr_cat3(struct disasm_ctx *ctx, instr_t *instr) +{ + instr_cat3_t *cat3 = &instr->cat3; + bool full = instr_cat3_full(cat3); + + fprintf(ctx->out, " "); + print_reg_dst(ctx, (reg_t)(cat3->dst), full ^ cat3->dst_half, false); + fprintf(ctx->out, ", "); + if (cat3->c1.src1_c) { + print_reg_src(ctx, (reg_t)(cat3->c1.src1), full, + cat3->src1_r, cat3->c1.src1_c, false, cat3->src1_neg, + false, false); + } else if (cat3->rel1.src1_rel) { + print_reg_src(ctx, (reg_t)(cat3->rel1.src1), full, + cat3->src1_r, cat3->rel1.src1_c, false, cat3->src1_neg, + false, cat3->rel1.src1_rel); + } else { + print_reg_src(ctx, (reg_t)(cat3->src1), full, + cat3->src1_r, false, false, cat3->src1_neg, + false, false); + } + fprintf(ctx->out, ", "); + print_reg_src(ctx, (reg_t)cat3->src2, full, + cat3->src2_r, cat3->src2_c, false, cat3->src2_neg, + false, false); + fprintf(ctx->out, ", "); + if (cat3->c2.src3_c) { + print_reg_src(ctx, (reg_t)(cat3->c2.src3), full, + cat3->src3_r, cat3->c2.src3_c, false, cat3->src3_neg, + false, false); + } else if (cat3->rel2.src3_rel) { + print_reg_src(ctx, (reg_t)(cat3->rel2.src3), full, + cat3->src3_r, cat3->rel2.src3_c, false, cat3->src3_neg, + false, cat3->rel2.src3_rel); + } else { + print_reg_src(ctx, (reg_t)(cat3->src3), full, + cat3->src3_r, false, false, cat3->src3_neg, + false, false); + } +} + +static void print_instr_cat4(struct disasm_ctx *ctx, instr_t *instr) +{ + instr_cat4_t *cat4 = &instr->cat4; + + fprintf(ctx->out, " "); + print_reg_dst(ctx, (reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false); + fprintf(ctx->out, ", "); + + if (cat4->c.src_c) { + print_reg_src(ctx, (reg_t)(cat4->c.src), cat4->full, + cat4->src_r, cat4->c.src_c, cat4->src_im, + cat4->src_neg, cat4->src_abs, false); + } else if (cat4->rel.src_rel) { + print_reg_src(ctx, (reg_t)(cat4->rel.src), cat4->full, + cat4->src_r, cat4->rel.src_c, cat4->src_im, + cat4->src_neg, cat4->src_abs, cat4->rel.src_rel); + } else { + print_reg_src(ctx, (reg_t)(cat4->src), cat4->full, + cat4->src_r, false, cat4->src_im, + cat4->src_neg, cat4->src_abs, false); + } + + if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2)) + fprintf(ctx->out, "\t{4: %x,%x}", cat4->dummy1, cat4->dummy2); +} + +static void print_instr_cat5(struct disasm_ctx *ctx, instr_t *instr) +{ + static const struct { + bool src1, src2, samp, tex; + } info[0x1f] = { + [opc_op(OPC_ISAM)] = { true, false, true, true, }, + [opc_op(OPC_ISAML)] = { true, true, true, true, }, + [opc_op(OPC_ISAMM)] = { true, false, true, true, }, + [opc_op(OPC_SAM)] = { true, false, true, true, }, + [opc_op(OPC_SAMB)] = { true, true, true, true, }, + [opc_op(OPC_SAML)] = { true, true, true, true, }, + [opc_op(OPC_SAMGQ)] = { true, false, true, true, }, + [opc_op(OPC_GETLOD)] = { true, false, true, true, }, + [opc_op(OPC_CONV)] = { true, true, true, true, }, + [opc_op(OPC_CONVM)] = { true, true, true, true, }, + [opc_op(OPC_GETSIZE)] = { true, false, false, true, }, + [opc_op(OPC_GETBUF)] = { false, false, false, true, }, 
+ [opc_op(OPC_GETPOS)] = { true, false, false, true, }, + [opc_op(OPC_GETINFO)] = { false, false, false, true, }, + [opc_op(OPC_DSX)] = { true, false, false, false, }, + [opc_op(OPC_DSY)] = { true, false, false, false, }, + [opc_op(OPC_GATHER4R)] = { true, false, true, true, }, + [opc_op(OPC_GATHER4G)] = { true, false, true, true, }, + [opc_op(OPC_GATHER4B)] = { true, false, true, true, }, + [opc_op(OPC_GATHER4A)] = { true, false, true, true, }, + [opc_op(OPC_SAMGP0)] = { true, false, true, true, }, + [opc_op(OPC_SAMGP1)] = { true, false, true, true, }, + [opc_op(OPC_SAMGP2)] = { true, false, true, true, }, + [opc_op(OPC_SAMGP3)] = { true, false, true, true, }, + [opc_op(OPC_DSXPP_1)] = { true, false, false, false, }, + [opc_op(OPC_DSYPP_1)] = { true, false, false, false, }, + [opc_op(OPC_RGETPOS)] = { false, false, false, false, }, + [opc_op(OPC_RGETINFO)] = { false, false, false, false, }, + }; + instr_cat5_t *cat5 = &instr->cat5; + int i; + + if (cat5->is_3d) fprintf(ctx->out, ".3d"); + if (cat5->is_a) fprintf(ctx->out, ".a"); + if (cat5->is_o) fprintf(ctx->out, ".o"); + if (cat5->is_p) fprintf(ctx->out, ".p"); + if (cat5->is_s) fprintf(ctx->out, ".s"); + if (cat5->is_s2en) fprintf(ctx->out, ".s2en"); + + fprintf(ctx->out, " "); + + switch (_OPC(5, cat5->opc)) { + case OPC_DSXPP_1: + case OPC_DSYPP_1: + break; + default: + fprintf(ctx->out, "(%s)", type[cat5->type]); + break; + } + + fprintf(ctx->out, "("); + for (i = 0; i < 4; i++) + if (cat5->wrmask & (1 << i)) + fprintf(ctx->out, "%c", "xyzw"[i]); + fprintf(ctx->out, ")"); + + print_reg_dst(ctx, (reg_t)(cat5->dst), type_size(cat5->type) == 32, false); + + if (info[cat5->opc].src1) { + fprintf(ctx->out, ", "); + print_reg_src(ctx, (reg_t)(cat5->src1), cat5->full, false, false, false, + false, false, false); + } + + if (cat5->is_s2en) { + fprintf(ctx->out, ", "); + print_reg_src(ctx, (reg_t)(cat5->s2en.src2), cat5->full, false, false, false, + false, false, false); + fprintf(ctx->out, ", "); + print_reg_src(ctx, (reg_t)(cat5->s2en.src3), false, false, false, false, + false, false, false); + } else { + if (cat5->is_o || info[cat5->opc].src2) { + fprintf(ctx->out, ", "); + print_reg_src(ctx, (reg_t)(cat5->norm.src2), cat5->full, + false, false, false, false, false, false); + } + if (info[cat5->opc].samp) + fprintf(ctx->out, ", s#%d", cat5->norm.samp); + if (info[cat5->opc].tex) + fprintf(ctx->out, ", t#%d", cat5->norm.tex); + } + + if (debug & PRINT_VERBOSE) { + if (cat5->is_s2en) { + if ((debug & PRINT_VERBOSE) && (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2)) + fprintf(ctx->out, "\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2); + } else { + if ((debug & PRINT_VERBOSE) && (cat5->norm.dummy1|cat5->dummy2)) + fprintf(ctx->out, "\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2); + } + } +} + +static void print_instr_cat6(struct disasm_ctx *ctx, instr_t *instr) +{ + instr_cat6_t *cat6 = &instr->cat6; + char sd = 0, ss = 0; /* dst/src address space */ + bool nodst = false; + struct reginfo dst, src1, src2; + int src1off = 0, dstoff = 0; + + memset(&dst, 0, sizeof(dst)); + memset(&src1, 0, sizeof(src1)); + memset(&src2, 0, sizeof(src2)); + + switch (_OPC(6, cat6->opc)) { + case OPC_RESINFO: + case OPC_RESFMT: + dst.full = type_size(cat6->type) == 32; + src1.full = type_size(cat6->type) == 32; + src2.full = type_size(cat6->type) == 32; + break; + case OPC_L2G: + case OPC_G2L: + dst.full = true; + src1.full = true; + src2.full = true; + break; + case OPC_STG: + case OPC_STL: + case OPC_STP: + case OPC_STI: + case 
OPC_STLW: + case OPC_STIB: + dst.full = true; + src1.full = type_size(cat6->type) == 32; + src2.full = type_size(cat6->type) == 32; + break; + default: + dst.full = type_size(cat6->type) == 32; + src1.full = true; + src2.full = true; + break; + } + + switch (_OPC(6, cat6->opc)) { + case OPC_PREFETCH: + break; + case OPC_RESINFO: + fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1); + break; + case OPC_LDGB: + fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped"); + fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1); + fprintf(ctx->out, ".%s", type[cat6->type]); + fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1); + break; + case OPC_STGB: + case OPC_STIB: + fprintf(ctx->out, ".%s", cat6->stgb.typed ? "typed" : "untyped"); + fprintf(ctx->out, ".%dd", cat6->stgb.d + 1); + fprintf(ctx->out, ".%s", type[cat6->type]); + fprintf(ctx->out, ".%d", cat6->stgb.type_size + 1); + break; + case OPC_ATOMIC_ADD: + case OPC_ATOMIC_SUB: + case OPC_ATOMIC_XCHG: + case OPC_ATOMIC_INC: + case OPC_ATOMIC_DEC: + case OPC_ATOMIC_CMPXCHG: + case OPC_ATOMIC_MIN: + case OPC_ATOMIC_MAX: + case OPC_ATOMIC_AND: + case OPC_ATOMIC_OR: + case OPC_ATOMIC_XOR: + ss = cat6->g ? 'g' : 'l'; + fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped"); + fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1); + fprintf(ctx->out, ".%s", type[cat6->type]); + fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1); + fprintf(ctx->out, ".%c", ss); + break; + default: + dst.im = cat6->g && !cat6->dst_off; + fprintf(ctx->out, ".%s", type[cat6->type]); + break; + } + fprintf(ctx->out, " "); + + switch (_OPC(6, cat6->opc)) { + case OPC_STG: + sd = 'g'; + break; + case OPC_STP: + sd = 'p'; + break; + case OPC_STL: + case OPC_STLW: + sd = 'l'; + break; + + case OPC_LDG: + case OPC_LDC: + ss = 'g'; + break; + case OPC_LDP: + ss = 'p'; + break; + case OPC_LDL: + case OPC_LDLW: + case OPC_LDLV: + ss = 'l'; + break; + + case OPC_L2G: + ss = 'l'; + sd = 'g'; + break; + + case OPC_G2L: + ss = 'g'; + sd = 'l'; + break; + + case OPC_PREFETCH: + ss = 'g'; + nodst = true; + break; + + case OPC_STI: + dst.full = false; // XXX or inverts?? + break; + } + + if ((_OPC(6, cat6->opc) == OPC_STGB) || (_OPC(6, cat6->opc) == OPC_STIB)) { + struct reginfo src3; + + memset(&src3, 0, sizeof(src3)); + + src1.reg = (reg_t)(cat6->stgb.src1); + src2.reg = (reg_t)(cat6->stgb.src2); + src2.im = cat6->stgb.src2_im; + src3.reg = (reg_t)(cat6->stgb.src3); + src3.im = cat6->stgb.src3_im; + src3.full = true; + + fprintf(ctx->out, "g[%u], ", cat6->stgb.dst_ssbo); + print_src(ctx, &src1); + fprintf(ctx->out, ", "); + print_src(ctx, &src2); + fprintf(ctx->out, ", "); + print_src(ctx, &src3); + + if (debug & PRINT_VERBOSE) + fprintf(ctx->out, " (pad0=%x, pad3=%x)", cat6->stgb.pad0, cat6->stgb.pad3); + + return; + } + + if (is_atomic(_OPC(6, cat6->opc))) { + + src1.reg = (reg_t)(cat6->ldgb.src1); + src1.im = cat6->ldgb.src1_im; + src2.reg = (reg_t)(cat6->ldgb.src2); + src2.im = cat6->ldgb.src2_im; + dst.reg = (reg_t)(cat6->ldgb.dst); + + print_src(ctx, &dst); + fprintf(ctx->out, ", "); + if (ss == 'g') { + struct reginfo src3; + memset(&src3, 0, sizeof(src3)); + + src3.reg = (reg_t)(cat6->ldgb.src3); + src3.full = true; + + /* For images, the ".typed" variant is used and src2 is + * the ivecN coordinates, ie ivec2 for 2d. + * + * For SSBOs, the ".untyped" variant is used and src2 is + * a simple dword offset.. src3 appears to be + * uvec2(offset * 4, 0). Not sure the point of that. 
+ */ + + fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo); + print_src(ctx, &src1); /* value */ + fprintf(ctx->out, ", "); + print_src(ctx, &src2); /* offset/coords */ + fprintf(ctx->out, ", "); + print_src(ctx, &src3); /* 64b byte offset.. */ + + if (debug & PRINT_VERBOSE) { + fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, + cat6->ldgb.pad3, cat6->ldgb.mustbe0); + } + } else { /* ss == 'l' */ + fprintf(ctx->out, "l["); + print_src(ctx, &src1); /* simple byte offset */ + fprintf(ctx->out, "], "); + print_src(ctx, &src2); /* value */ + + if (debug & PRINT_VERBOSE) { + fprintf(ctx->out, " (src3=%x, pad0=%x, pad3=%x, mustbe0=%x)", + cat6->ldgb.src3, cat6->ldgb.pad0, + cat6->ldgb.pad3, cat6->ldgb.mustbe0); + } + } + + return; + } else if (_OPC(6, cat6->opc) == OPC_RESINFO) { + dst.reg = (reg_t)(cat6->ldgb.dst); + + print_src(ctx, &dst); + fprintf(ctx->out, ", "); + fprintf(ctx->out, "g[%u]", cat6->ldgb.src_ssbo); + + return; + } else if (_OPC(6, cat6->opc) == OPC_LDGB) { + + src1.reg = (reg_t)(cat6->ldgb.src1); + src1.im = cat6->ldgb.src1_im; + src2.reg = (reg_t)(cat6->ldgb.src2); + src2.im = cat6->ldgb.src2_im; + dst.reg = (reg_t)(cat6->ldgb.dst); + + print_src(ctx, &dst); + fprintf(ctx->out, ", "); + fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo); + print_src(ctx, &src1); + fprintf(ctx->out, ", "); + print_src(ctx, &src2); + + if (debug & PRINT_VERBOSE) + fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, cat6->ldgb.pad3, cat6->ldgb.mustbe0); + + return; + } + if (cat6->dst_off) { + dst.reg = (reg_t)(cat6->c.dst); + dstoff = cat6->c.off; + } else { + dst.reg = (reg_t)(cat6->d.dst); + } + + if (cat6->src_off) { + src1.reg = (reg_t)(cat6->a.src1); + src1.im = cat6->a.src1_im; + src2.reg = (reg_t)(cat6->a.src2); + src2.im = cat6->a.src2_im; + src1off = cat6->a.off; + } else { + src1.reg = (reg_t)(cat6->b.src1); + src1.im = cat6->b.src1_im; + src2.reg = (reg_t)(cat6->b.src2); + src2.im = cat6->b.src2_im; + } + + if (!nodst) { + if (sd) + fprintf(ctx->out, "%c[", sd); + /* note: dst might actually be a src (ie. 
address to store to) */ + print_src(ctx, &dst); + if (dstoff) + fprintf(ctx->out, "%+d", dstoff); + if (sd) + fprintf(ctx->out, "]"); + fprintf(ctx->out, ", "); + } + + if (ss) + fprintf(ctx->out, "%c[", ss); + + /* can have a larger than normal immed, so hack: */ + if (src1.im) { + fprintf(ctx->out, "%u", src1.reg.dummy13); + } else { + print_src(ctx, &src1); + } + + if (src1off) + fprintf(ctx->out, "%+d", src1off); + if (ss) + fprintf(ctx->out, "]"); + + switch (_OPC(6, cat6->opc)) { + case OPC_RESINFO: + case OPC_RESFMT: + break; + default: + fprintf(ctx->out, ", "); + print_src(ctx, &src2); + break; + } +} + +static void print_instr_cat7(struct disasm_ctx *ctx, instr_t *instr) +{ + instr_cat7_t *cat7 = &instr->cat7; + + if (cat7->g) + fprintf(ctx->out, ".g"); + if (cat7->l) + fprintf(ctx->out, ".l"); + + if (_OPC(7, cat7->opc) == OPC_FENCE) { + if (cat7->r) + fprintf(ctx->out, ".r"); + if (cat7->w) + fprintf(ctx->out, ".w"); + } +} + +/* size of largest OPC field of all the instruction categories: */ +#define NOPC_BITS 6 + +static const struct opc_info { + uint16_t cat; + uint16_t opc; + const char *name; + void (*print)(struct disasm_ctx *ctx, instr_t *instr); +} opcs[1 << (3+NOPC_BITS)] = { +#define OPC(cat, opc, name) [(opc)] = { (cat), (opc), #name, print_instr_cat##cat } + /* category 0: */ + OPC(0, OPC_NOP, nop), + OPC(0, OPC_BR, br), + OPC(0, OPC_JUMP, jump), + OPC(0, OPC_CALL, call), + OPC(0, OPC_RET, ret), + OPC(0, OPC_KILL, kill), + OPC(0, OPC_END, end), + OPC(0, OPC_EMIT, emit), + OPC(0, OPC_CUT, cut), + OPC(0, OPC_CHMASK, chmask), + OPC(0, OPC_CHSH, chsh), + OPC(0, OPC_FLOW_REV, flow_rev), + + /* category 1: */ + OPC(1, OPC_MOV, ), + + /* category 2: */ + OPC(2, OPC_ADD_F, add.f), + OPC(2, OPC_MIN_F, min.f), + OPC(2, OPC_MAX_F, max.f), + OPC(2, OPC_MUL_F, mul.f), + OPC(2, OPC_SIGN_F, sign.f), + OPC(2, OPC_CMPS_F, cmps.f), + OPC(2, OPC_ABSNEG_F, absneg.f), + OPC(2, OPC_CMPV_F, cmpv.f), + OPC(2, OPC_FLOOR_F, floor.f), + OPC(2, OPC_CEIL_F, ceil.f), + OPC(2, OPC_RNDNE_F, rndne.f), + OPC(2, OPC_RNDAZ_F, rndaz.f), + OPC(2, OPC_TRUNC_F, trunc.f), + OPC(2, OPC_ADD_U, add.u), + OPC(2, OPC_ADD_S, add.s), + OPC(2, OPC_SUB_U, sub.u), + OPC(2, OPC_SUB_S, sub.s), + OPC(2, OPC_CMPS_U, cmps.u), + OPC(2, OPC_CMPS_S, cmps.s), + OPC(2, OPC_MIN_U, min.u), + OPC(2, OPC_MIN_S, min.s), + OPC(2, OPC_MAX_U, max.u), + OPC(2, OPC_MAX_S, max.s), + OPC(2, OPC_ABSNEG_S, absneg.s), + OPC(2, OPC_AND_B, and.b), + OPC(2, OPC_OR_B, or.b), + OPC(2, OPC_NOT_B, not.b), + OPC(2, OPC_XOR_B, xor.b), + OPC(2, OPC_CMPV_U, cmpv.u), + OPC(2, OPC_CMPV_S, cmpv.s), + OPC(2, OPC_MUL_U, mul.u), + OPC(2, OPC_MUL_S, mul.s), + OPC(2, OPC_MULL_U, mull.u), + OPC(2, OPC_BFREV_B, bfrev.b), + OPC(2, OPC_CLZ_S, clz.s), + OPC(2, OPC_CLZ_B, clz.b), + OPC(2, OPC_SHL_B, shl.b), + OPC(2, OPC_SHR_B, shr.b), + OPC(2, OPC_ASHR_B, ashr.b), + OPC(2, OPC_BARY_F, bary.f), + OPC(2, OPC_MGEN_B, mgen.b), + OPC(2, OPC_GETBIT_B, getbit.b), + OPC(2, OPC_SETRM, setrm), + OPC(2, OPC_CBITS_B, cbits.b), + OPC(2, OPC_SHB, shb), + OPC(2, OPC_MSAD, msad), + + /* category 3: */ + OPC(3, OPC_MAD_U16, mad.u16), + OPC(3, OPC_MADSH_U16, madsh.u16), + OPC(3, OPC_MAD_S16, mad.s16), + OPC(3, OPC_MADSH_M16, madsh.m16), + OPC(3, OPC_MAD_U24, mad.u24), + OPC(3, OPC_MAD_S24, mad.s24), + OPC(3, OPC_MAD_F16, mad.f16), + OPC(3, OPC_MAD_F32, mad.f32), + OPC(3, OPC_SEL_B16, sel.b16), + OPC(3, OPC_SEL_B32, sel.b32), + OPC(3, OPC_SEL_S16, sel.s16), + OPC(3, OPC_SEL_S32, sel.s32), + OPC(3, OPC_SEL_F16, sel.f16), + OPC(3, OPC_SEL_F32, sel.f32), + OPC(3, OPC_SAD_S16, sad.s16), 
+ OPC(3, OPC_SAD_S32, sad.s32), + + /* category 4: */ + OPC(4, OPC_RCP, rcp), + OPC(4, OPC_RSQ, rsq), + OPC(4, OPC_LOG2, log2), + OPC(4, OPC_EXP2, exp2), + OPC(4, OPC_SIN, sin), + OPC(4, OPC_COS, cos), + OPC(4, OPC_SQRT, sqrt), + + /* category 5: */ + OPC(5, OPC_ISAM, isam), + OPC(5, OPC_ISAML, isaml), + OPC(5, OPC_ISAMM, isamm), + OPC(5, OPC_SAM, sam), + OPC(5, OPC_SAMB, samb), + OPC(5, OPC_SAML, saml), + OPC(5, OPC_SAMGQ, samgq), + OPC(5, OPC_GETLOD, getlod), + OPC(5, OPC_CONV, conv), + OPC(5, OPC_CONVM, convm), + OPC(5, OPC_GETSIZE, getsize), + OPC(5, OPC_GETBUF, getbuf), + OPC(5, OPC_GETPOS, getpos), + OPC(5, OPC_GETINFO, getinfo), + OPC(5, OPC_DSX, dsx), + OPC(5, OPC_DSY, dsy), + OPC(5, OPC_GATHER4R, gather4r), + OPC(5, OPC_GATHER4G, gather4g), + OPC(5, OPC_GATHER4B, gather4b), + OPC(5, OPC_GATHER4A, gather4a), + OPC(5, OPC_SAMGP0, samgp0), + OPC(5, OPC_SAMGP1, samgp1), + OPC(5, OPC_SAMGP2, samgp2), + OPC(5, OPC_SAMGP3, samgp3), + OPC(5, OPC_DSXPP_1, dsxpp.1), + OPC(5, OPC_DSYPP_1, dsypp.1), + OPC(5, OPC_RGETPOS, rgetpos), + OPC(5, OPC_RGETINFO, rgetinfo), + + + /* category 6: */ + OPC(6, OPC_LDG, ldg), + OPC(6, OPC_LDL, ldl), + OPC(6, OPC_LDP, ldp), + OPC(6, OPC_STG, stg), + OPC(6, OPC_STL, stl), + OPC(6, OPC_STP, stp), + OPC(6, OPC_STI, sti), + OPC(6, OPC_G2L, g2l), + OPC(6, OPC_L2G, l2g), + OPC(6, OPC_PREFETCH, prefetch), + OPC(6, OPC_LDLW, ldlw), + OPC(6, OPC_STLW, stlw), + OPC(6, OPC_RESFMT, resfmt), + OPC(6, OPC_RESINFO, resinfo), + OPC(6, OPC_ATOMIC_ADD, atomic.add), + OPC(6, OPC_ATOMIC_SUB, atomic.sub), + OPC(6, OPC_ATOMIC_XCHG, atomic.xchg), + OPC(6, OPC_ATOMIC_INC, atomic.inc), + OPC(6, OPC_ATOMIC_DEC, atomic.dec), + OPC(6, OPC_ATOMIC_CMPXCHG, atomic.cmpxchg), + OPC(6, OPC_ATOMIC_MIN, atomic.min), + OPC(6, OPC_ATOMIC_MAX, atomic.max), + OPC(6, OPC_ATOMIC_AND, atomic.and), + OPC(6, OPC_ATOMIC_OR, atomic.or), + OPC(6, OPC_ATOMIC_XOR, atomic.xor), + OPC(6, OPC_LDGB, ldgb), + OPC(6, OPC_STGB, stgb), + OPC(6, OPC_STIB, stib), + OPC(6, OPC_LDC, ldc), + OPC(6, OPC_LDLV, ldlv), + + OPC(7, OPC_BAR, bar), + OPC(7, OPC_FENCE, fence), + +#undef OPC +}; + +#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)])) + +// XXX hack.. probably should move this table somewhere common: +#include "ir3.h" +const char *ir3_instr_name(struct ir3_instruction *instr) +{ + if (opc_cat(instr->opc) == -1) return "??meta??"; + return opcs[instr->opc].name; +} + +static bool print_instr(struct disasm_ctx *ctx, uint32_t *dwords, int n) +{ + instr_t *instr = (instr_t *)dwords; + uint32_t opc = instr_opc(instr); + const char *name; + + if (debug & PRINT_VERBOSE) + fprintf(ctx->out, "%s%04d[%08xx_%08xx] ", levels[ctx->level], n, dwords[1], dwords[0]); + + /* NOTE: order flags are printed is a bit fugly.. but for now I + * try to match the order in llvm-a3xx disassembler for easy + * diff'ing.. 
+ */ + + ctx->repeat = instr_repeat(instr); + + if (instr->sync) + fprintf(ctx->out, "(sy)"); + if (instr->ss && ((instr->opc_cat <= 4) || (instr->opc_cat == 7))) + fprintf(ctx->out, "(ss)"); + if (instr->jmp_tgt) + fprintf(ctx->out, "(jp)"); + if (instr_sat(instr)) + fprintf(ctx->out, "(sat)"); + if (ctx->repeat) + fprintf(ctx->out, "(rpt%d)", ctx->repeat); + if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4))) + fprintf(ctx->out, "(ul)"); + + name = GETINFO(instr)->name; + + if (name) { + fprintf(ctx->out, "%s", name); + GETINFO(instr)->print(ctx, instr); + } else { + fprintf(ctx->out, "unknown(%d,%d)", instr->opc_cat, opc); + } + + fprintf(ctx->out, "\n"); + + return (instr->opc_cat == 0) && (opc == OPC_END); +} + +int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out) +{ + struct disasm_ctx ctx; + int i; + + assert((sizedwords % 2) == 0); + + memset(&ctx, 0, sizeof(ctx)); + ctx.out = out; + ctx.level = level; + + for (i = 0; i < sizedwords; i += 2) + print_instr(&ctx, &dwords[i], i/2); + + return 0; +} diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h new file mode 100644 index 00000000000..7f60ee5fd4c --- /dev/null +++ b/src/freedreno/ir3/instr-a3xx.h @@ -0,0 +1,872 @@ +/* + * Copyright (c) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef INSTR_A3XX_H_ +#define INSTR_A3XX_H_ + +#define PACKED __attribute__((__packed__)) + +#include <stdint.h> +#include <stdio.h> +#include <stdbool.h> +#include <assert.h> + +/* size of largest OPC field of all the instruction categories: */ +#define NOPC_BITS 6 + +#define _OPC(cat, opc) (((cat) << NOPC_BITS) | opc) + +typedef enum { + /* category 0: */ + OPC_NOP = _OPC(0, 0), + OPC_BR = _OPC(0, 1), + OPC_JUMP = _OPC(0, 2), + OPC_CALL = _OPC(0, 3), + OPC_RET = _OPC(0, 4), + OPC_KILL = _OPC(0, 5), + OPC_END = _OPC(0, 6), + OPC_EMIT = _OPC(0, 7), + OPC_CUT = _OPC(0, 8), + OPC_CHMASK = _OPC(0, 9), + OPC_CHSH = _OPC(0, 10), + OPC_FLOW_REV = _OPC(0, 11), + + /* category 1: */ + OPC_MOV = _OPC(1, 0), + + /* category 2: */ + OPC_ADD_F = _OPC(2, 0), + OPC_MIN_F = _OPC(2, 1), + OPC_MAX_F = _OPC(2, 2), + OPC_MUL_F = _OPC(2, 3), + OPC_SIGN_F = _OPC(2, 4), + OPC_CMPS_F = _OPC(2, 5), + OPC_ABSNEG_F = _OPC(2, 6), + OPC_CMPV_F = _OPC(2, 7), + /* 8 - invalid */ + OPC_FLOOR_F = _OPC(2, 9), + OPC_CEIL_F = _OPC(2, 10), + OPC_RNDNE_F = _OPC(2, 11), + OPC_RNDAZ_F = _OPC(2, 12), + OPC_TRUNC_F = _OPC(2, 13), + /* 14-15 - invalid */ + OPC_ADD_U = _OPC(2, 16), + OPC_ADD_S = _OPC(2, 17), + OPC_SUB_U = _OPC(2, 18), + OPC_SUB_S = _OPC(2, 19), + OPC_CMPS_U = _OPC(2, 20), + OPC_CMPS_S = _OPC(2, 21), + OPC_MIN_U = _OPC(2, 22), + OPC_MIN_S = _OPC(2, 23), + OPC_MAX_U = _OPC(2, 24), + OPC_MAX_S = _OPC(2, 25), + OPC_ABSNEG_S = _OPC(2, 26), + /* 27 - invalid */ + OPC_AND_B = _OPC(2, 28), + OPC_OR_B = _OPC(2, 29), + OPC_NOT_B = _OPC(2, 30), + OPC_XOR_B = _OPC(2, 31), + /* 32 - invalid */ + OPC_CMPV_U = _OPC(2, 33), + OPC_CMPV_S = _OPC(2, 34), + /* 35-47 - invalid */ + OPC_MUL_U = _OPC(2, 48), + OPC_MUL_S = _OPC(2, 49), + OPC_MULL_U = _OPC(2, 50), + OPC_BFREV_B = _OPC(2, 51), + OPC_CLZ_S = _OPC(2, 52), + OPC_CLZ_B = _OPC(2, 53), + OPC_SHL_B = _OPC(2, 54), + OPC_SHR_B = _OPC(2, 55), + OPC_ASHR_B = _OPC(2, 56), + OPC_BARY_F = _OPC(2, 57), + OPC_MGEN_B = _OPC(2, 58), + OPC_GETBIT_B = _OPC(2, 59), + OPC_SETRM = _OPC(2, 60), + OPC_CBITS_B = _OPC(2, 61), + OPC_SHB = _OPC(2, 62), + OPC_MSAD = _OPC(2, 63), + + /* category 3: */ + OPC_MAD_U16 = _OPC(3, 0), + OPC_MADSH_U16 = _OPC(3, 1), + OPC_MAD_S16 = _OPC(3, 2), + OPC_MADSH_M16 = _OPC(3, 3), /* should this be .s16? 
*/ + OPC_MAD_U24 = _OPC(3, 4), + OPC_MAD_S24 = _OPC(3, 5), + OPC_MAD_F16 = _OPC(3, 6), + OPC_MAD_F32 = _OPC(3, 7), + OPC_SEL_B16 = _OPC(3, 8), + OPC_SEL_B32 = _OPC(3, 9), + OPC_SEL_S16 = _OPC(3, 10), + OPC_SEL_S32 = _OPC(3, 11), + OPC_SEL_F16 = _OPC(3, 12), + OPC_SEL_F32 = _OPC(3, 13), + OPC_SAD_S16 = _OPC(3, 14), + OPC_SAD_S32 = _OPC(3, 15), + + /* category 4: */ + OPC_RCP = _OPC(4, 0), + OPC_RSQ = _OPC(4, 1), + OPC_LOG2 = _OPC(4, 2), + OPC_EXP2 = _OPC(4, 3), + OPC_SIN = _OPC(4, 4), + OPC_COS = _OPC(4, 5), + OPC_SQRT = _OPC(4, 6), + // 7-63 - invalid + + /* category 5: */ + OPC_ISAM = _OPC(5, 0), + OPC_ISAML = _OPC(5, 1), + OPC_ISAMM = _OPC(5, 2), + OPC_SAM = _OPC(5, 3), + OPC_SAMB = _OPC(5, 4), + OPC_SAML = _OPC(5, 5), + OPC_SAMGQ = _OPC(5, 6), + OPC_GETLOD = _OPC(5, 7), + OPC_CONV = _OPC(5, 8), + OPC_CONVM = _OPC(5, 9), + OPC_GETSIZE = _OPC(5, 10), + OPC_GETBUF = _OPC(5, 11), + OPC_GETPOS = _OPC(5, 12), + OPC_GETINFO = _OPC(5, 13), + OPC_DSX = _OPC(5, 14), + OPC_DSY = _OPC(5, 15), + OPC_GATHER4R = _OPC(5, 16), + OPC_GATHER4G = _OPC(5, 17), + OPC_GATHER4B = _OPC(5, 18), + OPC_GATHER4A = _OPC(5, 19), + OPC_SAMGP0 = _OPC(5, 20), + OPC_SAMGP1 = _OPC(5, 21), + OPC_SAMGP2 = _OPC(5, 22), + OPC_SAMGP3 = _OPC(5, 23), + OPC_DSXPP_1 = _OPC(5, 24), + OPC_DSYPP_1 = _OPC(5, 25), + OPC_RGETPOS = _OPC(5, 26), + OPC_RGETINFO = _OPC(5, 27), + + /* category 6: */ + OPC_LDG = _OPC(6, 0), /* load-global */ + OPC_LDL = _OPC(6, 1), + OPC_LDP = _OPC(6, 2), + OPC_STG = _OPC(6, 3), /* store-global */ + OPC_STL = _OPC(6, 4), + OPC_STP = _OPC(6, 5), + OPC_STI = _OPC(6, 6), + OPC_G2L = _OPC(6, 7), + OPC_L2G = _OPC(6, 8), + OPC_PREFETCH = _OPC(6, 9), + OPC_LDLW = _OPC(6, 10), + OPC_STLW = _OPC(6, 11), + OPC_RESFMT = _OPC(6, 14), + OPC_RESINFO = _OPC(6, 15), + OPC_ATOMIC_ADD = _OPC(6, 16), + OPC_ATOMIC_SUB = _OPC(6, 17), + OPC_ATOMIC_XCHG = _OPC(6, 18), + OPC_ATOMIC_INC = _OPC(6, 19), + OPC_ATOMIC_DEC = _OPC(6, 20), + OPC_ATOMIC_CMPXCHG = _OPC(6, 21), + OPC_ATOMIC_MIN = _OPC(6, 22), + OPC_ATOMIC_MAX = _OPC(6, 23), + OPC_ATOMIC_AND = _OPC(6, 24), + OPC_ATOMIC_OR = _OPC(6, 25), + OPC_ATOMIC_XOR = _OPC(6, 26), + OPC_LDGB = _OPC(6, 27), + OPC_STGB = _OPC(6, 28), + OPC_STIB = _OPC(6, 29), + OPC_LDC = _OPC(6, 30), + OPC_LDLV = _OPC(6, 31), + + /* category 7: */ + OPC_BAR = _OPC(7, 0), + OPC_FENCE = _OPC(7, 1), + + /* meta instructions (category -1): */ + /* placeholder instr to mark shader inputs: */ + OPC_META_INPUT = _OPC(-1, 0), + /* The "fan-in" and "fan-out" instructions are used for keeping + * track of instructions that write to multiple dst registers + * (fan-out) like texture sample instructions, or read multiple + * consecutive scalar registers (fan-in) (bary.f, texture samp) + */ + OPC_META_FO = _OPC(-1, 2), + OPC_META_FI = _OPC(-1, 3), + +} opc_t; + +#define opc_cat(opc) ((int)((opc) >> NOPC_BITS)) +#define opc_op(opc) ((unsigned)((opc) & ((1 << NOPC_BITS) - 1))) + +typedef enum { + TYPE_F16 = 0, + TYPE_F32 = 1, + TYPE_U16 = 2, + TYPE_U32 = 3, + TYPE_S16 = 4, + TYPE_S32 = 5, + TYPE_U8 = 6, + TYPE_S8 = 7, // XXX I assume? 
+} type_t; + +static inline uint32_t type_size(type_t type) +{ + switch (type) { + case TYPE_F32: + case TYPE_U32: + case TYPE_S32: + return 32; + case TYPE_F16: + case TYPE_U16: + case TYPE_S16: + return 16; + case TYPE_U8: + case TYPE_S8: + return 8; + default: + assert(0); /* invalid type */ + return 0; + } +} + +static inline int type_float(type_t type) +{ + return (type == TYPE_F32) || (type == TYPE_F16); +} + +static inline int type_uint(type_t type) +{ + return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8); +} + +static inline int type_sint(type_t type) +{ + return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8); +} + +typedef union PACKED { + /* normal gpr or const src register: */ + struct PACKED { + uint32_t comp : 2; + uint32_t num : 10; + }; + /* for immediate val: */ + int32_t iim_val : 11; + /* to make compiler happy: */ + uint32_t dummy32; + uint32_t dummy10 : 10; + int32_t idummy10 : 10; + uint32_t dummy11 : 11; + uint32_t dummy12 : 12; + uint32_t dummy13 : 13; + uint32_t dummy8 : 8; +} reg_t; + +/* special registers: */ +#define REG_A0 61 /* address register */ +#define REG_P0 62 /* predicate register */ + +static inline int reg_special(reg_t reg) +{ + return (reg.num == REG_A0) || (reg.num == REG_P0); +} + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + int16_t immed : 16; + uint32_t dummy1 : 16; + } a3xx; + struct PACKED { + int32_t immed : 20; + uint32_t dummy1 : 12; + } a4xx; + struct PACKED { + int32_t immed : 32; + } a5xx; + }; + + /* dword1: */ + uint32_t dummy2 : 8; + uint32_t repeat : 3; + uint32_t dummy3 : 1; + uint32_t ss : 1; + uint32_t dummy4 : 7; + uint32_t inv : 1; + uint32_t comp : 2; + uint32_t opc : 4; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat0_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + /* for normal src register: */ + struct PACKED { + uint32_t src : 11; + /* at least low bit of pad must be zero or it will + * look like an address relative src + */ + uint32_t pad : 21; + }; + /* for address relative: */ + struct PACKED { + int32_t off : 10; + uint32_t src_rel_c : 1; + uint32_t src_rel : 1; + uint32_t unknown : 20; + }; + /* for immediate: */ + int32_t iim_val; + uint32_t uim_val; + float fim_val; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 3; + uint32_t src_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; + uint32_t dst_type : 3; + uint32_t dst_rel : 1; + uint32_t src_type : 3; + uint32_t src_c : 1; + uint32_t src_im : 1; + uint32_t even : 1; + uint32_t pos_inf : 1; + uint32_t must_be_0 : 2; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat1_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + uint32_t src1 : 11; + uint32_t must_be_zero1: 2; + uint32_t src1_im : 1; /* immediate */ + uint32_t src1_neg : 1; /* negate */ + uint32_t src1_abs : 1; /* absolute value */ + }; + struct PACKED { + uint32_t src1 : 10; + uint32_t src1_c : 1; /* relative-const */ + uint32_t src1_rel : 1; /* relative address */ + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel1; + struct PACKED { + uint32_t src1 : 12; + uint32_t src1_c : 1; /* const */ + uint32_t dummy : 3; + } c1; + }; + + union PACKED { + struct PACKED { + uint32_t src2 : 11; + uint32_t must_be_zero2: 2; + uint32_t src2_im : 1; /* immediate */ + uint32_t src2_neg : 1; /* negate */ + uint32_t src2_abs : 1; /* absolute value */ + }; + struct PACKED { + uint32_t src2 : 10; + uint32_t src2_c : 1; /* relative-const */
+ uint32_t src2_rel : 1; /* relative address */ + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel2; + struct PACKED { + uint32_t src2 : 12; + uint32_t src2_c : 1; /* const */ + uint32_t dummy : 3; + } c2; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 2; + uint32_t sat : 1; + uint32_t src1_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; /* dunno */ + uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */ + uint32_t ei : 1; + uint32_t cond : 3; + uint32_t src2_r : 1; + uint32_t full : 1; /* not half */ + uint32_t opc : 6; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat2_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + uint32_t src1 : 11; + uint32_t must_be_zero1: 2; + uint32_t src2_c : 1; + uint32_t src1_neg : 1; + uint32_t src2_r : 1; + }; + struct PACKED { + uint32_t src1 : 10; + uint32_t src1_c : 1; + uint32_t src1_rel : 1; + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel1; + struct PACKED { + uint32_t src1 : 12; + uint32_t src1_c : 1; + uint32_t dummy : 3; + } c1; + }; + + union PACKED { + struct PACKED { + uint32_t src3 : 11; + uint32_t must_be_zero2: 2; + uint32_t src3_r : 1; + uint32_t src2_neg : 1; + uint32_t src3_neg : 1; + }; + struct PACKED { + uint32_t src3 : 10; + uint32_t src3_c : 1; + uint32_t src3_rel : 1; + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel2; + struct PACKED { + uint32_t src3 : 12; + uint32_t src3_c : 1; + uint32_t dummy : 3; + } c2; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 2; + uint32_t sat : 1; + uint32_t src1_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; + uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */ + uint32_t src2 : 8; + uint32_t opc : 4; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat3_t; + +static inline bool instr_cat3_full(instr_cat3_t *cat3) +{ + switch (_OPC(3, cat3->opc)) { + case OPC_MAD_F16: + case OPC_MAD_U16: + case OPC_MAD_S16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + case OPC_SAD_S32: // really?? + return false; + default: + return true; + } +} + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + uint32_t src : 11; + uint32_t must_be_zero1: 2; + uint32_t src_im : 1; /* immediate */ + uint32_t src_neg : 1; /* negate */ + uint32_t src_abs : 1; /* absolute value */ + }; + struct PACKED { + uint32_t src : 10; + uint32_t src_c : 1; /* relative-const */ + uint32_t src_rel : 1; /* relative address */ + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel; + struct PACKED { + uint32_t src : 12; + uint32_t src_c : 1; /* const */ + uint32_t dummy : 3; + } c; + }; + uint32_t dummy1 : 16; /* seem to be ignored */ + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 2; + uint32_t sat : 1; + uint32_t src_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; + uint32_t dst_half : 1; /* or widen/narrow.. ie. 
dst hrN <-> rN */ + uint32_t dummy2 : 5; /* seem to be ignored */ + uint32_t full : 1; /* not half */ + uint32_t opc : 6; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat4_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + /* normal case: */ + struct PACKED { + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t src2 : 8; + uint32_t dummy1 : 4; /* seem to be ignored */ + uint32_t samp : 4; + uint32_t tex : 7; + } norm; + /* s2en case: */ + struct PACKED { + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t src2 : 11; + uint32_t dummy1 : 1; + uint32_t src3 : 8; + uint32_t dummy2 : 3; + } s2en; + /* same in either case: */ + // XXX I think, confirm this + struct PACKED { + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t pad : 23; + }; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t wrmask : 4; /* write-mask */ + uint32_t type : 3; + uint32_t dummy2 : 1; /* seems to be ignored */ + uint32_t is_3d : 1; + + uint32_t is_a : 1; + uint32_t is_s : 1; + uint32_t is_s2en : 1; + uint32_t is_o : 1; + uint32_t is_p : 1; + + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat5_t; + +/* dword0 encoding for src_off: [src1 + off], src2: */ +typedef struct PACKED { + /* dword0: */ + uint32_t mustbe1 : 1; + int32_t off : 13; + uint32_t src1 : 8; + uint32_t src1_im : 1; + uint32_t src2_im : 1; + uint32_t src2 : 8; + + /* dword1: */ + uint32_t dword1; +} instr_cat6a_t; + +/* dword0 encoding for !src_off: [src1], src2 */ +typedef struct PACKED { + /* dword0: */ + uint32_t mustbe0 : 1; + uint32_t src1 : 13; + uint32_t ignore0 : 8; + uint32_t src1_im : 1; + uint32_t src2_im : 1; + uint32_t src2 : 8; + + /* dword1: */ + uint32_t dword1; +} instr_cat6b_t; + +/* dword1 encoding for dst_off: */ +typedef struct PACKED { + /* dword0: */ + uint32_t dword0; + + /* note: there is some weird stuff going on where sometimes + * cat6->a.off is involved.. but that seems like a bug in + * the blob, since it is used even if !cat6->src_off + * It would make sense for there to be some more bits to + * bring us to 11 bits worth of offset, but not sure.. + */ + int32_t off : 8; + uint32_t mustbe1 : 1; + uint32_t dst : 8; + uint32_t pad1 : 15; +} instr_cat6c_t; + +/* dword1 encoding for !dst_off: */ +typedef struct PACKED { + /* dword0: */ + uint32_t dword0; + + uint32_t dst : 8; + uint32_t mustbe0 : 1; + uint32_t idx : 8; + uint32_t pad0 : 15; +} instr_cat6d_t; + +/* ldgb and atomics.. + * + * ldgb: pad0=0, pad3=1 + * atomic .g: pad0=1, pad3=1 + * .l: pad0=1, pad3=0 + */ +typedef struct PACKED { + /* dword0: */ + uint32_t pad0 : 1; + uint32_t src3 : 8; + uint32_t d : 2; + uint32_t typed : 1; + uint32_t type_size : 2; + uint32_t src1 : 8; + uint32_t src1_im : 1; + uint32_t src2_im : 1; + uint32_t src2 : 8; + + /* dword1: */ + uint32_t dst : 8; + uint32_t mustbe0 : 1; + uint32_t src_ssbo : 8; + uint32_t pad2 : 3; // type + uint32_t g : 1; + uint32_t pad3 : 1; + uint32_t pad4 : 10; // opc/jmp_tgt/sync/opc_cat +} instr_cat6ldgb_t; + +/* stgb, pad0=0, pad3=2 + */ +typedef struct PACKED { + /* dword0: */ + uint32_t mustbe1 : 1; // ??? 
+ uint32_t src1 : 8; + uint32_t d : 2; + uint32_t typed : 1; + uint32_t type_size : 2; + uint32_t pad0 : 9; + uint32_t src2_im : 1; + uint32_t src2 : 8; + + /* dword1: */ + uint32_t src3 : 8; + uint32_t src3_im : 1; + uint32_t dst_ssbo : 8; + uint32_t pad2 : 3; // type + uint32_t pad3 : 2; + uint32_t pad4 : 10; // opc/jmp_tgt/sync/opc_cat +} instr_cat6stgb_t; + +typedef union PACKED { + instr_cat6a_t a; + instr_cat6b_t b; + instr_cat6c_t c; + instr_cat6d_t d; + instr_cat6ldgb_t ldgb; + instr_cat6stgb_t stgb; + struct PACKED { + /* dword0: */ + uint32_t src_off : 1; + uint32_t pad1 : 31; + + /* dword1: */ + uint32_t pad2 : 8; + uint32_t dst_off : 1; + uint32_t pad3 : 8; + uint32_t type : 3; + uint32_t g : 1; /* or in some cases it means dst immed */ + uint32_t pad4 : 1; + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; + }; +} instr_cat6_t; + +typedef struct PACKED { + /* dword0: */ + uint32_t pad1 : 32; + + /* dword1: */ + uint32_t pad2 : 12; + uint32_t ss : 1; /* maybe in the encoding, but blob only uses (sy) */ + uint32_t pad3 : 6; + uint32_t w : 1; /* write */ + uint32_t r : 1; /* read */ + uint32_t l : 1; /* local */ + uint32_t g : 1; /* global */ + uint32_t opc : 4; /* presumed, but only a couple known OPCs */ + uint32_t jmp_tgt : 1; /* (jp) */ + uint32_t sync : 1; /* (sy) */ + uint32_t opc_cat : 3; +} instr_cat7_t; + +typedef union PACKED { + instr_cat0_t cat0; + instr_cat1_t cat1; + instr_cat2_t cat2; + instr_cat3_t cat3; + instr_cat4_t cat4; + instr_cat5_t cat5; + instr_cat6_t cat6; + instr_cat7_t cat7; + struct PACKED { + /* dword0: */ + uint32_t pad1 : 32; + + /* dword1: */ + uint32_t pad2 : 12; + uint32_t ss : 1; /* cat1-cat4 (cat0??) and cat7 (?) */ + uint32_t ul : 1; /* cat2-cat4 (and cat1 in blob.. which may be bug??) 
*/ + uint32_t pad3 : 13; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; + + }; +} instr_t; + +static inline uint32_t instr_repeat(instr_t *instr) +{ + switch (instr->opc_cat) { + case 0: return instr->cat0.repeat; + case 1: return instr->cat1.repeat; + case 2: return instr->cat2.repeat; + case 3: return instr->cat3.repeat; + case 4: return instr->cat4.repeat; + default: return 0; + } +} + +static inline bool instr_sat(instr_t *instr) +{ + switch (instr->opc_cat) { + case 2: return instr->cat2.sat; + case 3: return instr->cat3.sat; + case 4: return instr->cat4.sat; + default: return false; + } +} + +static inline uint32_t instr_opc(instr_t *instr) +{ + switch (instr->opc_cat) { + case 0: return instr->cat0.opc; + case 1: return 0; + case 2: return instr->cat2.opc; + case 3: return instr->cat3.opc; + case 4: return instr->cat4.opc; + case 5: return instr->cat5.opc; + case 6: return instr->cat6.opc; + case 7: return instr->cat7.opc; + default: return 0; + } +} + +static inline bool is_mad(opc_t opc) +{ + switch (opc) { + case OPC_MAD_U16: + case OPC_MAD_S16: + case OPC_MAD_U24: + case OPC_MAD_S24: + case OPC_MAD_F16: + case OPC_MAD_F32: + return true; + default: + return false; + } +} + +static inline bool is_madsh(opc_t opc) +{ + switch (opc) { + case OPC_MADSH_U16: + case OPC_MADSH_M16: + return true; + default: + return false; + } +} + +static inline bool is_atomic(opc_t opc) +{ + switch (opc) { + case OPC_ATOMIC_ADD: + case OPC_ATOMIC_SUB: + case OPC_ATOMIC_XCHG: + case OPC_ATOMIC_INC: + case OPC_ATOMIC_DEC: + case OPC_ATOMIC_CMPXCHG: + case OPC_ATOMIC_MIN: + case OPC_ATOMIC_MAX: + case OPC_ATOMIC_AND: + case OPC_ATOMIC_OR: + case OPC_ATOMIC_XOR: + return true; + default: + return false; + } +} + +static inline bool is_ssbo(opc_t opc) +{ + switch (opc) { + case OPC_RESFMT: + case OPC_RESINFO: + case OPC_LDGB: + case OPC_STGB: + case OPC_STIB: + return true; + default: + return false; + } +} + +int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out); + +#endif /* INSTR_A3XX_H_ */ diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c new file mode 100644 index 00000000000..3d1c4449b12 --- /dev/null +++ b/src/freedreno/ir3/ir3.c @@ -0,0 +1,941 @@ +/* + * Copyright (c) 2012 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ir3.h" + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include <stdbool.h> +#include <errno.h> + +#include "util/bitscan.h" +#include "util/ralloc.h" +#include "util/u_math.h" + +#include "instr-a3xx.h" + +/* simple allocator to carve allocations out of an up-front allocated heap, + * so that we can free everything easily in one shot. + */ +void * ir3_alloc(struct ir3 *shader, int sz) +{ + return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */ +} + +struct ir3 * ir3_create(struct ir3_compiler *compiler, + unsigned nin, unsigned nout) +{ + struct ir3 *shader = rzalloc(compiler, struct ir3); + + shader->compiler = compiler; + shader->ninputs = nin; + shader->inputs = ir3_alloc(shader, sizeof(shader->inputs[0]) * nin); + + shader->noutputs = nout; + shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout); + + list_inithead(&shader->block_list); + list_inithead(&shader->array_list); + + return shader; +} + +void ir3_destroy(struct ir3 *shader) +{ + ralloc_free(shader); +} + +#define iassert(cond) do { \ + if (!(cond)) { \ + debug_assert(cond); \ + return -1; \ + } } while (0) + +#define iassert_type(reg, full) do { \ + if ((full)) { \ + iassert(!((reg)->flags & IR3_REG_HALF)); \ + } else { \ + iassert((reg)->flags & IR3_REG_HALF); \ + } } while (0); + +static uint32_t reg(struct ir3_register *reg, struct ir3_info *info, + uint32_t repeat, uint32_t valid_flags) +{ + reg_t val = { .dummy32 = 0 }; + + if (reg->flags & ~valid_flags) { + debug_printf("INVALID FLAGS: %x vs %x\n", + reg->flags, valid_flags); + } + + if (!(reg->flags & IR3_REG_R)) + repeat = 0; + + if (reg->flags & IR3_REG_IMMED) { + val.iim_val = reg->iim_val; + } else { + unsigned components; + int16_t max; + + if (reg->flags & IR3_REG_RELATIV) { + components = reg->size; + val.idummy10 = reg->array.offset; + max = (reg->array.offset + repeat + components - 1) >> 2; + } else { + components = util_last_bit(reg->wrmask); + val.comp = reg->num & 0x3; + val.num = reg->num >> 2; + max = (reg->num + repeat + components - 1) >> 2; + } + + if (reg->flags & IR3_REG_CONST) { + info->max_const = MAX2(info->max_const, max); + } else if (val.num == 63) { + /* ignore writes to dummy register r63.x */ + } else if (max < 48) { + if (reg->flags & IR3_REG_HALF) { + if (info->gpu_id >= 600) { + /* starting w/ a6xx, half regs conflict with full regs: */ + info->max_reg = MAX2(info->max_reg, (max+1)/2); + } else { + info->max_half_reg = MAX2(info->max_half_reg, max); + } + } else { + info->max_reg = MAX2(info->max_reg, max); + } + } + } + + return val.dummy32; +} + +static int emit_cat0(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + instr_cat0_t *cat0 = ptr; + + if (info->gpu_id >= 500) { + cat0->a5xx.immed = instr->cat0.immed; + } else if (info->gpu_id >= 400) { + cat0->a4xx.immed = instr->cat0.immed; + } else { + cat0->a3xx.immed = instr->cat0.immed; + } + cat0->repeat = instr->repeat; + cat0->ss = !!(instr->flags & IR3_INSTR_SS); + cat0->inv = instr->cat0.inv; + cat0->comp = instr->cat0.comp; + cat0->opc = instr->opc; + cat0->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat0->sync = !!(instr->flags & IR3_INSTR_SY); + cat0->opc_cat = 0; + + return 0; +} + +static int emit_cat1(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + instr_cat1_t *cat1 = ptr; + + iassert(instr->regs_count == 2); + iassert_type(dst, type_size(instr->cat1.dst_type) 
== 32); + if (!(src->flags & IR3_REG_IMMED)) + iassert_type(src, type_size(instr->cat1.src_type) == 32); + + if (src->flags & IR3_REG_IMMED) { + cat1->iim_val = src->iim_val; + cat1->src_im = 1; + } else if (src->flags & IR3_REG_RELATIV) { + cat1->off = reg(src, info, instr->repeat, + IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF | IR3_REG_RELATIV); + cat1->src_rel = 1; + cat1->src_rel_c = !!(src->flags & IR3_REG_CONST); + } else { + cat1->src = reg(src, info, instr->repeat, + IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF); + cat1->src_c = !!(src->flags & IR3_REG_CONST); + } + + cat1->dst = reg(dst, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_EVEN | + IR3_REG_R | IR3_REG_POS_INF | IR3_REG_HALF); + cat1->repeat = instr->repeat; + cat1->src_r = !!(src->flags & IR3_REG_R); + cat1->ss = !!(instr->flags & IR3_INSTR_SS); + cat1->ul = !!(instr->flags & IR3_INSTR_UL); + cat1->dst_type = instr->cat1.dst_type; + cat1->dst_rel = !!(dst->flags & IR3_REG_RELATIV); + cat1->src_type = instr->cat1.src_type; + cat1->even = !!(dst->flags & IR3_REG_EVEN); + cat1->pos_inf = !!(dst->flags & IR3_REG_POS_INF); + cat1->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat1->sync = !!(instr->flags & IR3_INSTR_SY); + cat1->opc_cat = 1; + + return 0; +} + +static int emit_cat2(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src1 = instr->regs[1]; + struct ir3_register *src2 = instr->regs[2]; + instr_cat2_t *cat2 = ptr; + unsigned absneg = ir3_cat2_absneg(instr->opc); + + iassert((instr->regs_count == 2) || (instr->regs_count == 3)); + + if (src1->flags & IR3_REG_RELATIV) { + iassert(src1->array.offset < (1 << 10)); + cat2->rel1.src1 = reg(src1, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | + IR3_REG_HALF | absneg); + cat2->rel1.src1_c = !!(src1->flags & IR3_REG_CONST); + cat2->rel1.src1_rel = 1; + } else if (src1->flags & IR3_REG_CONST) { + iassert(src1->num < (1 << 12)); + cat2->c1.src1 = reg(src1, info, instr->repeat, + IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF); + cat2->c1.src1_c = 1; + } else { + iassert(src1->num < (1 << 11)); + cat2->src1 = reg(src1, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF | + absneg); + } + cat2->src1_im = !!(src1->flags & IR3_REG_IMMED); + cat2->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)); + cat2->src1_abs = !!(src1->flags & (IR3_REG_FABS | IR3_REG_SABS)); + cat2->src1_r = !!(src1->flags & IR3_REG_R); + + if (src2) { + iassert((src2->flags & IR3_REG_IMMED) || + !((src1->flags ^ src2->flags) & IR3_REG_HALF)); + + if (src2->flags & IR3_REG_RELATIV) { + iassert(src2->array.offset < (1 << 10)); + cat2->rel2.src2 = reg(src2, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | + IR3_REG_HALF | absneg); + cat2->rel2.src2_c = !!(src2->flags & IR3_REG_CONST); + cat2->rel2.src2_rel = 1; + } else if (src2->flags & IR3_REG_CONST) { + iassert(src2->num < (1 << 12)); + cat2->c2.src2 = reg(src2, info, instr->repeat, + IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF); + cat2->c2.src2_c = 1; + } else { + iassert(src2->num < (1 << 11)); + cat2->src2 = reg(src2, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF | + absneg); + } + + cat2->src2_im = !!(src2->flags & IR3_REG_IMMED); + cat2->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)); + cat2->src2_abs = !!(src2->flags & (IR3_REG_FABS | IR3_REG_SABS)); + cat2->src2_r = !!(src2->flags & IR3_REG_R); + } + + cat2->dst = reg(dst, info, instr->repeat, + IR3_REG_R 
| IR3_REG_EI | IR3_REG_HALF); + cat2->repeat = instr->repeat; + cat2->sat = !!(instr->flags & IR3_INSTR_SAT); + cat2->ss = !!(instr->flags & IR3_INSTR_SS); + cat2->ul = !!(instr->flags & IR3_INSTR_UL); + cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF); + cat2->ei = !!(dst->flags & IR3_REG_EI); + cat2->cond = instr->cat2.condition; + cat2->full = ! (src1->flags & IR3_REG_HALF); + cat2->opc = instr->opc; + cat2->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat2->sync = !!(instr->flags & IR3_INSTR_SY); + cat2->opc_cat = 2; + + return 0; +} + +static int emit_cat3(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src1 = instr->regs[1]; + struct ir3_register *src2 = instr->regs[2]; + struct ir3_register *src3 = instr->regs[3]; + unsigned absneg = ir3_cat3_absneg(instr->opc); + instr_cat3_t *cat3 = ptr; + uint32_t src_flags = 0; + + switch (instr->opc) { + case OPC_MAD_F16: + case OPC_MAD_U16: + case OPC_MAD_S16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + case OPC_SAD_S32: // really?? + src_flags |= IR3_REG_HALF; + break; + default: + break; + } + + iassert(instr->regs_count == 4); + iassert(!((src1->flags ^ src_flags) & IR3_REG_HALF)); + iassert(!((src2->flags ^ src_flags) & IR3_REG_HALF)); + iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF)); + + if (src1->flags & IR3_REG_RELATIV) { + iassert(src1->array.offset < (1 << 10)); + cat3->rel1.src1 = reg(src1, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | + IR3_REG_HALF | absneg); + cat3->rel1.src1_c = !!(src1->flags & IR3_REG_CONST); + cat3->rel1.src1_rel = 1; + } else if (src1->flags & IR3_REG_CONST) { + iassert(src1->num < (1 << 12)); + cat3->c1.src1 = reg(src1, info, instr->repeat, + IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF); + cat3->c1.src1_c = 1; + } else { + iassert(src1->num < (1 << 11)); + cat3->src1 = reg(src1, info, instr->repeat, + IR3_REG_R | IR3_REG_HALF | absneg); + } + + cat3->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)); + cat3->src1_r = !!(src1->flags & IR3_REG_R); + + cat3->src2 = reg(src2, info, instr->repeat, + IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg); + cat3->src2_c = !!(src2->flags & IR3_REG_CONST); + cat3->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)); + cat3->src2_r = !!(src2->flags & IR3_REG_R); + + + if (src3->flags & IR3_REG_RELATIV) { + iassert(src3->array.offset < (1 << 10)); + cat3->rel2.src3 = reg(src3, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | + IR3_REG_HALF | absneg); + cat3->rel2.src3_c = !!(src3->flags & IR3_REG_CONST); + cat3->rel2.src3_rel = 1; + } else if (src3->flags & IR3_REG_CONST) { + iassert(src3->num < (1 << 12)); + cat3->c2.src3 = reg(src3, info, instr->repeat, + IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF); + cat3->c2.src3_c = 1; + } else { + iassert(src3->num < (1 << 11)); + cat3->src3 = reg(src3, info, instr->repeat, + IR3_REG_R | IR3_REG_HALF | absneg); + } + + cat3->src3_neg = !!(src3->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)); + cat3->src3_r = !!(src3->flags & IR3_REG_R); + + cat3->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat3->repeat = instr->repeat; + cat3->sat = !!(instr->flags & IR3_INSTR_SAT); + cat3->ss = !!(instr->flags & IR3_INSTR_SS); + cat3->ul = !!(instr->flags & IR3_INSTR_UL); + cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF); + cat3->opc = instr->opc; + cat3->jmp_tgt = !!(instr->flags 
& IR3_INSTR_JP); + cat3->sync = !!(instr->flags & IR3_INSTR_SY); + cat3->opc_cat = 3; + + return 0; +} + +static int emit_cat4(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + instr_cat4_t *cat4 = ptr; + + iassert(instr->regs_count == 2); + + if (src->flags & IR3_REG_RELATIV) { + iassert(src->array.offset < (1 << 10)); + cat4->rel.src = reg(src, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG | + IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF); + cat4->rel.src_c = !!(src->flags & IR3_REG_CONST); + cat4->rel.src_rel = 1; + } else if (src->flags & IR3_REG_CONST) { + iassert(src->num < (1 << 12)); + cat4->c.src = reg(src, info, instr->repeat, + IR3_REG_CONST | IR3_REG_FNEG | IR3_REG_FABS | + IR3_REG_R | IR3_REG_HALF); + cat4->c.src_c = 1; + } else { + iassert(src->num < (1 << 11)); + cat4->src = reg(src, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS | + IR3_REG_R | IR3_REG_HALF); + } + + cat4->src_im = !!(src->flags & IR3_REG_IMMED); + cat4->src_neg = !!(src->flags & IR3_REG_FNEG); + cat4->src_abs = !!(src->flags & IR3_REG_FABS); + cat4->src_r = !!(src->flags & IR3_REG_R); + + cat4->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat4->repeat = instr->repeat; + cat4->sat = !!(instr->flags & IR3_INSTR_SAT); + cat4->ss = !!(instr->flags & IR3_INSTR_SS); + cat4->ul = !!(instr->flags & IR3_INSTR_UL); + cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF); + cat4->full = ! (src->flags & IR3_REG_HALF); + cat4->opc = instr->opc; + cat4->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat4->sync = !!(instr->flags & IR3_INSTR_SY); + cat4->opc_cat = 4; + + return 0; +} + +static int emit_cat5(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src1 = instr->regs[1]; + struct ir3_register *src2 = instr->regs[2]; + struct ir3_register *src3 = instr->regs[3]; + instr_cat5_t *cat5 = ptr; + + iassert_type(dst, type_size(instr->cat5.type) == 32) + + assume(src1 || !src2); + assume(src2 || !src3); + + if (src1) { + cat5->full = ! 
(src1->flags & IR3_REG_HALF); + cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF); + } + + if (instr->flags & IR3_INSTR_S2EN) { + if (src2) { + iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); + cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); + } + if (src3) { + iassert(src3->flags & IR3_REG_HALF); + cat5->s2en.src3 = reg(src3, info, instr->repeat, IR3_REG_HALF); + } + iassert(!(instr->cat5.samp | instr->cat5.tex)); + } else { + iassert(!src3); + if (src2) { + iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); + cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); + } + cat5->norm.samp = instr->cat5.samp; + cat5->norm.tex = instr->cat5.tex; + } + + cat5->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat5->wrmask = dst->wrmask; + cat5->type = instr->cat5.type; + cat5->is_3d = !!(instr->flags & IR3_INSTR_3D); + cat5->is_a = !!(instr->flags & IR3_INSTR_A); + cat5->is_s = !!(instr->flags & IR3_INSTR_S); + cat5->is_s2en = !!(instr->flags & IR3_INSTR_S2EN); + cat5->is_o = !!(instr->flags & IR3_INSTR_O); + cat5->is_p = !!(instr->flags & IR3_INSTR_P); + cat5->opc = instr->opc; + cat5->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat5->sync = !!(instr->flags & IR3_INSTR_SY); + cat5->opc_cat = 5; + + return 0; +} + +static int emit_cat6(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst, *src1, *src2; + instr_cat6_t *cat6 = ptr; + bool type_full = type_size(instr->cat6.type) == 32; + + cat6->type = instr->cat6.type; + cat6->opc = instr->opc; + cat6->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat6->sync = !!(instr->flags & IR3_INSTR_SY); + cat6->g = !!(instr->flags & IR3_INSTR_G); + cat6->opc_cat = 6; + + switch (instr->opc) { + case OPC_RESINFO: + case OPC_RESFMT: + iassert_type(instr->regs[0], type_full); /* dst */ + iassert_type(instr->regs[1], type_full); /* src1 */ + break; + case OPC_L2G: + case OPC_G2L: + iassert_type(instr->regs[0], true); /* dst */ + iassert_type(instr->regs[1], true); /* src1 */ + break; + case OPC_STG: + case OPC_STL: + case OPC_STP: + case OPC_STI: + case OPC_STLW: + case OPC_STIB: + /* no dst, so regs[0] is dummy */ + iassert_type(instr->regs[1], true); /* dst */ + iassert_type(instr->regs[2], type_full); /* src1 */ + iassert_type(instr->regs[3], true); /* src2 */ + break; + default: + iassert_type(instr->regs[0], type_full); /* dst */ + iassert_type(instr->regs[1], true); /* src1 */ + if (instr->regs_count > 2) + iassert_type(instr->regs[2], true); /* src1 */ + break; + } + + /* the "dst" for a store instruction is (from the perspective + * of data flow in the shader, ie. register use/def, etc) in + * fact a register that is read by the instruction, rather + * than written: + */ + if (is_store(instr)) { + iassert(instr->regs_count >= 3); + + dst = instr->regs[1]; + src1 = instr->regs[2]; + src2 = (instr->regs_count >= 4) ? instr->regs[3] : NULL; + } else { + iassert(instr->regs_count >= 2); + + dst = instr->regs[0]; + src1 = instr->regs[1]; + src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL; + } + + /* TODO we need a more comprehensive list about which instructions + * can be encoded which way. Or possibly use IR3_INSTR_0 flag to + * indicate to use the src_off encoding even if offset is zero + * (but then what to do about dst_off?) + */ + if (is_atomic(instr->opc)) { + instr_cat6ldgb_t *ldgb = ptr; + + /* maybe these two bits both determine the instruction encoding? 
*/ + cat6->src_off = false; + + ldgb->d = instr->cat6.d - 1; + ldgb->typed = instr->cat6.typed; + ldgb->type_size = instr->cat6.iim_val - 1; + + ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + + if (ldgb->g) { + struct ir3_register *src3 = instr->regs[3]; + struct ir3_register *src4 = instr->regs[4]; + + /* first src is src_ssbo: */ + iassert(src1->flags & IR3_REG_IMMED); + ldgb->src_ssbo = src1->uim_val; + + ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED); + ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED); + ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED); + ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED); + + ldgb->src3 = reg(src4, info, instr->repeat, 0); + ldgb->pad0 = 0x1; + ldgb->pad3 = 0x1; + } else { + ldgb->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED); + ldgb->src1_im = !!(src1->flags & IR3_REG_IMMED); + ldgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); + ldgb->src2_im = !!(src2->flags & IR3_REG_IMMED); + ldgb->pad0 = 0x1; + ldgb->pad3 = 0x0; + } + + return 0; + } else if (instr->opc == OPC_LDGB) { + struct ir3_register *src3 = instr->regs[3]; + instr_cat6ldgb_t *ldgb = ptr; + + /* maybe these two bits both determine the instruction encoding? */ + cat6->src_off = false; + + ldgb->d = instr->cat6.d - 1; + ldgb->typed = instr->cat6.typed; + ldgb->type_size = instr->cat6.iim_val - 1; + + ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + + /* first src is src_ssbo: */ + iassert(src1->flags & IR3_REG_IMMED); + ldgb->src_ssbo = src1->uim_val; + + /* then next two are src1/src2: */ + ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED); + ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED); + ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED); + ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED); + + ldgb->pad0 = 0x0; + ldgb->pad3 = 0x1; + + return 0; + } else if (instr->opc == OPC_RESINFO) { + instr_cat6ldgb_t *ldgb = ptr; + + ldgb->d = instr->cat6.d - 1; + + ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + + /* first src is src_ssbo: */ + iassert(src1->flags & IR3_REG_IMMED); + ldgb->src_ssbo = src1->uim_val; + + return 0; + } else if ((instr->opc == OPC_STGB) || (instr->opc == OPC_STIB)) { + struct ir3_register *src3 = instr->regs[4]; + instr_cat6stgb_t *stgb = ptr; + + /* maybe these two bits both determine the instruction encoding? 
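+ * (From the code in this function, the pattern seems to be that ldgb + * and the atomics use src_off=false, while stgb/stib use src_off=true + * plus pad3=0x2; just an observation from this file, not confirmed + * against any hw documentation.)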
*/ + cat6->src_off = true; + stgb->pad3 = 0x2; + + stgb->d = instr->cat6.d - 1; + stgb->typed = instr->cat6.typed; + stgb->type_size = instr->cat6.iim_val - 1; + + /* first src is dst_ssbo: */ + iassert(dst->flags & IR3_REG_IMMED); + stgb->dst_ssbo = dst->uim_val; + + /* then src1/src2/src3: */ + stgb->src1 = reg(src1, info, instr->repeat, 0); + stgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); + stgb->src2_im = !!(src2->flags & IR3_REG_IMMED); + stgb->src3 = reg(src3, info, instr->repeat, IR3_REG_IMMED); + stgb->src3_im = !!(src3->flags & IR3_REG_IMMED); + + return 0; + } else if (instr->cat6.src_offset || (instr->opc == OPC_LDG) || + (instr->opc == OPC_LDL)) { + instr_cat6a_t *cat6a = ptr; + + cat6->src_off = true; + + cat6a->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED); + cat6a->src1_im = !!(src1->flags & IR3_REG_IMMED); + if (src2) { + cat6a->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); + cat6a->src2_im = !!(src2->flags & IR3_REG_IMMED); + } + cat6a->off = instr->cat6.src_offset; + } else { + instr_cat6b_t *cat6b = ptr; + + cat6->src_off = false; + + cat6b->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED | IR3_REG_HALF); + cat6b->src1_im = !!(src1->flags & IR3_REG_IMMED); + if (src2) { + cat6b->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); + cat6b->src2_im = !!(src2->flags & IR3_REG_IMMED); + } + } + + if (instr->cat6.dst_offset || (instr->opc == OPC_STG) || + (instr->opc == OPC_STL)) { + instr_cat6c_t *cat6c = ptr; + cat6->dst_off = true; + cat6c->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat6c->off = instr->cat6.dst_offset; + } else { + instr_cat6d_t *cat6d = ptr; + cat6->dst_off = false; + cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + } + + return 0; +} + +static int emit_cat7(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + instr_cat7_t *cat7 = ptr; + + cat7->ss = !!(instr->flags & IR3_INSTR_SS); + cat7->w = instr->cat7.w; + cat7->r = instr->cat7.r; + cat7->l = instr->cat7.l; + cat7->g = instr->cat7.g; + cat7->opc = instr->opc; + cat7->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat7->sync = !!(instr->flags & IR3_INSTR_SY); + cat7->opc_cat = 7; + + return 0; +} + +static int (*emit[])(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) = { + emit_cat0, emit_cat1, emit_cat2, emit_cat3, emit_cat4, emit_cat5, emit_cat6, + emit_cat7, +}; + +void * ir3_assemble(struct ir3 *shader, struct ir3_info *info, + uint32_t gpu_id) +{ + uint32_t *ptr, *dwords; + + info->gpu_id = gpu_id; + info->max_reg = -1; + info->max_half_reg = -1; + info->max_const = -1; + info->instrs_count = 0; + info->sizedwords = 0; + info->ss = info->sy = 0; + + list_for_each_entry (struct ir3_block, block, &shader->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + info->sizedwords += 2; + } + } + + /* need an integer number of instruction "groups" (sets of 16 + * instructions on a4xx or sets of 4 instructions on a3xx), + * so pad out w/ NOPs if needed: (NOTE each instruction is 64bits) + */ + if (gpu_id >= 400) { + info->sizedwords = align(info->sizedwords, 16 * 2); + } else { + info->sizedwords = align(info->sizedwords, 4 * 2); + } + + ptr = dwords = calloc(4, info->sizedwords); + + list_for_each_entry (struct ir3_block, block, &shader->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + int ret = emit[opc_cat(instr->opc)](instr, dwords, info); + if (ret) + goto fail; + info->instrs_count 
+= 1 + instr->repeat; + dwords += 2; + + if (instr->flags & IR3_INSTR_SS) + info->ss++; + + if (instr->flags & IR3_INSTR_SY) + info->sy++; + } + } + + return ptr; + +fail: + free(ptr); + return NULL; +} + +static struct ir3_register * reg_create(struct ir3 *shader, + int num, int flags) +{ + struct ir3_register *reg = + ir3_alloc(shader, sizeof(struct ir3_register)); + reg->wrmask = 1; + reg->flags = flags; + reg->num = num; + return reg; +} + +static void insert_instr(struct ir3_block *block, + struct ir3_instruction *instr) +{ + struct ir3 *shader = block->shader; +#ifdef DEBUG + instr->serialno = ++shader->instr_count; +#endif + list_addtail(&instr->node, &block->instr_list); + + if (is_input(instr)) + array_insert(shader, shader->baryfs, instr); +} + +struct ir3_block * ir3_block_create(struct ir3 *shader) +{ + struct ir3_block *block = ir3_alloc(shader, sizeof(*block)); +#ifdef DEBUG + block->serialno = ++shader->block_count; +#endif + block->shader = shader; + list_inithead(&block->node); + list_inithead(&block->instr_list); + return block; +} + +static struct ir3_instruction *instr_create(struct ir3_block *block, int nreg) +{ + struct ir3_instruction *instr; + unsigned sz = sizeof(*instr) + (nreg * sizeof(instr->regs[0])); + char *ptr = ir3_alloc(block->shader, sz); + + instr = (struct ir3_instruction *)ptr; + ptr += sizeof(*instr); + instr->regs = (struct ir3_register **)ptr; + +#ifdef DEBUG + instr->regs_max = nreg; +#endif + + return instr; +} + +struct ir3_instruction * ir3_instr_create2(struct ir3_block *block, + opc_t opc, int nreg) +{ + struct ir3_instruction *instr = instr_create(block, nreg); + instr->block = block; + instr->opc = opc; + insert_instr(block, instr); + return instr; +} + +struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc) +{ + /* NOTE: we could be slightly more clever, at least for non-meta, + * and choose # of regs based on category. 
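+ * For example, cat2 takes at most a dst plus two srcs (emit_cat2 + * asserts regs_count is 2 or 3), so nreg=3 would suffice there; the + * code below simply uses 4 for everything.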
+ */ + return ir3_instr_create2(block, opc, 4); +} + +struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr) +{ + struct ir3_instruction *new_instr = instr_create(instr->block, + instr->regs_count); + struct ir3_register **regs; + unsigned i; + + regs = new_instr->regs; + *new_instr = *instr; + new_instr->regs = regs; + + insert_instr(instr->block, new_instr); + + /* clone registers: */ + new_instr->regs_count = 0; + for (i = 0; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + struct ir3_register *new_reg = + ir3_reg_create(new_instr, reg->num, reg->flags); + *new_reg = *reg; + } + + return new_instr; +} + +/* Add a false dependency to instruction, to ensure it is scheduled first: */ +void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep) +{ + array_insert(instr, instr->deps, dep); +} + +struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, + int num, int flags) +{ + struct ir3 *shader = instr->block->shader; + struct ir3_register *reg = reg_create(shader, num, flags); +#ifdef DEBUG + debug_assert(instr->regs_count < instr->regs_max); +#endif + instr->regs[instr->regs_count++] = reg; + return reg; +} + +struct ir3_register * ir3_reg_clone(struct ir3 *shader, + struct ir3_register *reg) +{ + struct ir3_register *new_reg = reg_create(shader, 0, 0); + *new_reg = *reg; + return new_reg; +} + +void +ir3_instr_set_address(struct ir3_instruction *instr, + struct ir3_instruction *addr) +{ + if (instr->address != addr) { + struct ir3 *ir = instr->block->shader; + instr->address = addr; + array_insert(ir, ir->indirects, instr); + } +} + +void +ir3_block_clear_mark(struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) + instr->flags &= ~IR3_INSTR_MARK; +} + +void +ir3_clear_mark(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + ir3_block_clear_mark(block); + } +} + +/* note: this will destroy instr->depth, don't do it until after sched! 
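+ * (ip and depth share a union in struct ir3_instruction, so the + * instr->ip writes below clobber any previously computed depth.)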
*/ +unsigned +ir3_count_instructions(struct ir3 *ir) +{ + unsigned cnt = 0; + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + instr->ip = cnt++; + } + block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip; + block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip; + } + return cnt; +} + +struct ir3_array * +ir3_lookup_array(struct ir3 *ir, unsigned id) +{ + list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) + if (arr->id == id) + return arr; + return NULL; +} diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h new file mode 100644 index 00000000000..ea3218828df --- /dev/null +++ b/src/freedreno/ir3/ir3.h @@ -0,0 +1,1394 @@ +/* + * Copyright (c) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IR3_H_ +#define IR3_H_ + +#include <stdint.h> +#include <stdbool.h> + +#include "compiler/shader_enums.h" + +#include "util/u_debug.h" +#include "util/list.h" + +#include "instr-a3xx.h" + +/* low level intermediate representation of an adreno shader program */ + +struct ir3_compiler; +struct ir3; +struct ir3_instruction; +struct ir3_block; + +struct ir3_info { + uint32_t gpu_id; + uint16_t sizedwords; + uint16_t instrs_count; /* expanded to account for rpt's */ + /* NOTE: max_reg, etc, does not include registers not touched + * by the shader (ie. vertex fetched via VFD_DECODE but not + * touched by shader) + */ + int8_t max_reg; /* highest GPR # used by shader */ + int8_t max_half_reg; + int16_t max_const; + + /* number of sync bits: */ + uint16_t ss, sy; +}; + +struct ir3_register { + enum { + IR3_REG_CONST = 0x001, + IR3_REG_IMMED = 0x002, + IR3_REG_HALF = 0x004, + /* high registers are used for some things in compute shaders, + * for example. Seems to be for things that are global to all + * threads in a wave, so possibly these are global/shared by + * all the threads in the wave? + */ + IR3_REG_HIGH = 0x008, + IR3_REG_RELATIV= 0x010, + IR3_REG_R = 0x020, + /* Most instructions, it seems, can do float abs/neg but not + * integer. The CP pass needs to know what is intended (int or + * float) in order to do the right thing. For this reason the + * abs/neg flags are split out into float and int variants. 
In + * addition, .b (bitwise) operations, the negate is actually a + * bitwise not, so split that out into a new flag to make it + * more clear. + */ + IR3_REG_FNEG = 0x040, + IR3_REG_FABS = 0x080, + IR3_REG_SNEG = 0x100, + IR3_REG_SABS = 0x200, + IR3_REG_BNOT = 0x400, + IR3_REG_EVEN = 0x800, + IR3_REG_POS_INF= 0x1000, + /* (ei) flag, end-input? Set on last bary, presumably to signal + * that the shader needs no more input: + */ + IR3_REG_EI = 0x2000, + /* meta-flags, for intermediate stages of IR, ie. + * before register assignment is done: + */ + IR3_REG_SSA = 0x4000, /* 'instr' is ptr to assigning instr */ + IR3_REG_ARRAY = 0x8000, + + } flags; + + /* normal registers: + * the component is in the low two bits of the reg #, so + * rN.x becomes: (N << 2) | x + */ + int num; + union { + /* immediate: */ + int32_t iim_val; + uint32_t uim_val; + float fim_val; + /* relative: */ + struct { + uint16_t id; + int16_t offset; + } array; + }; + + /* For IR3_REG_SSA, src registers contain ptr back to assigning + * instruction. + * + * For IR3_REG_ARRAY, the pointer is back to the last dependent + * array access (although the net effect is the same, it points + * back to a previous instruction that we depend on). + */ + struct ir3_instruction *instr; + + union { + /* used for cat5 instructions, but also for internal/IR level + * tracking of what registers are read/written by an instruction. + * wrmask may be a bad name since it is used to represent both + * src and dst that touch multiple adjacent registers. + */ + unsigned wrmask; + /* for relative addressing, 32bits for array size is too small, + * but otoh we don't need to deal with disjoint sets, so instead + * use a simple size field (number of scalar components). + */ + unsigned size; + }; +}; + +/* + * Stupid/simple growable array implementation: + */ +#define DECLARE_ARRAY(type, name) \ + unsigned name ## _count, name ## _sz; \ + type * name; + +#define array_insert(ctx, arr, val) do { \ + if (arr ## _count == arr ## _sz) { \ + arr ## _sz = MAX2(2 * arr ## _sz, 16); \ + arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \ + } \ + arr[arr ##_count++] = val; \ + } while (0) + +struct ir3_instruction { + struct ir3_block *block; + opc_t opc; + enum { + /* (sy) flag is set on first instruction, and after sample + * instructions (probably just on RAW hazard). + */ + IR3_INSTR_SY = 0x001, + /* (ss) flag is set on first instruction, and first instruction + * to depend on the result of "long" instructions (RAW hazard): + * + * rcp, rsq, log2, exp2, sin, cos, sqrt + * + * It seems to synchronize until all in-flight instructions are + * completed, for example: + * + * rsq hr1.w, hr1.w + * add.f hr2.z, (neg)hr2.z, hc0.y + * mul.f hr2.w, (neg)hr2.y, (neg)hr2.y + * rsq hr2.x, hr2.x + * (rpt1)nop + * mad.f16 hr2.w, hr2.z, hr2.z, hr2.w + * nop + * mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w + * (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w + * (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x + * + * The last mul.f does not have (ss) set, presumably because the + * (ss) on the previous instruction does the job. + * + * The blob driver also seems to set it on WAR hazards, although + * not really clear if this is needed or just blob compiler being + * sloppy. 
So far I haven't found a case where removing the (ss) + * causes problems for WAR hazard, but I could just be getting + * lucky: + * + * rcp r1.y, r3.y + * (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z + * + */ + IR3_INSTR_SS = 0x002, + /* (jp) flag is set on jump targets: + */ + IR3_INSTR_JP = 0x004, + IR3_INSTR_UL = 0x008, + IR3_INSTR_3D = 0x010, + IR3_INSTR_A = 0x020, + IR3_INSTR_O = 0x040, + IR3_INSTR_P = 0x080, + IR3_INSTR_S = 0x100, + IR3_INSTR_S2EN = 0x200, + IR3_INSTR_G = 0x400, + IR3_INSTR_SAT = 0x800, + /* meta-flags, for intermediate stages of IR, ie. + * before register assignment is done: + */ + IR3_INSTR_MARK = 0x1000, + IR3_INSTR_UNUSED= 0x2000, + } flags; + int repeat; +#ifdef DEBUG + unsigned regs_max; +#endif + unsigned regs_count; + struct ir3_register **regs; + union { + struct { + char inv; + char comp; + int immed; + struct ir3_block *target; + } cat0; + struct { + type_t src_type, dst_type; + } cat1; + struct { + enum { + IR3_COND_LT = 0, + IR3_COND_LE = 1, + IR3_COND_GT = 2, + IR3_COND_GE = 3, + IR3_COND_EQ = 4, + IR3_COND_NE = 5, + } condition; + } cat2; + struct { + unsigned samp, tex; + type_t type; + } cat5; + struct { + type_t type; + int src_offset; + int dst_offset; + int iim_val : 3; /* for ldgb/stgb, # of components */ + int d : 3; + bool typed : 1; + } cat6; + struct { + unsigned w : 1; /* write */ + unsigned r : 1; /* read */ + unsigned l : 1; /* local */ + unsigned g : 1; /* global */ + } cat7; + /* for meta-instructions, just used to hold extra data + * before instruction scheduling, etc + */ + struct { + int off; /* component/offset */ + } fo; + struct { + struct ir3_block *block; + } inout; + }; + + /* transient values used during various algorithms: */ + union { + /* The instruction depth is the max dependency distance to output. + * + * You can also think of it as the "cost", if we did any sort of + * optimization for register footprint. Ie. a value that is just + * result of moving a const to a reg would have a low cost, so + * it could make sense to duplicate the instruction at various + * points where the result is needed to reduce register footprint. + */ + unsigned depth; + /* When we get to the RA stage, we no longer need depth, but + * we do need instruction's position/name: + */ + struct { + uint16_t ip; + uint16_t name; + }; + }; + + /* used for per-pass extra instruction data. + */ + void *data; + + /* Used during CP and RA stages. For fanin and shader inputs/ + * outputs where we need a sequence of consecutive registers, + * keep track of each src instruction's left (ie 'n-1') and right + * (ie 'n+1') neighbor. The front-end must insert enough mov's + * to ensure that each instruction has at most one left and at + * most one right neighbor. During the copy-propagation pass, + * we only remove mov's when we can preserve this constraint. + * And during the RA stage, we use the neighbor information to + * allocate a block of registers in one shot. + * + * TODO: maybe just add something like: + * struct ir3_instruction_ref { + * struct ir3_instruction *instr; + * unsigned cnt; + * } + * + * Or can we get away without the refcnt stuff? It seems like + * it should be overkill.. the problem is if, potentially after + * already eliminating some mov's, you have a single mov that + * needs to be grouped with its neighbors in two different + * places (ex. shader output and a fanin). 
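+ * As a sketch of the neighbor scheme itself: a fanin collecting a + * vec4 into r0.x..r0.w links its four src instructions through + * cp.left/cp.right, and RA then allocates those four registers as + * one contiguous block.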
+ */ + struct { + struct ir3_instruction *left, *right; + uint16_t left_cnt, right_cnt; + } cp; + + /* an instruction can reference at most one address register amongst + * its src/dst registers. Beyond that, you need to insert mov's. + * + * NOTE: do not write this directly, use ir3_instr_set_address() + */ + struct ir3_instruction *address; + + /* Tracking for additional dependent instructions. Used to handle + * barriers, WAR hazards for arrays/SSBOs/etc. + */ + DECLARE_ARRAY(struct ir3_instruction *, deps); + + /* + * From PoV of instruction scheduling, not execution (ie. ignores global/ + * local distinction): + * shared image atomic SSBO everything + * barrier()/ - R/W R/W R/W R/W X + * groupMemoryBarrier() + * memoryBarrier() - R/W R/W + * (but only images declared coherent?) + * memoryBarrierAtomic() - R/W + * memoryBarrierBuffer() - R/W + * memoryBarrierImage() - R/W + * memoryBarrierShared() - R/W + * + * TODO I think for SSBO/image/shared, in cases where we can determine + * which variable is accessed, we don't need to care about accesses to + * different variables (unless declared coherent??) + */ + enum { + IR3_BARRIER_EVERYTHING = 1 << 0, + IR3_BARRIER_SHARED_R = 1 << 1, + IR3_BARRIER_SHARED_W = 1 << 2, + IR3_BARRIER_IMAGE_R = 1 << 3, + IR3_BARRIER_IMAGE_W = 1 << 4, + IR3_BARRIER_BUFFER_R = 1 << 5, + IR3_BARRIER_BUFFER_W = 1 << 6, + IR3_BARRIER_ARRAY_R = 1 << 7, + IR3_BARRIER_ARRAY_W = 1 << 8, + } barrier_class, barrier_conflict; + + /* Entry in ir3_block's instruction list: */ + struct list_head node; + + int use_count; /* currently just updated/used by cp */ + +#ifdef DEBUG + uint32_t serialno; +#endif +}; + +static inline struct ir3_instruction * +ir3_neighbor_first(struct ir3_instruction *instr) +{ + int cnt = 0; + while (instr->cp.left) { + instr = instr->cp.left; + if (++cnt > 0xffff) { + debug_assert(0); + break; + } + } + return instr; +} + +static inline int ir3_neighbor_count(struct ir3_instruction *instr) +{ + int num = 1; + + debug_assert(!instr->cp.left); + + while (instr->cp.right) { + num++; + instr = instr->cp.right; + if (num > 0xffff) { + debug_assert(0); + break; + } + } + + return num; +} + +struct ir3 { + struct ir3_compiler *compiler; + + unsigned ninputs, noutputs; + struct ir3_instruction **inputs; + struct ir3_instruction **outputs; + + /* Track bary.f (and ldlv) instructions.. this is needed in + * scheduling to ensure that all varying fetches happen before + * any potential kill instructions. The hw gets grumpy if all + * threads in a group are killed before the last bary.f gets + * a chance to signal end of input (ei). + */ + DECLARE_ARRAY(struct ir3_instruction *, baryfs); + + /* Track all indirect instructions (read and write). To avoid + * deadlock scenario where an address register gets scheduled, + * but other dependent src instructions cannot be scheduled due + * to dependency on a *different* address register value, the + * scheduler needs to ensure that all dependencies other than + * the address register are scheduled before the one that writes + * the address register. Having a + * convenient list of instructions that reference some address + * register simplifies this. 
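+ * (Concretely: if instrs A and B indirect off different a0.x values, + * scheduling the write of A's a0.x value too early could strand B's + * not-yet-scheduled srcs behind it; a hypothetical two-instruction + * case of the deadlock described above.)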
+ */ + DECLARE_ARRAY(struct ir3_instruction *, indirects); + + /* and same for instructions that consume predicate register: */ + DECLARE_ARRAY(struct ir3_instruction *, predicates); + + /* Track texture sample instructions which need texture state + * patched in (for astc-srgb workaround): + */ + DECLARE_ARRAY(struct ir3_instruction *, astc_srgb); + + /* List of blocks: */ + struct list_head block_list; + + /* List of ir3_array's: */ + struct list_head array_list; + +#ifdef DEBUG + unsigned block_count, instr_count; +#endif +}; + +struct ir3_array { + struct list_head node; + unsigned length; + unsigned id; + + struct nir_register *r; + + /* To avoid array writes from getting DCE'd, keep track of the + * most recent write. Any array access depends on the most + * recent write. This way, nothing depends on writes after the + * last read. But all the writes that happen before that have + * something depending on them. + */ + struct ir3_instruction *last_write; + + /* extra stuff used in RA pass: */ + unsigned base; /* base vreg name */ + unsigned reg; /* base physical reg */ + uint16_t start_ip, end_ip; +}; + +struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id); + +struct ir3_block { + struct list_head node; + struct ir3 *shader; + + const struct nir_block *nblock; + + struct list_head instr_list; /* list of ir3_instruction */ + + /* each block has either one or two successors.. in case of + * two successors, 'condition' decides which one to follow. + * A block preceding an if/else has two successors. + */ + struct ir3_instruction *condition; + struct ir3_block *successors[2]; + + unsigned predecessors_count; + struct ir3_block **predecessors; + + uint16_t start_ip, end_ip; + + /* Track instructions which do not write a register but otherwise + * must not be discarded (such as kill, stg, etc) + */ + DECLARE_ARRAY(struct ir3_instruction *, keeps); + + /* used for per-pass extra block data. Mainly used right + * now in RA step to track livein/liveout. 
+ */ + void *data; + +#ifdef DEBUG + uint32_t serialno; +#endif +}; + +static inline uint32_t +block_id(struct ir3_block *block) +{ +#ifdef DEBUG + return block->serialno; +#else + return (uint32_t)(unsigned long)block; +#endif +} + +struct ir3 * ir3_create(struct ir3_compiler *compiler, + unsigned nin, unsigned nout); +void ir3_destroy(struct ir3 *shader); +void * ir3_assemble(struct ir3 *shader, + struct ir3_info *info, uint32_t gpu_id); +void * ir3_alloc(struct ir3 *shader, int sz); + +struct ir3_block * ir3_block_create(struct ir3 *shader); + +struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc); +struct ir3_instruction * ir3_instr_create2(struct ir3_block *block, + opc_t opc, int nreg); +struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr); +void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep); +const char *ir3_instr_name(struct ir3_instruction *instr); + +struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, + int num, int flags); +struct ir3_register * ir3_reg_clone(struct ir3 *shader, + struct ir3_register *reg); + +void ir3_instr_set_address(struct ir3_instruction *instr, + struct ir3_instruction *addr); + +static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) +{ + if (instr->flags & IR3_INSTR_MARK) + return true; /* already visited */ + instr->flags |= IR3_INSTR_MARK; + return false; +} + +void ir3_block_clear_mark(struct ir3_block *block); +void ir3_clear_mark(struct ir3 *shader); + +unsigned ir3_count_instructions(struct ir3 *ir); + +static inline int ir3_instr_regno(struct ir3_instruction *instr, + struct ir3_register *reg) +{ + unsigned i; + for (i = 0; i < instr->regs_count; i++) + if (reg == instr->regs[i]) + return i; + return -1; +} + + +#define MAX_ARRAYS 16 + +/* comp: + * 0 - x + * 1 - y + * 2 - z + * 3 - w + */ +static inline uint32_t regid(int num, int comp) +{ + return (num << 2) | (comp & 0x3); +} + +static inline uint32_t reg_num(struct ir3_register *reg) +{ + return reg->num >> 2; +} + +static inline uint32_t reg_comp(struct ir3_register *reg) +{ + return reg->num & 0x3; +} + +static inline bool is_flow(struct ir3_instruction *instr) +{ + return (opc_cat(instr->opc) == 0); +} + +static inline bool is_kill(struct ir3_instruction *instr) +{ + return instr->opc == OPC_KILL; +} + +static inline bool is_nop(struct ir3_instruction *instr) +{ + return instr->opc == OPC_NOP; +} + +/* Is it a non-transformative (ie. not type changing) mov? This can + * also include absneg.s/absneg.f, which for the most part can be + * treated as a mov (single src argument). 
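+ * (E.g. absneg.f is in effect a single-src mov whose src may carry an + * abs/neg modifier, which is why it is accepted below as long as the + * (sat) flag is not set.)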
+ */ +static inline bool is_same_type_mov(struct ir3_instruction *instr) +{ + struct ir3_register *dst; + + switch (instr->opc) { + case OPC_MOV: + if (instr->cat1.src_type != instr->cat1.dst_type) + return false; + break; + case OPC_ABSNEG_F: + case OPC_ABSNEG_S: + if (instr->flags & IR3_INSTR_SAT) + return false; + break; + default: + return false; + } + + dst = instr->regs[0]; + + /* mov's that write to a0.x or p0.x are special: */ + if (dst->num == regid(REG_P0, 0)) + return false; + if (dst->num == regid(REG_A0, 0)) + return false; + + if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY)) + return false; + + return true; +} + +static inline bool is_alu(struct ir3_instruction *instr) +{ + return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3); +} + +static inline bool is_sfu(struct ir3_instruction *instr) +{ + return (opc_cat(instr->opc) == 4); +} + +static inline bool is_tex(struct ir3_instruction *instr) +{ + return (opc_cat(instr->opc) == 5); +} + +static inline bool is_mem(struct ir3_instruction *instr) +{ + return (opc_cat(instr->opc) == 6); +} + +static inline bool is_barrier(struct ir3_instruction *instr) +{ + return (opc_cat(instr->opc) == 7); +} + +static inline bool +is_store(struct ir3_instruction *instr) +{ + /* these instructions, the "destination" register is + * actually a source, the address to store to. + */ + switch (instr->opc) { + case OPC_STG: + case OPC_STGB: + case OPC_STIB: + case OPC_STP: + case OPC_STL: + case OPC_STLW: + case OPC_L2G: + case OPC_G2L: + return true; + default: + return false; + } +} + +static inline bool is_load(struct ir3_instruction *instr) +{ + switch (instr->opc) { + case OPC_LDG: + case OPC_LDGB: + case OPC_LDL: + case OPC_LDP: + case OPC_L2G: + case OPC_LDLW: + case OPC_LDC: + case OPC_LDLV: + /* probably some others too.. */ + return true; + default: + return false; + } +} + +static inline bool is_input(struct ir3_instruction *instr) +{ + /* in some cases, ldlv is used to fetch varying without + * interpolation.. fortunately inloc is the first src + * register in either case + */ + switch (instr->opc) { + case OPC_LDLV: + case OPC_BARY_F: + return true; + default: + return false; + } +} + +static inline bool is_bool(struct ir3_instruction *instr) +{ + switch (instr->opc) { + case OPC_CMPS_F: + case OPC_CMPS_S: + case OPC_CMPS_U: + return true; + default: + return false; + } +} + +static inline bool is_meta(struct ir3_instruction *instr) +{ + /* TODO how should we count PHI (and maybe fan-in/out) which + * might actually contribute some instructions to the final + * result? 
+ */ + return (opc_cat(instr->opc) == -1); +} + +static inline bool writes_addr(struct ir3_instruction *instr) +{ + if (instr->regs_count > 0) { + struct ir3_register *dst = instr->regs[0]; + return reg_num(dst) == REG_A0; + } + return false; +} + +static inline bool writes_pred(struct ir3_instruction *instr) +{ + if (instr->regs_count > 0) { + struct ir3_register *dst = instr->regs[0]; + return reg_num(dst) == REG_P0; + } + return false; +} + +/* returns defining instruction for reg */ +/* TODO better name */ +static inline struct ir3_instruction *ssa(struct ir3_register *reg) +{ + if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) { + return reg->instr; + } + return NULL; +} + +static inline bool conflicts(struct ir3_instruction *a, + struct ir3_instruction *b) +{ + return (a && b) && (a != b); +} + +static inline bool reg_gpr(struct ir3_register *r) +{ + if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED)) + return false; + if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0)) + return false; + return true; +} + +static inline type_t half_type(type_t type) +{ + switch (type) { + case TYPE_F32: return TYPE_F16; + case TYPE_U32: return TYPE_U16; + case TYPE_S32: return TYPE_S16; + case TYPE_F16: + case TYPE_U16: + case TYPE_S16: + return type; + default: + assert(0); + return ~0; + } +} + +/* some cat2 instructions (ie. those which are not float) can embed an + * immediate: + */ +static inline bool ir3_cat2_int(opc_t opc) +{ + switch (opc) { + case OPC_ADD_U: + case OPC_ADD_S: + case OPC_SUB_U: + case OPC_SUB_S: + case OPC_CMPS_U: + case OPC_CMPS_S: + case OPC_MIN_U: + case OPC_MIN_S: + case OPC_MAX_U: + case OPC_MAX_S: + case OPC_CMPV_U: + case OPC_CMPV_S: + case OPC_MUL_U: + case OPC_MUL_S: + case OPC_MULL_U: + case OPC_CLZ_S: + case OPC_ABSNEG_S: + case OPC_AND_B: + case OPC_OR_B: + case OPC_NOT_B: + case OPC_XOR_B: + case OPC_BFREV_B: + case OPC_CLZ_B: + case OPC_SHL_B: + case OPC_SHR_B: + case OPC_ASHR_B: + case OPC_MGEN_B: + case OPC_GETBIT_B: + case OPC_CBITS_B: + case OPC_BARY_F: + return true; + + default: + return false; + } +} + + +/* map cat2 instruction to valid abs/neg flags: */ +static inline unsigned ir3_cat2_absneg(opc_t opc) +{ + switch (opc) { + case OPC_ADD_F: + case OPC_MIN_F: + case OPC_MAX_F: + case OPC_MUL_F: + case OPC_SIGN_F: + case OPC_CMPS_F: + case OPC_ABSNEG_F: + case OPC_CMPV_F: + case OPC_FLOOR_F: + case OPC_CEIL_F: + case OPC_RNDNE_F: + case OPC_RNDAZ_F: + case OPC_TRUNC_F: + case OPC_BARY_F: + return IR3_REG_FABS | IR3_REG_FNEG; + + case OPC_ADD_U: + case OPC_ADD_S: + case OPC_SUB_U: + case OPC_SUB_S: + case OPC_CMPS_U: + case OPC_CMPS_S: + case OPC_MIN_U: + case OPC_MIN_S: + case OPC_MAX_U: + case OPC_MAX_S: + case OPC_CMPV_U: + case OPC_CMPV_S: + case OPC_MUL_U: + case OPC_MUL_S: + case OPC_MULL_U: + case OPC_CLZ_S: + return 0; + + case OPC_ABSNEG_S: + return IR3_REG_SABS | IR3_REG_SNEG; + + case OPC_AND_B: + case OPC_OR_B: + case OPC_NOT_B: + case OPC_XOR_B: + case OPC_BFREV_B: + case OPC_CLZ_B: + case OPC_SHL_B: + case OPC_SHR_B: + case OPC_ASHR_B: + case OPC_MGEN_B: + case OPC_GETBIT_B: + case OPC_CBITS_B: + return IR3_REG_BNOT; + + default: + return 0; + } +} + +/* map cat3 instructions to valid abs/neg flags: */ +static inline unsigned ir3_cat3_absneg(opc_t opc) +{ + switch (opc) { + case OPC_MAD_F16: + case OPC_MAD_F32: + case OPC_SEL_F16: + case OPC_SEL_F32: + return IR3_REG_FNEG; + + case OPC_MAD_U16: + case OPC_MADSH_U16: + case OPC_MAD_S16: + case OPC_MADSH_M16: + case OPC_MAD_U24: + case OPC_MAD_S24: + case OPC_SEL_S16: + case OPC_SEL_S32: + case 
OPC_SAD_S16: + case OPC_SAD_S32: + /* neg *may* work on 3rd src.. */ + + case OPC_SEL_B16: + case OPC_SEL_B32: + + default: + return 0; + } +} + +#define MASK(n) ((1 << (n)) - 1) + +/* iterator for an instruction's sources (reg), also returns src #: */ +#define foreach_src_n(__srcreg, __n, __instr) \ + if ((__instr)->regs_count) \ + for (unsigned __cnt = (__instr)->regs_count - 1, __n = 0; __n < __cnt; __n++) \ + if ((__srcreg = (__instr)->regs[__n + 1])) + +/* iterator for an instruction's sources (reg): */ +#define foreach_src(__srcreg, __instr) \ + foreach_src_n(__srcreg, __i, __instr) + +static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr) +{ + unsigned cnt = instr->regs_count + instr->deps_count; + if (instr->address) + cnt++; + return cnt; +} + +static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n) +{ + if (n == (instr->regs_count + instr->deps_count)) + return instr->address; + if (n >= instr->regs_count) + return instr->deps[n - instr->regs_count]; + return ssa(instr->regs[n]); +} + +static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n) +{ + if (n == (instr->regs_count + instr->deps_count)) + return false; + if (n >= instr->regs_count) + return true; + return false; +} + +#define __src_cnt(__instr) ((__instr)->address ? (__instr)->regs_count : (__instr)->regs_count - 1) + +/* iterator for an instruction's SSA sources (instr), also returns src #: */ +#define foreach_ssa_src_n(__srcinst, __n, __instr) \ + for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \ + if ((__srcinst = __ssa_src_n(__instr, __n))) + +/* iterator for an instruction's SSA sources (instr): */ +#define foreach_ssa_src(__srcinst, __instr) \ + foreach_ssa_src_n(__srcinst, __i, __instr) + + +/* dump: */ +void ir3_print(struct ir3 *ir); +void ir3_print_instr(struct ir3_instruction *instr); + +/* depth calculation: */ +int ir3_delayslots(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned n); +void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list); +void ir3_depth(struct ir3 *ir); + +/* copy-propagate: */ +struct ir3_shader_variant; +void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so); + +/* group neighbors and insert mov's to resolve conflicts: */ +void ir3_group(struct ir3 *ir); + +/* scheduling: */ +void ir3_sched_add_deps(struct ir3 *ir); +int ir3_sched(struct ir3 *ir); + +/* register assignment: */ +struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler); +int ir3_ra(struct ir3 *ir3, gl_shader_stage type, + bool frag_coord, bool frag_face); + +/* legalize: */ +void ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary); + +/* ************************************************************************* */ +/* instruction helpers */ + +/* creates SSA src of correct type (ie. 
half vs full precision) */ +static inline struct ir3_register * __ssa_src(struct ir3_instruction *instr, + struct ir3_instruction *src, unsigned flags) +{ + struct ir3_register *reg; + if (src->regs[0]->flags & IR3_REG_HALF) + flags |= IR3_REG_HALF; + reg = ir3_reg_create(instr, 0, IR3_REG_SSA | flags); + reg->instr = src; + return reg; +} + +static inline struct ir3_instruction * +ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type) +{ + struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV); + ir3_reg_create(instr, 0, 0); /* dst */ + if (src->regs[0]->flags & IR3_REG_ARRAY) { + struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY); + src_reg->array = src->regs[0]->array; + } else { + __ssa_src(instr, src, 0); + } + debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV)); + instr->cat1.src_type = type; + instr->cat1.dst_type = type; + return instr; +} + +static inline struct ir3_instruction * +ir3_COV(struct ir3_block *block, struct ir3_instruction *src, + type_t src_type, type_t dst_type) +{ + struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV); + unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0; + unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0; + + debug_assert((src->regs[0]->flags & IR3_REG_HALF) == src_flags); + + ir3_reg_create(instr, 0, dst_flags); /* dst */ + __ssa_src(instr, src, 0); + instr->cat1.src_type = src_type; + instr->cat1.dst_type = dst_type; + debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY)); + return instr; +} + +static inline struct ir3_instruction * +ir3_NOP(struct ir3_block *block) +{ + return ir3_instr_create(block, OPC_NOP); +} + +#define INSTR0(name) \ +static inline struct ir3_instruction * \ +ir3_##name(struct ir3_block *block) \ +{ \ + struct ir3_instruction *instr = \ + ir3_instr_create(block, OPC_##name); \ + return instr; \ +} + +#define INSTR1(name) \ +static inline struct ir3_instruction * \ +ir3_##name(struct ir3_block *block, \ + struct ir3_instruction *a, unsigned aflags) \ +{ \ + struct ir3_instruction *instr = \ + ir3_instr_create(block, OPC_##name); \ + ir3_reg_create(instr, 0, 0); /* dst */ \ + __ssa_src(instr, a, aflags); \ + return instr; \ +} + +#define INSTR2(name) \ +static inline struct ir3_instruction * \ +ir3_##name(struct ir3_block *block, \ + struct ir3_instruction *a, unsigned aflags, \ + struct ir3_instruction *b, unsigned bflags) \ +{ \ + struct ir3_instruction *instr = \ + ir3_instr_create(block, OPC_##name); \ + ir3_reg_create(instr, 0, 0); /* dst */ \ + __ssa_src(instr, a, aflags); \ + __ssa_src(instr, b, bflags); \ + return instr; \ +} + +#define INSTR3(name) \ +static inline struct ir3_instruction * \ +ir3_##name(struct ir3_block *block, \ + struct ir3_instruction *a, unsigned aflags, \ + struct ir3_instruction *b, unsigned bflags, \ + struct ir3_instruction *c, unsigned cflags) \ +{ \ + struct ir3_instruction *instr = \ + ir3_instr_create(block, OPC_##name); \ + ir3_reg_create(instr, 0, 0); /* dst */ \ + __ssa_src(instr, a, aflags); \ + __ssa_src(instr, b, bflags); \ + __ssa_src(instr, c, cflags); \ + return instr; \ +} + +#define INSTR4(name) \ +static inline struct ir3_instruction * \ +ir3_##name(struct ir3_block *block, \ + struct ir3_instruction *a, unsigned aflags, \ + struct ir3_instruction *b, unsigned bflags, \ + struct ir3_instruction *c, unsigned cflags, \ + struct ir3_instruction *d, unsigned dflags) \ +{ \ + struct ir3_instruction *instr = \ + ir3_instr_create2(block, OPC_##name, 5); \ + ir3_reg_create(instr, 0, 0); /* 
dst */ \ + __ssa_src(instr, a, aflags); \ + __ssa_src(instr, b, bflags); \ + __ssa_src(instr, c, cflags); \ + __ssa_src(instr, d, dflags); \ + return instr; \ +} + +#define INSTR4F(f, name) \ +static inline struct ir3_instruction * \ +ir3_##name##_##f(struct ir3_block *block, \ + struct ir3_instruction *a, unsigned aflags, \ + struct ir3_instruction *b, unsigned bflags, \ + struct ir3_instruction *c, unsigned cflags, \ + struct ir3_instruction *d, unsigned dflags) \ +{ \ + struct ir3_instruction *instr = \ + ir3_instr_create2(block, OPC_##name, 5); \ + ir3_reg_create(instr, 0, 0); /* dst */ \ + __ssa_src(instr, a, aflags); \ + __ssa_src(instr, b, bflags); \ + __ssa_src(instr, c, cflags); \ + __ssa_src(instr, d, dflags); \ + instr->flags |= IR3_INSTR_##f; \ + return instr; \ +} + +/* cat0 instructions: */ +INSTR0(BR) +INSTR0(JUMP) +INSTR1(KILL) +INSTR0(END) + +/* cat2 instructions, most 2 src but some 1 src: */ +INSTR2(ADD_F) +INSTR2(MIN_F) +INSTR2(MAX_F) +INSTR2(MUL_F) +INSTR1(SIGN_F) +INSTR2(CMPS_F) +INSTR1(ABSNEG_F) +INSTR2(CMPV_F) +INSTR1(FLOOR_F) +INSTR1(CEIL_F) +INSTR1(RNDNE_F) +INSTR1(RNDAZ_F) +INSTR1(TRUNC_F) +INSTR2(ADD_U) +INSTR2(ADD_S) +INSTR2(SUB_U) +INSTR2(SUB_S) +INSTR2(CMPS_U) +INSTR2(CMPS_S) +INSTR2(MIN_U) +INSTR2(MIN_S) +INSTR2(MAX_U) +INSTR2(MAX_S) +INSTR1(ABSNEG_S) +INSTR2(AND_B) +INSTR2(OR_B) +INSTR1(NOT_B) +INSTR2(XOR_B) +INSTR2(CMPV_U) +INSTR2(CMPV_S) +INSTR2(MUL_U) +INSTR2(MUL_S) +INSTR2(MULL_U) +INSTR1(BFREV_B) +INSTR1(CLZ_S) +INSTR1(CLZ_B) +INSTR2(SHL_B) +INSTR2(SHR_B) +INSTR2(ASHR_B) +INSTR2(BARY_F) +INSTR2(MGEN_B) +INSTR2(GETBIT_B) +INSTR1(SETRM) +INSTR1(CBITS_B) +INSTR2(SHB) +INSTR2(MSAD) + +/* cat3 instructions: */ +INSTR3(MAD_U16) +INSTR3(MADSH_U16) +INSTR3(MAD_S16) +INSTR3(MADSH_M16) +INSTR3(MAD_U24) +INSTR3(MAD_S24) +INSTR3(MAD_F16) +INSTR3(MAD_F32) +INSTR3(SEL_B16) +INSTR3(SEL_B32) +INSTR3(SEL_S16) +INSTR3(SEL_S32) +INSTR3(SEL_F16) +INSTR3(SEL_F32) +INSTR3(SAD_S16) +INSTR3(SAD_S32) + +/* cat4 instructions: */ +INSTR1(RCP) +INSTR1(RSQ) +INSTR1(LOG2) +INSTR1(EXP2) +INSTR1(SIN) +INSTR1(COS) +INSTR1(SQRT) + +/* cat5 instructions: */ +INSTR1(DSX) +INSTR1(DSY) + +static inline struct ir3_instruction * +ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, + unsigned wrmask, unsigned flags, unsigned samp, unsigned tex, + struct ir3_instruction *src0, struct ir3_instruction *src1) +{ + struct ir3_instruction *sam; + struct ir3_register *reg; + + sam = ir3_instr_create(block, opc); + sam->flags |= flags; + ir3_reg_create(sam, 0, 0)->wrmask = wrmask; + if (src0) { + reg = ir3_reg_create(sam, 0, IR3_REG_SSA); + reg->wrmask = (1 << (src0->regs_count - 1)) - 1; + reg->instr = src0; + } + if (src1) { + reg = ir3_reg_create(sam, 0, IR3_REG_SSA); + reg->instr = src1; + reg->wrmask = (1 << (src1->regs_count - 1)) - 1; + } + sam->cat5.samp = samp; + sam->cat5.tex = tex; + sam->cat5.type = type; + + return sam; +} + +/* cat6 instructions: */ +INSTR2(LDLV) +INSTR2(LDG) +INSTR2(LDL) +INSTR3(STG) +INSTR3(STL) +INSTR3(LDGB) +INSTR4(STGB) +INSTR4(STIB) +INSTR1(RESINFO) +INSTR1(RESFMT) +INSTR2(ATOMIC_ADD) +INSTR2(ATOMIC_SUB) +INSTR2(ATOMIC_XCHG) +INSTR2(ATOMIC_INC) +INSTR2(ATOMIC_DEC) +INSTR2(ATOMIC_CMPXCHG) +INSTR2(ATOMIC_MIN) +INSTR2(ATOMIC_MAX) +INSTR2(ATOMIC_AND) +INSTR2(ATOMIC_OR) +INSTR2(ATOMIC_XOR) +INSTR4F(G, ATOMIC_ADD) +INSTR4F(G, ATOMIC_SUB) +INSTR4F(G, ATOMIC_XCHG) +INSTR4F(G, ATOMIC_INC) +INSTR4F(G, ATOMIC_DEC) +INSTR4F(G, ATOMIC_CMPXCHG) +INSTR4F(G, ATOMIC_MIN) +INSTR4F(G, ATOMIC_MAX) +INSTR4F(G, ATOMIC_AND) +INSTR4F(G, ATOMIC_OR) +INSTR4F(G, ATOMIC_XOR) + +/* cat7 
instructions: */ +INSTR0(BAR) +INSTR0(FENCE) + +/* ************************************************************************* */ +/* split this out or find some helper to use.. like main/bitset.h.. */ + +#include <string.h> + +#define MAX_REG 256 + +typedef uint8_t regmask_t[2 * MAX_REG / 8]; + +static inline unsigned regmask_idx(struct ir3_register *reg) +{ + unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num; + debug_assert(num < MAX_REG); + if (reg->flags & IR3_REG_HALF) + num += MAX_REG; + return num; +} + +static inline void regmask_init(regmask_t *regmask) +{ + memset(regmask, 0, sizeof(*regmask)); +} + +static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg) +{ + unsigned idx = regmask_idx(reg); + if (reg->flags & IR3_REG_RELATIV) { + unsigned i; + for (i = 0; i < reg->size; i++, idx++) + (*regmask)[idx / 8] |= 1 << (idx % 8); + } else { + unsigned mask; + for (mask = reg->wrmask; mask; mask >>= 1, idx++) + if (mask & 1) + (*regmask)[idx / 8] |= 1 << (idx % 8); + } +} + +static inline void regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b) +{ + unsigned i; + for (i = 0; i < ARRAY_SIZE(*dst); i++) + (*dst)[i] = (*a)[i] | (*b)[i]; +} + +/* set bits in a if not set in b, conceptually: + * a |= (reg & ~b) + */ +static inline void regmask_set_if_not(regmask_t *a, + struct ir3_register *reg, regmask_t *b) +{ + unsigned idx = regmask_idx(reg); + if (reg->flags & IR3_REG_RELATIV) { + unsigned i; + for (i = 0; i < reg->size; i++, idx++) + if (!((*b)[idx / 8] & (1 << (idx % 8)))) + (*a)[idx / 8] |= 1 << (idx % 8); + } else { + unsigned mask; + for (mask = reg->wrmask; mask; mask >>= 1, idx++) + if (mask & 1) + if (!((*b)[idx / 8] & (1 << (idx % 8)))) + (*a)[idx / 8] |= 1 << (idx % 8); + } +} + +static inline bool regmask_get(regmask_t *regmask, + struct ir3_register *reg) +{ + unsigned idx = regmask_idx(reg); + if (reg->flags & IR3_REG_RELATIV) { + unsigned i; + for (i = 0; i < reg->size; i++, idx++) + if ((*regmask)[idx / 8] & (1 << (idx % 8))) + return true; + } else { + unsigned mask; + for (mask = reg->wrmask; mask; mask >>= 1, idx++) + if (mask & 1) + if ((*regmask)[idx / 8] & (1 << (idx % 8))) + return true; + } + return false; +} + +/* ************************************************************************* */ + +#endif /* IR3_H_ */ diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c new file mode 100644 index 00000000000..f00daebabf5 --- /dev/null +++ b/src/freedreno/ir3/ir3_compiler.c @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2015 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include "util/ralloc.h" + +#include "ir3_compiler.h" + +static const struct debug_named_value shader_debug_options[] = { + {"vs", IR3_DBG_SHADER_VS, "Print shader disasm for vertex shaders"}, + {"fs", IR3_DBG_SHADER_FS, "Print shader disasm for fragment shaders"}, + {"cs", IR3_DBG_SHADER_CS, "Print shader disasm for compute shaders"}, + {"disasm", IR3_DBG_DISASM, "Dump NIR and adreno shader disassembly"}, + {"optmsgs", IR3_DBG_OPTMSGS,"Enable optimizer debug messages"}, + DEBUG_NAMED_VALUE_END +}; + +DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG", shader_debug_options, 0) + +enum ir3_shader_debug ir3_shader_debug = 0; + +struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id) +{ + struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler); + + ir3_shader_debug = debug_get_option_ir3_shader_debug(); + + compiler->dev = dev; + compiler->gpu_id = gpu_id; + compiler->set = ir3_ra_alloc_reg_set(compiler); + + if (compiler->gpu_id >= 400) { + /* need special handling for "flat" */ + compiler->flat_bypass = true; + compiler->levels_add_one = false; + compiler->unminify_coords = false; + compiler->txf_ms_with_isaml = false; + compiler->array_index_add_half = true; + } else { + /* no special handling for "flat" */ + compiler->flat_bypass = false; + compiler->levels_add_one = true; + compiler->unminify_coords = true; + compiler->txf_ms_with_isaml = true; + compiler->array_index_add_half = false; + } + + return compiler; +} diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h new file mode 100644 index 00000000000..e2336062b29 --- /dev/null +++ b/src/freedreno/ir3/ir3_compiler.h @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Rob Clark <[email protected]> + */ + +#ifndef IR3_COMPILER_H_ +#define IR3_COMPILER_H_ + +#include "ir3_shader.h" + +struct ir3_ra_reg_set; + +struct ir3_compiler { + struct fd_device *dev; + uint32_t gpu_id; + struct ir3_ra_reg_set *set; + uint32_t shader_count; + + /* + * Configuration options for things that are handled differently on + * different generations: + */ + + /* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate + * so we need to use ldlv.u32 to load the varying directly: + */ + bool flat_bypass; + + /* on a3xx, we need to add one to # of array levels: + */ + bool levels_add_one; + + /* on a3xx, we need to scale up integer coords for isaml based + * on LoD: + */ + bool unminify_coords; + + /* on a3xx do txf_ms w/ isaml and scaled coords: */ + bool txf_ms_with_isaml; + + /* on a4xx, for array textures we need to add 0.5 to the array + * index coordinate: + */ + bool array_index_add_half; +}; + +struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id); + +int ir3_compile_shader_nir(struct ir3_compiler *compiler, + struct ir3_shader_variant *so); + +enum ir3_shader_debug { + IR3_DBG_SHADER_VS = 0x01, + IR3_DBG_SHADER_FS = 0x02, + IR3_DBG_SHADER_CS = 0x04, + IR3_DBG_DISASM = 0x08, + IR3_DBG_OPTMSGS = 0x10, +}; + +extern enum ir3_shader_debug ir3_shader_debug; + +static inline bool +shader_debug_enabled(gl_shader_stage type) +{ + switch (type) { + case MESA_SHADER_VERTEX: return !!(ir3_shader_debug & IR3_DBG_SHADER_VS); + case MESA_SHADER_FRAGMENT: return !!(ir3_shader_debug & IR3_DBG_SHADER_FS); + case MESA_SHADER_COMPUTE: return !!(ir3_shader_debug & IR3_DBG_SHADER_CS); + default: + debug_assert(0); + return false; + } +} + +#endif /* IR3_COMPILER_H_ */ diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c new file mode 100644 index 00000000000..445a2b291e9 --- /dev/null +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -0,0 +1,3818 @@ +/* + * Copyright (C) 2015 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include <stdarg.h> + +#include "util/u_string.h" +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "ir3_compiler.h" +#include "ir3_shader.h" +#include "ir3_nir.h" + +#include "instr-a3xx.h" +#include "ir3.h" + +/* for conditionally setting boolean flag(s): */ +#define COND(bool, val) ((bool) ? 
(val) : 0) + +#define DBG(fmt, ...) \ + do { debug_printf("%s:%d: "fmt "\n", \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) + +struct ir3_context { + struct ir3_compiler *compiler; + + struct nir_shader *s; + + struct nir_instr *cur_instr; /* current instruction, just for debug */ + + struct ir3 *ir; + struct ir3_shader_variant *so; + + struct ir3_block *block; /* the current block */ + struct ir3_block *in_block; /* block created for shader inputs */ + + nir_function_impl *impl; + + /* For fragment shaders, varyings are not actual shader inputs, + * instead the hw passes a varying-coord which is used with + * bary.f. + * + * But NIR doesn't know that, it still declares varyings as + * inputs. So we do all the input tracking normally and fix + * things up after compile_instructions() + * + * NOTE that frag_vcoord is the hardware position (possibly it + * is actually an index or tag or some such.. it is *not* + * values that can be directly used for gl_FragCoord..) + */ + struct ir3_instruction *frag_vcoord; + + /* for fragment shaders, for gl_FrontFacing and gl_FragCoord: */ + struct ir3_instruction *frag_face, *frag_coord; + + /* For vertex shaders, keep track of the system values sources */ + struct ir3_instruction *vertex_id, *basevertex, *instance_id; + + /* For fragment shaders: */ + struct ir3_instruction *samp_id, *samp_mask_in; + + /* Compute shader inputs: */ + struct ir3_instruction *local_invocation_id, *work_group_id; + + /* mapping from nir_register to defining instruction: */ + struct hash_table *def_ht; + + unsigned num_arrays; + + /* a common pattern for indirect addressing is to request the + * same address register multiple times. To avoid generating + * duplicate instruction sequences (which our backend does not + * try to clean up, since that should be done as the NIR stage) + * we cache the address value generated for a given src value: + * + * Note that we have to cache these per alignment, since same + * src used for an array of vec1 cannot be also used for an + * array of vec4. + */ + struct hash_table *addr_ht[4]; + + /* last dst array, for indirect we need to insert a var-store. + */ + struct ir3_instruction **last_dst; + unsigned last_dst_n; + + /* maps nir_block to ir3_block, mostly for the purposes of + * figuring out the blocks successors + */ + struct hash_table *block_ht; + + /* on a4xx, bitmask of samplers which need astc+srgb workaround: */ + unsigned astc_srgb; + + unsigned samples; /* bitmask of x,y sample shifts */ + + unsigned max_texture_index; + + /* set if we encounter something we can't handle yet, so we + * can bail cleanly and fallback to TGSI compiler f/e + */ + bool error; +}; + +/* gpu pointer size in units of 32bit registers/slots */ +static unsigned pointer_size(struct ir3_context *ctx) +{ + return (ctx->compiler->gpu_id >= 500) ? 
2 : 1; +} + +static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val); +static struct ir3_block * get_block(struct ir3_context *ctx, const nir_block *nblock); + + +static struct ir3_context * +compile_init(struct ir3_compiler *compiler, + struct ir3_shader_variant *so) +{ + struct ir3_context *ctx = rzalloc(NULL, struct ir3_context); + + if (compiler->gpu_id >= 400) { + if (so->type == MESA_SHADER_VERTEX) { + ctx->astc_srgb = so->key.vastc_srgb; + } else if (so->type == MESA_SHADER_FRAGMENT) { + ctx->astc_srgb = so->key.fastc_srgb; + } + + } else { + if (so->type == MESA_SHADER_VERTEX) { + ctx->samples = so->key.vsamples; + } else if (so->type == MESA_SHADER_FRAGMENT) { + ctx->samples = so->key.fsamples; + } + } + + ctx->compiler = compiler; + ctx->so = so; + ctx->def_ht = _mesa_hash_table_create(ctx, + _mesa_hash_pointer, _mesa_key_pointer_equal); + ctx->block_ht = _mesa_hash_table_create(ctx, + _mesa_hash_pointer, _mesa_key_pointer_equal); + + /* TODO: maybe generate some sort of bitmask of what key + * lowers vs what shader has (ie. no need to lower + * texture clamp lowering if no texture sample instrs).. + * although should be done further up the stack to avoid + * creating duplicate variants.. + */ + + if (ir3_key_lowers_nir(&so->key)) { + nir_shader *s = nir_shader_clone(ctx, so->shader->nir); + ctx->s = ir3_optimize_nir(so->shader, s, &so->key); + } else { + /* fast-path for shader key that lowers nothing in NIR: */ + ctx->s = so->shader->nir; + } + + /* this needs to be the last pass run, so do this here instead of + * in ir3_optimize_nir(): + */ + NIR_PASS_V(ctx->s, nir_lower_locals_to_regs); + NIR_PASS_V(ctx->s, nir_convert_from_ssa, true); + + if (ir3_shader_debug & IR3_DBG_DISASM) { + printf("dump nir%dv%d: type=%d, k={cts=%u,hp=%u}", + so->shader->id, so->id, so->type, + so->key.color_two_side, so->key.half_precision); + nir_print_shader(ctx->s, stdout); + } + + if (shader_debug_enabled(so->type)) { + fprintf(stderr, "NIR (final form) for %s shader:\n", + _mesa_shader_stage_to_string(so->type)); + nir_print_shader(ctx->s, stderr); + } + + ir3_nir_scan_driver_consts(ctx->s, &so->const_layout); + + so->num_uniforms = ctx->s->num_uniforms; + so->num_ubos = ctx->s->info.num_ubos; + + /* Layout of constant registers, each section aligned to vec4. Note + * that pointer size (ubo, etc) changes depending on generation. + * + * user consts + * UBO addresses + * SSBO sizes + * if (vertex shader) { + * driver params (IR3_DP_*) + * if (stream_output.num_outputs > 0) + * stream-out addresses + * } + * immediates + * + * Immediates go last mostly because they are inserted in the CP pass + * after the nir -> ir3 frontend. 
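+ * + * A worked example (illustrative numbers only, not taken from this + * patch): a vertex shader with num_uniforms=20 (vec4 units), 2 UBOs, + * no SSBOs/images, and 8 driver param dwords on a 64b-pointer gen + * (ptrsz == 2) would end up with: + * + * constoff = align(20, 4) = 20 (user consts in c0-c19) + * constbase.ubo = 20, constoff += align(2 * 2, 4) / 4 = 1 + * constbase.driver_param = 21, constoff += align(8, 4) / 4 = 2 + * constbase.immediate = 23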
+ */ + unsigned constoff = align(ctx->s->num_uniforms, 4); + unsigned ptrsz = pointer_size(ctx); + + memset(&so->constbase, ~0, sizeof(so->constbase)); + + if (so->num_ubos > 0) { + so->constbase.ubo = constoff; + constoff += align(ctx->s->info.num_ubos * ptrsz, 4) / 4; + } + + if (so->const_layout.ssbo_size.count > 0) { + unsigned cnt = so->const_layout.ssbo_size.count; + so->constbase.ssbo_sizes = constoff; + constoff += align(cnt, 4) / 4; + } + + if (so->const_layout.image_dims.count > 0) { + unsigned cnt = so->const_layout.image_dims.count; + so->constbase.image_dims = constoff; + constoff += align(cnt, 4) / 4; + } + + unsigned num_driver_params = 0; + if (so->type == MESA_SHADER_VERTEX) { + num_driver_params = IR3_DP_VS_COUNT; + } else if (so->type == MESA_SHADER_COMPUTE) { + num_driver_params = IR3_DP_CS_COUNT; + } + + so->constbase.driver_param = constoff; + constoff += align(num_driver_params, 4) / 4; + + if ((so->type == MESA_SHADER_VERTEX) && + (compiler->gpu_id < 500) && + so->shader->stream_output.num_outputs > 0) { + so->constbase.tfbo = constoff; + constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4; + } + + so->constbase.immediate = constoff; + + return ctx; +} + +static void +compile_error(struct ir3_context *ctx, const char *format, ...) +{ + struct hash_table *errors = NULL; + va_list ap; + va_start(ap, format); + if (ctx->cur_instr) { + errors = _mesa_hash_table_create(NULL, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + char *msg = ralloc_vasprintf(errors, format, ap); + _mesa_hash_table_insert(errors, ctx->cur_instr, msg); + } else { + _debug_vprintf(format, ap); + } + va_end(ap); + nir_print_shader_annotated(ctx->s, stdout, errors); + ralloc_free(errors); + ctx->error = true; + debug_assert(0); +} + +#define compile_assert(ctx, cond) do { \ + if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \ + } while (0) + +static void +compile_free(struct ir3_context *ctx) +{ + ralloc_free(ctx); +} + +static void +declare_array(struct ir3_context *ctx, nir_register *reg) +{ + struct ir3_array *arr = rzalloc(ctx, struct ir3_array); + arr->id = ++ctx->num_arrays; + /* NOTE: sometimes we get non array regs, for example for arrays of + * length 1. See fs-const-array-of-struct-of-array.shader_test. So + * treat a non-array as if it was an array of length 1. + * + * It would be nice if there was a nir pass to convert arrays of + * length 1 to ssa. 
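+ * + * Note the length computed below is in scalar components: e.g. + * (illustrative) a vec4[3] register gives 4 * 3 = 12, while a plain + * vec2 with num_array_elems == 0 gives 2 * MAX2(1, 0) = 2.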
+ */ + arr->length = reg->num_components * MAX2(1, reg->num_array_elems); + compile_assert(ctx, arr->length > 0); + arr->r = reg; + list_addtail(&arr->node, &ctx->ir->array_list); +} + +static struct ir3_array * +get_array(struct ir3_context *ctx, nir_register *reg) +{ + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + if (arr->r == reg) + return arr; + } + compile_error(ctx, "bogus reg: %s\n", reg->name); + return NULL; +} + +/* relative (indirect) if address!=NULL */ +static struct ir3_instruction * +create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n, + struct ir3_instruction *address) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *mov; + struct ir3_register *src; + + mov = ir3_instr_create(block, OPC_MOV); + mov->cat1.src_type = TYPE_U32; + mov->cat1.dst_type = TYPE_U32; + mov->barrier_class = IR3_BARRIER_ARRAY_R; + mov->barrier_conflict = IR3_BARRIER_ARRAY_W; + ir3_reg_create(mov, 0, 0); + src = ir3_reg_create(mov, 0, IR3_REG_ARRAY | + COND(address, IR3_REG_RELATIV)); + src->instr = arr->last_write; + src->size = arr->length; + src->array.id = arr->id; + src->array.offset = n; + + if (address) + ir3_instr_set_address(mov, address); + + return mov; +} + +/* relative (indirect) if address!=NULL */ +static void +create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n, + struct ir3_instruction *src, struct ir3_instruction *address) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *mov; + struct ir3_register *dst; + + /* if not relative store, don't create an extra mov, since that + * ends up being difficult for cp to remove. + */ + if (!address) { + dst = src->regs[0]; + + src->barrier_class |= IR3_BARRIER_ARRAY_W; + src->barrier_conflict |= IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W; + + dst->flags |= IR3_REG_ARRAY; + dst->instr = arr->last_write; + dst->size = arr->length; + dst->array.id = arr->id; + dst->array.offset = n; + + arr->last_write = src; + + array_insert(block, block->keeps, src); + + return; + } + + mov = ir3_instr_create(block, OPC_MOV); + mov->cat1.src_type = TYPE_U32; + mov->cat1.dst_type = TYPE_U32; + mov->barrier_class = IR3_BARRIER_ARRAY_W; + mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W; + dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY | + COND(address, IR3_REG_RELATIV)); + dst->instr = arr->last_write; + dst->size = arr->length; + dst->array.id = arr->id; + dst->array.offset = n; + ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src; + + if (address) + ir3_instr_set_address(mov, address); + + arr->last_write = mov; + + /* the array store may only matter to something in an earlier + * block (ie. loops), but since arrays are not in SSA, depth + * pass won't know this.. 
so keep all array stores: + */ + array_insert(block, block->keeps, mov); +} + +static inline type_t utype_for_size(unsigned bit_size) +{ + switch (bit_size) { + case 32: return TYPE_U32; + case 16: return TYPE_U16; + case 8: return TYPE_U8; + default: unreachable("bad bitsize"); return ~0; + } +} + +static inline type_t utype_src(nir_src src) +{ return utype_for_size(nir_src_bit_size(src)); } + +static inline type_t utype_dst(nir_dest dst) +{ return utype_for_size(nir_dest_bit_size(dst)); } + +/* allocate a n element value array (to be populated by caller) and + * insert in def_ht + */ +static struct ir3_instruction ** +get_dst_ssa(struct ir3_context *ctx, nir_ssa_def *dst, unsigned n) +{ + struct ir3_instruction **value = + ralloc_array(ctx->def_ht, struct ir3_instruction *, n); + _mesa_hash_table_insert(ctx->def_ht, dst, value); + return value; +} + +static struct ir3_instruction ** +get_dst(struct ir3_context *ctx, nir_dest *dst, unsigned n) +{ + struct ir3_instruction **value; + + if (dst->is_ssa) { + value = get_dst_ssa(ctx, &dst->ssa, n); + } else { + value = ralloc_array(ctx, struct ir3_instruction *, n); + } + + /* NOTE: in non-ssa case, we don't really need to store last_dst + * but this helps us catch cases where put_dst() call is forgotten + */ + compile_assert(ctx, !ctx->last_dst); + ctx->last_dst = value; + ctx->last_dst_n = n; + + return value; +} + +static struct ir3_instruction * get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align); + +static struct ir3_instruction * const * +get_src(struct ir3_context *ctx, nir_src *src) +{ + if (src->is_ssa) { + struct hash_entry *entry; + entry = _mesa_hash_table_search(ctx->def_ht, src->ssa); + compile_assert(ctx, entry); + return entry->data; + } else { + nir_register *reg = src->reg.reg; + struct ir3_array *arr = get_array(ctx, reg); + unsigned num_components = arr->r->num_components; + struct ir3_instruction *addr = NULL; + struct ir3_instruction **value = + ralloc_array(ctx, struct ir3_instruction *, num_components); + + if (src->reg.indirect) + addr = get_addr(ctx, get_src(ctx, src->reg.indirect)[0], + reg->num_components); + + for (unsigned i = 0; i < num_components; i++) { + unsigned n = src->reg.base_offset * reg->num_components + i; + compile_assert(ctx, n < arr->length); + value[i] = create_array_load(ctx, arr, n, addr); + } + + return value; + } +} + +static void +put_dst(struct ir3_context *ctx, nir_dest *dst) +{ + unsigned bit_size = nir_dest_bit_size(*dst); + + if (bit_size < 32) { + for (unsigned i = 0; i < ctx->last_dst_n; i++) { + struct ir3_instruction *dst = ctx->last_dst[i]; + dst->regs[0]->flags |= IR3_REG_HALF; + if (ctx->last_dst[i]->opc == OPC_META_FO) + dst->regs[1]->instr->regs[0]->flags |= IR3_REG_HALF; + } + } + + if (!dst->is_ssa) { + nir_register *reg = dst->reg.reg; + struct ir3_array *arr = get_array(ctx, reg); + unsigned num_components = ctx->last_dst_n; + struct ir3_instruction *addr = NULL; + + if (dst->reg.indirect) + addr = get_addr(ctx, get_src(ctx, dst->reg.indirect)[0], + reg->num_components); + + for (unsigned i = 0; i < num_components; i++) { + unsigned n = dst->reg.base_offset * reg->num_components + i; + compile_assert(ctx, n < arr->length); + if (!ctx->last_dst[i]) + continue; + create_array_store(ctx, arr, n, ctx->last_dst[i], addr); + } + + ralloc_free(ctx->last_dst); + } + ctx->last_dst = NULL; + ctx->last_dst_n = 0; +} + +static struct ir3_instruction * +create_immed_typed(struct ir3_block *block, uint32_t val, type_t type) +{ + struct ir3_instruction *mov; + unsigned 
flags = (type_size(type) < 32) ? IR3_REG_HALF : 0; + + mov = ir3_instr_create(block, OPC_MOV); + mov->cat1.src_type = type; + mov->cat1.dst_type = type; + ir3_reg_create(mov, 0, flags); + ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val; + + return mov; +} + +static struct ir3_instruction * +create_immed(struct ir3_block *block, uint32_t val) +{ + return create_immed_typed(block, val, TYPE_U32); +} + +static struct ir3_instruction * +create_addr(struct ir3_block *block, struct ir3_instruction *src, int align) +{ + struct ir3_instruction *instr, *immed; + + /* TODO in at least some cases, the backend could probably be + * made clever enough to propagate IR3_REG_HALF.. + */ + instr = ir3_COV(block, src, TYPE_U32, TYPE_S16); + instr->regs[0]->flags |= IR3_REG_HALF; + + switch(align){ + case 1: + /* src *= 1: */ + break; + case 2: + /* src *= 2 => src <<= 1: */ + immed = create_immed(block, 1); + immed->regs[0]->flags |= IR3_REG_HALF; + + instr = ir3_SHL_B(block, instr, 0, immed, 0); + instr->regs[0]->flags |= IR3_REG_HALF; + instr->regs[1]->flags |= IR3_REG_HALF; + break; + case 3: + /* src *= 3: */ + immed = create_immed(block, 3); + immed->regs[0]->flags |= IR3_REG_HALF; + + instr = ir3_MULL_U(block, instr, 0, immed, 0); + instr->regs[0]->flags |= IR3_REG_HALF; + instr->regs[1]->flags |= IR3_REG_HALF; + break; + case 4: + /* src *= 4 => src <<= 2: */ + immed = create_immed(block, 2); + immed->regs[0]->flags |= IR3_REG_HALF; + + instr = ir3_SHL_B(block, instr, 0, immed, 0); + instr->regs[0]->flags |= IR3_REG_HALF; + instr->regs[1]->flags |= IR3_REG_HALF; + break; + default: + unreachable("bad align"); + return NULL; + } + + instr = ir3_MOV(block, instr, TYPE_S16); + instr->regs[0]->num = regid(REG_A0, 0); + instr->regs[0]->flags |= IR3_REG_HALF; + instr->regs[1]->flags |= IR3_REG_HALF; + + return instr; +} + +/* caches addr values to avoid generating multiple cov/shl/mova + * sequences for each use of a given NIR level src as address + */ +static struct ir3_instruction * +get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align) +{ + struct ir3_instruction *addr; + unsigned idx = align - 1; + + compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr_ht)); + + if (!ctx->addr_ht[idx]) { + ctx->addr_ht[idx] = _mesa_hash_table_create(ctx, + _mesa_hash_pointer, _mesa_key_pointer_equal); + } else { + struct hash_entry *entry; + entry = _mesa_hash_table_search(ctx->addr_ht[idx], src); + if (entry) + return entry->data; + } + + addr = create_addr(ctx->block, src, align); + _mesa_hash_table_insert(ctx->addr_ht[idx], src, addr); + + return addr; +} + +static struct ir3_instruction * +get_predicate(struct ir3_context *ctx, struct ir3_instruction *src) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *cond; + + /* NOTE: only cmps.*.* can write p0.x: */ + cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0); + cond->cat2.condition = IR3_COND_NE; + + /* condition always goes in predicate register: */ + cond->regs[0]->num = regid(REG_P0, 0); + + return cond; +} + +static struct ir3_instruction * +create_uniform(struct ir3_context *ctx, unsigned n) +{ + struct ir3_instruction *mov; + + mov = ir3_instr_create(ctx->block, OPC_MOV); + /* TODO get types right? 
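+ * (for a mov with matching src and dst type there should be no + * actual conversion, so F32 ought to be a plain bitwise copy even + * for integer uniforms -- an assumption, not verified here)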
*/ + mov->cat1.src_type = TYPE_F32; + mov->cat1.dst_type = TYPE_F32; + ir3_reg_create(mov, 0, 0); + ir3_reg_create(mov, n, IR3_REG_CONST); + + return mov; +} + +static struct ir3_instruction * +create_uniform_indirect(struct ir3_context *ctx, int n, + struct ir3_instruction *address) +{ + struct ir3_instruction *mov; + + mov = ir3_instr_create(ctx->block, OPC_MOV); + mov->cat1.src_type = TYPE_U32; + mov->cat1.dst_type = TYPE_U32; + ir3_reg_create(mov, 0, 0); + ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n; + + ir3_instr_set_address(mov, address); + + return mov; +} + +static struct ir3_instruction * +create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr, + unsigned arrsz) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *collect; + + if (arrsz == 0) + return NULL; + + unsigned flags = arr[0]->regs[0]->flags & IR3_REG_HALF; + + collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz); + ir3_reg_create(collect, 0, flags); /* dst */ + for (unsigned i = 0; i < arrsz; i++) { + struct ir3_instruction *elem = arr[i]; + + /* Since arrays are pre-colored in RA, we can't assume that + * things will end up in the right place. (Ie. if a collect + * joins elements from two different arrays.) So insert an + * extra mov. + * + * We could possibly skip this if all the collected elements + * are contiguous elements in a single array.. not sure how + * likely that is to happen. + * + * Fixes a problem with glamor shaders, which in effect do + * something like: + * + * if (foo) + * texcoord = .. + * else + * texcoord = .. + * color = texture2D(tex, texcoord); + * + * In this case, texcoord will end up as nir registers (which + * translate to ir3 arrays of length 1). And we can't assume + * the two (or more) arrays will get allocated in consecutive + * scalar registers. + * + */ + if (elem->regs[0]->flags & IR3_REG_ARRAY) { + type_t type = (flags & IR3_REG_HALF) ? 
TYPE_U16 : TYPE_U32; + elem = ir3_MOV(block, elem, type); + } + + compile_assert(ctx, (elem->regs[0]->flags & IR3_REG_HALF) == flags); + ir3_reg_create(collect, 0, IR3_REG_SSA | flags)->instr = elem; + } + + return collect; +} + +static struct ir3_instruction * +create_indirect_load(struct ir3_context *ctx, unsigned arrsz, int n, + struct ir3_instruction *address, struct ir3_instruction *collect) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *mov; + struct ir3_register *src; + + mov = ir3_instr_create(block, OPC_MOV); + mov->cat1.src_type = TYPE_U32; + mov->cat1.dst_type = TYPE_U32; + ir3_reg_create(mov, 0, 0); + src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV); + src->instr = collect; + src->size = arrsz; + src->array.offset = n; + + ir3_instr_set_address(mov, address); + + return mov; +} + +static struct ir3_instruction * +create_input_compmask(struct ir3_context *ctx, unsigned n, unsigned compmask) +{ + struct ir3_instruction *in; + + in = ir3_instr_create(ctx->in_block, OPC_META_INPUT); + in->inout.block = ctx->in_block; + ir3_reg_create(in, n, 0); + + in->regs[0]->wrmask = compmask; + + return in; +} + +static struct ir3_instruction * +create_input(struct ir3_context *ctx, unsigned n) +{ + return create_input_compmask(ctx, n, 0x1); +} + +static struct ir3_instruction * +create_frag_input(struct ir3_context *ctx, bool use_ldlv) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *instr; + /* actual inloc is assigned and fixed up later: */ + struct ir3_instruction *inloc = create_immed(block, 0); + + if (use_ldlv) { + instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0); + instr->cat6.type = TYPE_U32; + instr->cat6.iim_val = 1; + } else { + instr = ir3_BARY_F(block, inloc, 0, ctx->frag_vcoord, 0); + instr->regs[2]->wrmask = 0x3; + } + + return instr; +} + +static struct ir3_instruction * +create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp) +{ + /* first four vec4 sysvals reserved for UBOs: */ + /* NOTE: dp is in scalar, but there can be >4 dp components: */ + unsigned n = ctx->so->constbase.driver_param; + unsigned r = regid(n + dp / 4, dp % 4); + return create_uniform(ctx, r); +} + +/* helper for instructions that produce multiple consecutive scalar + * outputs which need to have a split/fanout meta instruction inserted + */ +static void +split_dest(struct ir3_block *block, struct ir3_instruction **dst, + struct ir3_instruction *src, unsigned base, unsigned n) +{ + struct ir3_instruction *prev = NULL; + + if ((n == 1) && (src->regs[0]->wrmask == 0x1)) { + dst[0] = src; + return; + } + + for (int i = 0, j = 0; i < n; i++) { + struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO); + ir3_reg_create(split, 0, IR3_REG_SSA); + ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src; + split->fo.off = i + base; + + if (prev) { + split->cp.left = prev; + split->cp.left_cnt++; + prev->cp.right = split; + prev->cp.right_cnt++; + } + prev = split; + + if (src->regs[0]->wrmask & (1 << (i + base))) + dst[j++] = split; + } +} + +/* + * Adreno uses uint rather than having a dedicated bool type, + * which (potentially) requires some conversion, in particular + * when using the output of a bool instr as an int input, or vice + * versa. 
+ * + * | Adreno | NIR | + * -------+---------+-------+- + * true | 1 | ~0 | + * false | 0 | 0 | + * + * To convert from an adreno bool (uint) to nir, use: + * + * absneg.s dst, (neg)src + * + * To convert back in the other direction: + * + * absneg.s dst, (abs)src + * + * The CP step can clean up the absneg.s that cancel each other + * out, and with a slight bit of extra cleverness (to recognize + * the instructions which produce either a 0 or 1) can eliminate + * the absneg.s's completely when an instruction that wants + * 0/1 consumes the result. For example, when a nir 'bcsel' + * consumes the result of 'feq'. So we should be able to get by + * without a boolean resolve step, and without incurring any + * extra penalty in instruction count. + */ + +/* NIR bool -> native (adreno): */ +static struct ir3_instruction * +ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr) +{ + return ir3_ABSNEG_S(block, instr, IR3_REG_SABS); +} + +/* native (adreno) -> NIR bool: */ +static struct ir3_instruction * +ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr) +{ + return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG); +} + +/* + * alu/sfu instructions: + */ + +static struct ir3_instruction * +create_cov(struct ir3_context *ctx, struct ir3_instruction *src, + unsigned src_bitsize, nir_op op) +{ + type_t src_type, dst_type; + + switch (op) { + case nir_op_f2f32: + case nir_op_f2f16_rtne: + case nir_op_f2f16_rtz: + case nir_op_f2f16: + case nir_op_f2i32: + case nir_op_f2i16: + case nir_op_f2i8: + case nir_op_f2u32: + case nir_op_f2u16: + case nir_op_f2u8: + switch (src_bitsize) { + case 32: + src_type = TYPE_F32; + break; + case 16: + src_type = TYPE_F16; + break; + default: + compile_error(ctx, "invalid src bit size: %u", src_bitsize); + } + break; + + case nir_op_i2f32: + case nir_op_i2f16: + case nir_op_i2i32: + case nir_op_i2i16: + case nir_op_i2i8: + switch (src_bitsize) { + case 32: + src_type = TYPE_S32; + break; + case 16: + src_type = TYPE_S16; + break; + case 8: + src_type = TYPE_S8; + break; + default: + compile_error(ctx, "invalid src bit size: %u", src_bitsize); + } + break; + + case nir_op_u2f32: + case nir_op_u2f16: + case nir_op_u2u32: + case nir_op_u2u16: + case nir_op_u2u8: + switch (src_bitsize) { + case 32: + src_type = TYPE_U32; + break; + case 16: + src_type = TYPE_U16; + break; + case 8: + src_type = TYPE_U8; + break; + default: + compile_error(ctx, "invalid src bit size: %u", src_bitsize); + } + break; + + default: + compile_error(ctx, "invalid conversion op: %u", op); + } + + switch (op) { + case nir_op_f2f32: + case nir_op_i2f32: + case nir_op_u2f32: + dst_type = TYPE_F32; + break; + + case nir_op_f2f16_rtne: + case nir_op_f2f16_rtz: + case nir_op_f2f16: + /* TODO how to handle rounding mode? 
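+ * (note that f2f16_rtne and f2f16_rtz currently take the same path + * as plain f2f16 here, so the requested rounding mode is effectively + * ignored)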
*/ + case nir_op_i2f16: + case nir_op_u2f16: + dst_type = TYPE_F16; + break; + + case nir_op_f2i32: + case nir_op_i2i32: + dst_type = TYPE_S32; + break; + + case nir_op_f2i16: + case nir_op_i2i16: + dst_type = TYPE_S16; + break; + + case nir_op_f2i8: + case nir_op_i2i8: + dst_type = TYPE_S8; + break; + + case nir_op_f2u32: + case nir_op_u2u32: + dst_type = TYPE_U32; + break; + + case nir_op_f2u16: + case nir_op_u2u16: + dst_type = TYPE_U16; + break; + + case nir_op_f2u8: + case nir_op_u2u8: + dst_type = TYPE_U8; + break; + + default: + compile_error(ctx, "invalid conversion op: %u", op); + } + + return ir3_COV(ctx->block, src, src_type, dst_type); +} + +static void +emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) +{ + const nir_op_info *info = &nir_op_infos[alu->op]; + struct ir3_instruction **dst, *src[info->num_inputs]; + unsigned bs[info->num_inputs]; /* bit size */ + struct ir3_block *b = ctx->block; + unsigned dst_sz, wrmask; + + if (alu->dest.dest.is_ssa) { + dst_sz = alu->dest.dest.ssa.num_components; + wrmask = (1 << dst_sz) - 1; + } else { + dst_sz = alu->dest.dest.reg.reg->num_components; + wrmask = alu->dest.write_mask; + } + + dst = get_dst(ctx, &alu->dest.dest, dst_sz); + + /* Vectors are special in that they have non-scalarized writemasks, + * and just take the first swizzle channel for each argument in + * order into each writemask channel. + */ + if ((alu->op == nir_op_vec2) || + (alu->op == nir_op_vec3) || + (alu->op == nir_op_vec4)) { + + for (int i = 0; i < info->num_inputs; i++) { + nir_alu_src *asrc = &alu->src[i]; + + compile_assert(ctx, !asrc->abs); + compile_assert(ctx, !asrc->negate); + + src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[0]]; + if (!src[i]) + src[i] = create_immed(ctx->block, 0); + dst[i] = ir3_MOV(b, src[i], TYPE_U32); + } + + put_dst(ctx, &alu->dest.dest); + return; + } + + /* We also get mov's with more than one component, so handle + * those specially: + */ + if ((alu->op == nir_op_imov) || (alu->op == nir_op_fmov)) { + type_t type = (alu->op == nir_op_imov) ? TYPE_U32 : TYPE_F32; + nir_alu_src *asrc = &alu->src[0]; + struct ir3_instruction *const *src0 = get_src(ctx, &asrc->src); + + for (unsigned i = 0; i < dst_sz; i++) { + if (wrmask & (1 << i)) { + dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], type); + } else { + dst[i] = NULL; + } + } + + put_dst(ctx, &alu->dest.dest); + return; + } + + /* General case: We can just grab the one used channel per src. 
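+ * E.g. with write_mask 0x2 (.y only), chan below is 1 and each src + * contributes its swizzle[1] component.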
*/ + for (int i = 0; i < info->num_inputs; i++) { + unsigned chan = ffs(alu->dest.write_mask) - 1; + nir_alu_src *asrc = &alu->src[i]; + + compile_assert(ctx, !asrc->abs); + compile_assert(ctx, !asrc->negate); + + src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[chan]]; + bs[i] = nir_src_bit_size(asrc->src); + + compile_assert(ctx, src[i]); + } + + switch (alu->op) { + case nir_op_f2f32: + case nir_op_f2f16_rtne: + case nir_op_f2f16_rtz: + case nir_op_f2f16: + case nir_op_f2i32: + case nir_op_f2i16: + case nir_op_f2i8: + case nir_op_f2u32: + case nir_op_f2u16: + case nir_op_f2u8: + case nir_op_i2f32: + case nir_op_i2f16: + case nir_op_i2i32: + case nir_op_i2i16: + case nir_op_i2i8: + case nir_op_u2f32: + case nir_op_u2f16: + case nir_op_u2u32: + case nir_op_u2u16: + case nir_op_u2u8: + dst[0] = create_cov(ctx, src[0], bs[0], alu->op); + break; + case nir_op_f2b: + dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0); + dst[0]->cat2.condition = IR3_COND_NE; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_b2f: + dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32); + break; + case nir_op_b2i: + dst[0] = ir3_b2n(b, src[0]); + break; + case nir_op_i2b: + dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0); + dst[0]->cat2.condition = IR3_COND_NE; + dst[0] = ir3_n2b(b, dst[0]); + break; + + case nir_op_fneg: + dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG); + break; + case nir_op_fabs: + dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS); + break; + case nir_op_fmax: + dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0); + break; + case nir_op_fmin: + dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0); + break; + case nir_op_fsat: + /* if there is just a single use of the src, and it supports + * (sat) bit, we can just fold the (sat) flag back to the + * src instruction and create a mov. This is easier for cp + * to eliminate. + * + * TODO probably opc_cat==4 is ok too + */ + if (alu->src[0].src.is_ssa && + (list_length(&alu->src[0].src.ssa->uses) == 1) && + ((opc_cat(src[0]->opc) == 2) || (opc_cat(src[0]->opc) == 3))) { + src[0]->flags |= IR3_INSTR_SAT; + dst[0] = ir3_MOV(b, src[0], TYPE_U32); + } else { + /* otherwise generate a max.f that saturates.. 
blob does + * similar (generating a cat2 mov using max.f) + */ + dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0); + dst[0]->flags |= IR3_INSTR_SAT; + } + break; + case nir_op_fmul: + dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0); + break; + case nir_op_fadd: + dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0); + break; + case nir_op_fsub: + dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG); + break; + case nir_op_ffma: + dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0); + break; + case nir_op_fddx: + dst[0] = ir3_DSX(b, src[0], 0); + dst[0]->cat5.type = TYPE_F32; + break; + case nir_op_fddy: + dst[0] = ir3_DSY(b, src[0], 0); + dst[0]->cat5.type = TYPE_F32; + break; + case nir_op_flt: + dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_LT; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_fge: + dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_GE; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_feq: + dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_EQ; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_fne: + dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_NE; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_fceil: + dst[0] = ir3_CEIL_F(b, src[0], 0); + break; + case nir_op_ffloor: + dst[0] = ir3_FLOOR_F(b, src[0], 0); + break; + case nir_op_ftrunc: + dst[0] = ir3_TRUNC_F(b, src[0], 0); + break; + case nir_op_fround_even: + dst[0] = ir3_RNDNE_F(b, src[0], 0); + break; + case nir_op_fsign: + dst[0] = ir3_SIGN_F(b, src[0], 0); + break; + + case nir_op_fsin: + dst[0] = ir3_SIN(b, src[0], 0); + break; + case nir_op_fcos: + dst[0] = ir3_COS(b, src[0], 0); + break; + case nir_op_frsq: + dst[0] = ir3_RSQ(b, src[0], 0); + break; + case nir_op_frcp: + dst[0] = ir3_RCP(b, src[0], 0); + break; + case nir_op_flog2: + dst[0] = ir3_LOG2(b, src[0], 0); + break; + case nir_op_fexp2: + dst[0] = ir3_EXP2(b, src[0], 0); + break; + case nir_op_fsqrt: + dst[0] = ir3_SQRT(b, src[0], 0); + break; + + case nir_op_iabs: + dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS); + break; + case nir_op_iadd: + dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0); + break; + case nir_op_iand: + dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0); + break; + case nir_op_imax: + dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0); + break; + case nir_op_umax: + dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0); + break; + case nir_op_imin: + dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0); + break; + case nir_op_umin: + dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0); + break; + case nir_op_imul: + /* + * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16) + * mull.u tmp0, a, b ; mul low, i.e. al * bl + * madsh.m16 tmp1, a, b, tmp0 ; mul-add shift high mix, i.e. ah * bl << 16 + * madsh.m16 dst, b, a, tmp1 ; i.e. al * bh << 16 + */ + dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0, + ir3_MADSH_M16(b, src[0], 0, src[1], 0, + ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0); + break; + case nir_op_ineg: + dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG); + break; + case nir_op_inot: + dst[0] = ir3_NOT_B(b, src[0], 0); + break; + case nir_op_ior: + dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0); + break; + case nir_op_ishl: + dst[0] = ir3_SHL_B(b, src[0], 0, src[1], 0); + break; + case nir_op_ishr: + dst[0] = ir3_ASHR_B(b, src[0], 0, src[1], 0); + break; + case nir_op_isign: { + /* maybe this would be sane to lower in nir.. 
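+ * (what follows implements isign(x) = (x > 0) - (x < 0): the two + * cmps.s produce 0/1 values and the sub combines them)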
*/ + struct ir3_instruction *neg, *pos; + + neg = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0); + neg->cat2.condition = IR3_COND_LT; + + pos = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0); + pos->cat2.condition = IR3_COND_GT; + + dst[0] = ir3_SUB_U(b, pos, 0, neg, 0); + + break; + } + case nir_op_isub: + dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0); + break; + case nir_op_ixor: + dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0); + break; + case nir_op_ushr: + dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0); + break; + case nir_op_ilt: + dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_LT; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_ige: + dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_GE; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_ieq: + dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_EQ; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_ine: + dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_NE; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_ult: + dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_LT; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_uge: + dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_GE; + dst[0] = ir3_n2b(b, dst[0]); + break; + + case nir_op_bcsel: { + struct ir3_instruction *cond = ir3_b2n(b, src[0]); + compile_assert(ctx, bs[1] == bs[2]); + /* the boolean condition is 32b even if src[1] and src[2] are + * half-precision, but sel.b16 wants all three src's to be the + * same type. + */ + if (bs[1] < 32) + cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16); + dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0); + break; + } + case nir_op_bit_count: + dst[0] = ir3_CBITS_B(b, src[0], 0); + break; + case nir_op_ifind_msb: { + struct ir3_instruction *cmp; + dst[0] = ir3_CLZ_S(b, src[0], 0); + cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0); + cmp->cat2.condition = IR3_COND_GE; + dst[0] = ir3_SEL_B32(b, + ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0, + cmp, 0, dst[0], 0); + break; + } + case nir_op_ufind_msb: + dst[0] = ir3_CLZ_B(b, src[0], 0); + dst[0] = ir3_SEL_B32(b, + ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0, + src[0], 0, dst[0], 0); + break; + case nir_op_find_lsb: + dst[0] = ir3_BFREV_B(b, src[0], 0); + dst[0] = ir3_CLZ_B(b, dst[0], 0); + break; + case nir_op_bitfield_reverse: + dst[0] = ir3_BFREV_B(b, src[0], 0); + break; + + default: + compile_error(ctx, "Unhandled ALU op: %s\n", + nir_op_infos[alu->op].name); + break; + } + + put_dst(ctx, &alu->dest.dest); +} + +/* handles direct/indirect UBO reads: */ +static void +emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1; + nir_const_value *const_offset; + /* UBO addresses are the first driver params: */ + unsigned ubo = regid(ctx->so->constbase.ubo, 0); + const unsigned ptrsz = pointer_size(ctx); + + int off = 0; + + /* First src is ubo index, which could either be an immed or not: */ + src0 = get_src(ctx, &intr->src[0])[0]; + if (is_same_type_mov(src0) && + (src0->regs[1]->flags & IR3_REG_IMMED)) { + base_lo = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz)); + base_hi = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz) + 1); + } else { + base_lo = create_uniform_indirect(ctx, ubo, 
get_addr(ctx, src0, 4)); + base_hi = create_uniform_indirect(ctx, ubo + 1, get_addr(ctx, src0, 4)); + } + + /* note: on 32bit gpus base_hi is ignored and DCE'd */ + addr = base_lo; + + const_offset = nir_src_as_const_value(intr->src[1]); + if (const_offset) { + off += const_offset->u32[0]; + } else { + /* For load_ubo_indirect, second src is indirect offset: */ + src1 = get_src(ctx, &intr->src[1])[0]; + + /* and add offset to addr: */ + addr = ir3_ADD_S(b, addr, 0, src1, 0); + } + + /* if offset is too large to encode in the ldg, split it out: */ + if ((off + (intr->num_components * 4)) > 1024) { + /* split out the minimal amount to improve the odds that + * cp can fit the immediate in the add.s instruction: + */ + unsigned off2 = off + (intr->num_components * 4) - 1024; + addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0); + off -= off2; + } + + if (ptrsz == 2) { + struct ir3_instruction *carry; + + /* handle 32b rollover, ie: + * if (addr < base_lo) + * base_hi++ + */ + carry = ir3_CMPS_U(b, addr, 0, base_lo, 0); + carry->cat2.condition = IR3_COND_LT; + base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0); + + addr = create_collect(ctx, (struct ir3_instruction*[]){ addr, base_hi }, 2); + } + + for (int i = 0; i < intr->num_components; i++) { + struct ir3_instruction *load = + ir3_LDG(b, addr, 0, create_immed(b, 1), 0); + load->cat6.type = TYPE_U32; + load->cat6.src_offset = off + i * 4; /* byte offset */ + dst[i] = load; + } +} + +/* src[] = { buffer_index, offset }. No const_index */ +static void +emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *ldgb, *src0, *src1, *offset; + nir_const_value *const_offset; + + /* can this be non-const buffer_index? how do we handle that? */ + const_offset = nir_src_as_const_value(intr->src[0]); + compile_assert(ctx, const_offset); + + offset = get_src(ctx, &intr->src[1])[0]; + + /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */ + src0 = create_collect(ctx, (struct ir3_instruction*[]){ + offset, + create_immed(b, 0), + }, 2); + src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); + + ldgb = ir3_LDGB(b, create_immed(b, const_offset->u32[0]), 0, + src0, 0, src1, 0); + ldgb->regs[0]->wrmask = MASK(intr->num_components); + ldgb->cat6.iim_val = intr->num_components; + ldgb->cat6.d = 4; + ldgb->cat6.type = TYPE_U32; + ldgb->barrier_class = IR3_BARRIER_BUFFER_R; + ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W; + + split_dest(b, dst, ldgb, 0, intr->num_components); +} + +/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */ +static void +emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *stgb, *src0, *src1, *src2, *offset; + nir_const_value *const_offset; + /* TODO handle wrmask properly, see _store_shared().. but I think + * it is more a PITA than that, since blob ends up loading the + * masked components and writing them back out. + */ + unsigned wrmask = intr->const_index[0]; + unsigned ncomp = ffs(~wrmask) - 1; + + /* can this be non-const buffer_index? how do we handle that? */ + const_offset = nir_src_as_const_value(intr->src[1]); + compile_assert(ctx, const_offset); + + offset = get_src(ctx, &intr->src[2])[0]; + + /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0).. 
+ * nir already *= 4: + */ + src0 = create_collect(ctx, get_src(ctx, &intr->src[0]), ncomp); + src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); + src2 = create_collect(ctx, (struct ir3_instruction*[]){ + offset, + create_immed(b, 0), + }, 2); + + stgb = ir3_STGB(b, create_immed(b, const_offset->u32[0]), 0, + src0, 0, src1, 0, src2, 0); + stgb->cat6.iim_val = ncomp; + stgb->cat6.d = 4; + stgb->cat6.type = TYPE_U32; + stgb->barrier_class = IR3_BARRIER_BUFFER_W; + stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; + + array_insert(b, b->keeps, stgb); +} + +/* src[] = { block_index } */ +static void +emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + /* SSBO size stored as a const starting at ssbo_sizes: */ + unsigned blk_idx = nir_src_as_const_value(intr->src[0])->u32[0]; + unsigned idx = regid(ctx->so->constbase.ssbo_sizes, 0) + + ctx->so->const_layout.ssbo_size.off[blk_idx]; + + debug_assert(ctx->so->const_layout.ssbo_size.mask & (1 << blk_idx)); + + dst[0] = create_uniform(ctx, idx); +} + +/* + * SSBO atomic intrinsics + * + * All of the SSBO atomic memory operations read a value from memory, + * compute a new value using one of the operations below, write the new + * value to memory, and return the original value read. + * + * All operations take 3 sources except CompSwap that takes 4. These + * sources represent: + * + * 0: The SSBO buffer index. + * 1: The offset into the SSBO buffer of the variable that the atomic + * operation will operate on. + * 2: The data parameter to the atomic function (i.e. the value to add + * in ssbo_atomic_add, etc). + * 3: For CompSwap only: the second data parameter. + */ +static struct ir3_instruction * +emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *offset; + nir_const_value *const_offset; + type_t type = TYPE_U32; + + /* can this be non-const buffer_index? how do we handle that? 
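+ * (a dynamically indexed SSBO would not survive the compile_assert + * below; the assert documents that assumption rather than handling + * it)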
*/ + const_offset = nir_src_as_const_value(intr->src[0]); + compile_assert(ctx, const_offset); + ssbo = create_immed(b, const_offset->u32[0]); + + offset = get_src(ctx, &intr->src[1])[0]; + + /* src0 is data (or uvec2(data, compare)) + * src1 is offset + * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset) + * + * Note that nir already multiplies the offset by four + */ + src0 = get_src(ctx, &intr->src[2])[0]; + src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); + src2 = create_collect(ctx, (struct ir3_instruction*[]){ + offset, + create_immed(b, 0), + }, 2); + + switch (intr->intrinsic) { + case nir_intrinsic_ssbo_atomic_add: + atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_imin: + atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + type = TYPE_S32; + break; + case nir_intrinsic_ssbo_atomic_umin: + atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_imax: + atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + type = TYPE_S32; + break; + case nir_intrinsic_ssbo_atomic_umax: + atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_and: + atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_or: + atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_xor: + atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_exchange: + atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_comp_swap: + /* for cmpxchg, src0 is [ui]vec2(data, compare): */ + src0 = create_collect(ctx, (struct ir3_instruction*[]){ + get_src(ctx, &intr->src[3])[0], + src0, + }, 2); + atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + default: + unreachable("boo"); + } + + atomic->cat6.iim_val = 1; + atomic->cat6.d = 4; + atomic->cat6.type = type; + atomic->barrier_class = IR3_BARRIER_BUFFER_W; + atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; + + /* even if nothing consumes the result, we can't DCE the instruction: */ + array_insert(b, b->keeps, atomic); + + return atomic; +} + +/* src[] = { offset }. const_index[] = { base } */ +static void +emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *ldl, *offset; + unsigned base; + + offset = get_src(ctx, &intr->src[0])[0]; + base = nir_intrinsic_base(intr); + + ldl = ir3_LDL(b, offset, 0, create_immed(b, intr->num_components), 0); + ldl->cat6.src_offset = base; + ldl->cat6.type = utype_dst(intr->dest); + ldl->regs[0]->wrmask = MASK(intr->num_components); + + ldl->barrier_class = IR3_BARRIER_SHARED_R; + ldl->barrier_conflict = IR3_BARRIER_SHARED_W; + + split_dest(b, dst, ldl, 0, intr->num_components); +} + +/* src[] = { value, offset }. 
const_index[] = { base, write_mask } */ +static void +emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *stl, *offset; + struct ir3_instruction * const *value; + unsigned base, wrmask; + + value = get_src(ctx, &intr->src[0]); + offset = get_src(ctx, &intr->src[1])[0]; + + base = nir_intrinsic_base(intr); + wrmask = nir_intrinsic_write_mask(intr); + + /* Combine groups of consecutive enabled channels in one write + * message. We use ffs to find the first enabled channel and then ffs on + * the bit-inverse, down-shifted writemask to determine the length of + * the block of enabled bits. + * + * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic()) + */ + while (wrmask) { + unsigned first_component = ffs(wrmask) - 1; + unsigned length = ffs(~(wrmask >> first_component)) - 1; + + stl = ir3_STL(b, offset, 0, + create_collect(ctx, &value[first_component], length), 0, + create_immed(b, length), 0); + stl->cat6.dst_offset = first_component + base; + stl->cat6.type = utype_src(intr->src[0]); + stl->barrier_class = IR3_BARRIER_SHARED_W; + stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W; + + array_insert(b, b->keeps, stl); + + /* Clear the bits in the writemask that we just wrote, then try + * again to see if more channels are left. + */ + wrmask &= (15 << (first_component + length)); + } +} + +/* + * CS shared variable atomic intrinsics + * + * All of the shared variable atomic memory operations read a value from + * memory, compute a new value using one of the operations below, write the + * new value to memory, and return the original value read. + * + * All operations take 2 sources except CompSwap that takes 3. These + * sources represent: + * + * 0: The offset into the shared variable storage region that the atomic + * operation will operate on. + * 1: The data parameter to the atomic function (i.e. the value to add + * in shared_atomic_add, etc). + * 2: For CompSwap only: the second data parameter. 
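+ * + * As a sketch (hypothetical GLSL, not from this patch), an + * atomicAdd(shared_counter, 1u) reaches us as shared_atomic_add with + * src[0] = the byte offset of shared_counter and src[1] = an + * immediate 1, and is emitted via the atomic.add case below.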
+ */ +static struct ir3_instruction * +emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *atomic, *src0, *src1; + type_t type = TYPE_U32; + + src0 = get_src(ctx, &intr->src[0])[0]; /* offset */ + src1 = get_src(ctx, &intr->src[1])[0]; /* value */ + + switch (intr->intrinsic) { + case nir_intrinsic_shared_atomic_add: + atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_imin: + atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0); + type = TYPE_S32; + break; + case nir_intrinsic_shared_atomic_umin: + atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_imax: + atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0); + type = TYPE_S32; + break; + case nir_intrinsic_shared_atomic_umax: + atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_and: + atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_or: + atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_xor: + atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_exchange: + atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_comp_swap: + /* for cmpxchg, src1 is [ui]vec2(data, compare): */ + src1 = create_collect(ctx, (struct ir3_instruction*[]){ + get_src(ctx, &intr->src[2])[0], + src1, + }, 2); + atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0); + break; + default: + unreachable("boo"); + } + + atomic->cat6.iim_val = 1; + atomic->cat6.d = 1; + atomic->cat6.type = type; + atomic->barrier_class = IR3_BARRIER_SHARED_W; + atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W; + + /* even if nothing consumes the result, we can't DCE the instruction: */ + array_insert(b, b->keeps, atomic); + + return atomic; +} + +/* Images get mapped into SSBO/image state (for store/atomic) and texture + * state block (for load). To simplify things, invert the image id and + * map it from end of state block, ie. image 0 becomes num-1, image 1 + * becomes num-2, etc. This potentially avoids needing to re-emit texture + * state when switching shaders. + * + * TODO is max # of samplers and SSBOs the same? This shouldn't be hard- + * coded. Also, since all the gl shader stages (ie. everything but CS) + * share the same SSBO/image state block, this might require some more + * logic if we supported images in anything other than FS.. + */ +static unsigned +get_image_slot(struct ir3_context *ctx, nir_deref_instr *deref) +{ + unsigned int loc = 0; + unsigned inner_size = 1; + + while (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + nir_const_value *const_index = nir_src_as_const_value(deref->arr.index); + assert(const_index); + + /* Go to the next instruction */ + deref = nir_deref_instr_parent(deref); + + assert(glsl_type_is_array(deref->type)); + const unsigned array_len = glsl_get_length(deref->type); + loc += MIN2(const_index->u32[0], array_len - 1) * inner_size; + + /* Update the inner size */ + inner_size *= array_len; + } + + loc += deref->var->data.driver_location; + + /* TODO figure out real limit per generation, and don't hardcode: */ + const unsigned max_samplers = 16; + return max_samplers - loc - 1; +} + +/* see tex_info() for equiv logic for texture instructions.. it would be + * nice if this could be better unified.. 
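+ * (for example, a 2DArray image below counts 2 coords plus 1 for the + * array index, with IR3_INSTR_A set)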
+ */ +static unsigned +get_image_coords(const nir_variable *var, unsigned *flagsp) +{ + const struct glsl_type *type = glsl_without_array(var->type); + unsigned coords, flags = 0; + + switch (glsl_get_sampler_dim(type)) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_BUF: + coords = 1; + break; + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_EXTERNAL: + case GLSL_SAMPLER_DIM_MS: + coords = 2; + break; + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + flags |= IR3_INSTR_3D; + coords = 3; + break; + default: + unreachable("bad sampler dim"); + return 0; + } + + if (glsl_sampler_type_is_array(type)) { + /* note: unlike tex_info(), adjust # of coords to include array idx: */ + coords++; + flags |= IR3_INSTR_A; + } + + if (flagsp) + *flagsp = flags; + + return coords; +} + +static type_t +get_image_type(const nir_variable *var) +{ + switch (glsl_get_sampler_result_type(glsl_without_array(var->type))) { + case GLSL_TYPE_UINT: + return TYPE_U32; + case GLSL_TYPE_INT: + return TYPE_S32; + case GLSL_TYPE_FLOAT: + return TYPE_F32; + default: + unreachable("bad sampler type."); + return 0; + } +} + +static struct ir3_instruction * +get_image_offset(struct ir3_context *ctx, const nir_variable *var, + struct ir3_instruction * const *coords, bool byteoff) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *offset; + unsigned ncoords = get_image_coords(var, NULL); + + /* to calculate the byte offset (yes, uggg) we need (up to) three + * const values to know the bytes per pixel, and y and z stride: + */ + unsigned cb = regid(ctx->so->constbase.image_dims, 0) + + ctx->so->const_layout.image_dims.off[var->data.driver_location]; + + debug_assert(ctx->so->const_layout.image_dims.mask & + (1 << var->data.driver_location)); + + /* offset = coords.x * bytes_per_pixel: */ + offset = ir3_MUL_S(b, coords[0], 0, create_uniform(ctx, cb + 0), 0); + if (ncoords > 1) { + /* offset += coords.y * y_pitch: */ + offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 1), 0, + coords[1], 0, offset, 0); + } + if (ncoords > 2) { + /* offset += coords.z * z_pitch: */ + offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 2), 0, + coords[2], 0, offset, 0); + } + + if (!byteoff) { + /* Some cases, like atomics, seem to use dword offset instead + * of byte offsets.. blob just puts an extra shr.b in there + * in those cases: + */ + offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); + } + + return create_collect(ctx, (struct ir3_instruction*[]){ + offset, + create_immed(b, 0), + }, 2); +} + +/* src[] = { deref, coord, sample_index }. 
const_index[] = {} */ +static void +emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + struct ir3_block *b = ctx->block; + const nir_variable *var = nir_intrinsic_get_var(intr, 0); + struct ir3_instruction *sam; + struct ir3_instruction * const *src0 = get_src(ctx, &intr->src[1]); + struct ir3_instruction *coords[4]; + unsigned flags, ncoords = get_image_coords(var, &flags); + unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0])); + type_t type = get_image_type(var); + + /* hmm, this seems a bit odd, but it is what blob does and (at least + * a5xx) just faults on bogus addresses otherwise: + */ + if (flags & IR3_INSTR_3D) { + flags &= ~IR3_INSTR_3D; + flags |= IR3_INSTR_A; + } + + for (unsigned i = 0; i < ncoords; i++) + coords[i] = src0[i]; + + if (ncoords == 1) + coords[ncoords++] = create_immed(b, 0); + + sam = ir3_SAM(b, OPC_ISAM, type, 0b1111, flags, + tex_idx, tex_idx, create_collect(ctx, coords, ncoords), NULL); + + sam->barrier_class = IR3_BARRIER_IMAGE_R; + sam->barrier_conflict = IR3_BARRIER_IMAGE_W; + + split_dest(b, dst, sam, 0, 4); +} + +/* src[] = { deref, coord, sample_index, value }. const_index[] = {} */ +static void +emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + const nir_variable *var = nir_intrinsic_get_var(intr, 0); + struct ir3_instruction *stib, *offset; + struct ir3_instruction * const *value = get_src(ctx, &intr->src[3]); + struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]); + unsigned ncoords = get_image_coords(var, NULL); + unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0])); + + /* src0 is value + * src1 is coords + * src2 is 64b byte offset + */ + + offset = get_image_offset(ctx, var, coords, true); + + /* NOTE: stib seems to take byte offset, but stgb.typed can be used + * too and takes a dword offset.. not quite sure yet why blob uses + * one over the other in various cases. + */ + + stib = ir3_STIB(b, create_immed(b, tex_idx), 0, + create_collect(ctx, value, 4), 0, + create_collect(ctx, coords, ncoords), 0, + offset, 0); + stib->cat6.iim_val = 4; + stib->cat6.d = ncoords; + stib->cat6.type = get_image_type(var); + stib->cat6.typed = true; + stib->barrier_class = IR3_BARRIER_IMAGE_W; + stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W; + + array_insert(b, b->keeps, stib); +} + +static void +emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + struct ir3_block *b = ctx->block; + const nir_variable *var = nir_intrinsic_get_var(intr, 0); + unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0])); + struct ir3_instruction *sam, *lod; + unsigned flags, ncoords = get_image_coords(var, &flags); + + lod = create_immed(b, 0); + sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, 0b1111, flags, + tex_idx, tex_idx, lod, NULL); + + /* Array size actually ends up in .w rather than .z. This doesn't + * matter for miplevel 0, but for higher mips the value in z is + * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is + * returned, which means that we have to add 1 to it for arrays for + * a3xx. 
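+	 *
+	 * eg. (a sketch, assuming a 2DArray image): getsize returns
+	 * roughly (w, h, minified_layers, layers), and since we want
+	 * the layer count it is taken from .w rather than .z (plus one
+	 * on gens where levels_add_one is set, see below).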
+ * + * Note use a temporary dst and then copy, since the size of the dst + * array that is passed in is based on nir's understanding of the + * result size, not the hardware's + */ + struct ir3_instruction *tmp[4]; + + split_dest(b, tmp, sam, 0, 4); + + /* get_size instruction returns size in bytes instead of texels + * for imageBuffer, so we need to divide it by the pixel size + * of the image format. + * + * TODO: This is at least true on a5xx. Check other gens. + */ + enum glsl_sampler_dim dim = + glsl_get_sampler_dim(glsl_without_array(var->type)); + if (dim == GLSL_SAMPLER_DIM_BUF) { + /* Since all the possible values the divisor can take are + * power-of-two (4, 8, or 16), the division is implemented + * as a shift-right. + * During shader setup, the log2 of the image format's + * bytes-per-pixel should have been emitted in 2nd slot of + * image_dims. See ir3_shader::emit_image_dims(). + */ + unsigned cb = regid(ctx->so->constbase.image_dims, 0) + + ctx->so->const_layout.image_dims.off[var->data.driver_location]; + struct ir3_instruction *aux = create_uniform(ctx, cb + 1); + + tmp[0] = ir3_SHR_B(b, tmp[0], 0, aux, 0); + } + + for (unsigned i = 0; i < ncoords; i++) + dst[i] = tmp[i]; + + if (flags & IR3_INSTR_A) { + if (ctx->compiler->levels_add_one) { + dst[ncoords-1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0); + } else { + dst[ncoords-1] = ir3_MOV(b, tmp[3], TYPE_U32); + } + } +} + +/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */ +static struct ir3_instruction * +emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + const nir_variable *var = nir_intrinsic_get_var(intr, 0); + struct ir3_instruction *atomic, *image, *src0, *src1, *src2; + struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]); + unsigned ncoords = get_image_coords(var, NULL); + + image = create_immed(b, get_image_slot(ctx, nir_src_as_deref(intr->src[0]))); + + /* src0 is value (or uvec2(value, compare)) + * src1 is coords + * src2 is 64b byte offset + */ + src0 = get_src(ctx, &intr->src[3])[0]; + src1 = create_collect(ctx, coords, ncoords); + src2 = get_image_offset(ctx, var, coords, false); + + switch (intr->intrinsic) { + case nir_intrinsic_image_deref_atomic_add: + atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_image_deref_atomic_min: + atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_image_deref_atomic_max: + atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_image_deref_atomic_and: + atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_image_deref_atomic_or: + atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_image_deref_atomic_xor: + atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_image_deref_atomic_exchange: + atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_image_deref_atomic_comp_swap: + /* for cmpxchg, src0 is [ui]vec2(data, compare): */ + src0 = create_collect(ctx, (struct ir3_instruction*[]){ + get_src(ctx, &intr->src[4])[0], + src0, + }, 2); + atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + default: + unreachable("boo"); + } + + atomic->cat6.iim_val = 1; + atomic->cat6.d = ncoords; + atomic->cat6.type = get_image_type(var); + 
atomic->cat6.typed = true;
+	atomic->barrier_class = IR3_BARRIER_IMAGE_W;
+	atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
+
+	/* even if nothing consumes the result, we can't DCE the instruction: */
+	array_insert(b, b->keeps, atomic);
+
+	return atomic;
+}
+
+static void
+emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *barrier;
+
+	switch (intr->intrinsic) {
+	case nir_intrinsic_barrier:
+		barrier = ir3_BAR(b);
+		barrier->cat7.g = true;
+		barrier->cat7.l = true;
+		barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
+		barrier->barrier_class = IR3_BARRIER_EVERYTHING;
+		break;
+	case nir_intrinsic_memory_barrier:
+		barrier = ir3_FENCE(b);
+		barrier->cat7.g = true;
+		barrier->cat7.r = true;
+		barrier->cat7.w = true;
+		barrier->barrier_class = IR3_BARRIER_IMAGE_W |
+			IR3_BARRIER_BUFFER_W;
+		barrier->barrier_conflict =
+			IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
+			IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+		break;
+	case nir_intrinsic_memory_barrier_atomic_counter:
+	case nir_intrinsic_memory_barrier_buffer:
+		barrier = ir3_FENCE(b);
+		barrier->cat7.g = true;
+		barrier->cat7.r = true;
+		barrier->cat7.w = true;
+		barrier->barrier_class = IR3_BARRIER_BUFFER_W;
+		barrier->barrier_conflict = IR3_BARRIER_BUFFER_R |
+			IR3_BARRIER_BUFFER_W;
+		break;
+	case nir_intrinsic_memory_barrier_image:
+		// TODO: double-check if this should have .g set
+		barrier = ir3_FENCE(b);
+		barrier->cat7.g = true;
+		barrier->cat7.r = true;
+		barrier->cat7.w = true;
+		barrier->barrier_class = IR3_BARRIER_IMAGE_W;
+		barrier->barrier_conflict = IR3_BARRIER_IMAGE_R |
+			IR3_BARRIER_IMAGE_W;
+		break;
+	case nir_intrinsic_memory_barrier_shared:
+		barrier = ir3_FENCE(b);
+		barrier->cat7.g = true;
+		barrier->cat7.l = true;
+		barrier->cat7.r = true;
+		barrier->cat7.w = true;
+		barrier->barrier_class = IR3_BARRIER_SHARED_W;
+		barrier->barrier_conflict = IR3_BARRIER_SHARED_R |
+			IR3_BARRIER_SHARED_W;
+		break;
+	case nir_intrinsic_group_memory_barrier:
+		barrier = ir3_FENCE(b);
+		barrier->cat7.g = true;
+		barrier->cat7.l = true;
+		barrier->cat7.r = true;
+		barrier->cat7.w = true;
+		barrier->barrier_class = IR3_BARRIER_SHARED_W |
+			IR3_BARRIER_IMAGE_W |
+			IR3_BARRIER_BUFFER_W;
+		barrier->barrier_conflict =
+			IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
+			IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
+			IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+		break;
+	default:
+		unreachable("boo");
+	}
+
+	/* make sure barrier doesn't get DCE'd */
+	array_insert(b, b->keeps, barrier);
+}
+
+static void add_sysval_input_compmask(struct ir3_context *ctx,
+		gl_system_value slot, unsigned compmask,
+		struct ir3_instruction *instr)
+{
+	struct ir3_shader_variant *so = ctx->so;
+	unsigned r = regid(so->inputs_count, 0);
+	unsigned n = so->inputs_count++;
+
+	so->inputs[n].sysval = true;
+	so->inputs[n].slot = slot;
+	so->inputs[n].compmask = compmask;
+	so->inputs[n].regid = r;
+	so->inputs[n].interpolate = INTERP_MODE_FLAT;
+	so->total_in++;
+
+	ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
+	ctx->ir->inputs[r] = instr;
+}
+
+static void add_sysval_input(struct ir3_context *ctx, gl_system_value slot,
+		struct ir3_instruction *instr)
+{
+	add_sysval_input_compmask(ctx, slot, 0x1, instr);
+}
+
+static void
+emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
+	struct ir3_instruction **dst;
+	struct ir3_instruction * const *src;
+	struct ir3_block *b =
ctx->block; + nir_const_value *const_offset; + int idx, comp; + + if (info->has_dest) { + unsigned n = nir_intrinsic_dest_components(intr); + dst = get_dst(ctx, &intr->dest, n); + } else { + dst = NULL; + } + + switch (intr->intrinsic) { + case nir_intrinsic_load_uniform: + idx = nir_intrinsic_base(intr); + const_offset = nir_src_as_const_value(intr->src[0]); + if (const_offset) { + idx += const_offset->u32[0]; + for (int i = 0; i < intr->num_components; i++) { + unsigned n = idx * 4 + i; + dst[i] = create_uniform(ctx, n); + } + } else { + src = get_src(ctx, &intr->src[0]); + for (int i = 0; i < intr->num_components; i++) { + int n = idx * 4 + i; + dst[i] = create_uniform_indirect(ctx, n, + get_addr(ctx, src[0], 4)); + } + /* NOTE: if relative addressing is used, we set + * constlen in the compiler (to worst-case value) + * since we don't know in the assembler what the max + * addr reg value can be: + */ + ctx->so->constlen = ctx->s->num_uniforms; + } + break; + case nir_intrinsic_load_ubo: + emit_intrinsic_load_ubo(ctx, intr, dst); + break; + case nir_intrinsic_load_input: + idx = nir_intrinsic_base(intr); + comp = nir_intrinsic_component(intr); + const_offset = nir_src_as_const_value(intr->src[0]); + if (const_offset) { + idx += const_offset->u32[0]; + for (int i = 0; i < intr->num_components; i++) { + unsigned n = idx * 4 + i + comp; + dst[i] = ctx->ir->inputs[n]; + } + } else { + src = get_src(ctx, &intr->src[0]); + struct ir3_instruction *collect = + create_collect(ctx, ctx->ir->inputs, ctx->ir->ninputs); + struct ir3_instruction *addr = get_addr(ctx, src[0], 4); + for (int i = 0; i < intr->num_components; i++) { + unsigned n = idx * 4 + i + comp; + dst[i] = create_indirect_load(ctx, ctx->ir->ninputs, + n, addr, collect); + } + } + break; + case nir_intrinsic_load_ssbo: + emit_intrinsic_load_ssbo(ctx, intr, dst); + break; + case nir_intrinsic_store_ssbo: + emit_intrinsic_store_ssbo(ctx, intr); + break; + case nir_intrinsic_get_buffer_size: + emit_intrinsic_ssbo_size(ctx, intr, dst); + break; + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + dst[0] = emit_intrinsic_atomic_ssbo(ctx, intr); + break; + case nir_intrinsic_load_shared: + emit_intrinsic_load_shared(ctx, intr, dst); + break; + case nir_intrinsic_store_shared: + emit_intrinsic_store_shared(ctx, intr); + break; + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_comp_swap: + dst[0] = emit_intrinsic_atomic_shared(ctx, intr); + break; + case nir_intrinsic_image_deref_load: + emit_intrinsic_load_image(ctx, intr, dst); + break; + case nir_intrinsic_image_deref_store: + emit_intrinsic_store_image(ctx, intr); + break; + case nir_intrinsic_image_deref_size: + emit_intrinsic_image_size(ctx, intr, dst); + break; + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_min: + case nir_intrinsic_image_deref_atomic_max: + case 
nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + dst[0] = emit_intrinsic_atomic_image(ctx, intr); + break; + case nir_intrinsic_barrier: + case nir_intrinsic_memory_barrier: + case nir_intrinsic_group_memory_barrier: + case nir_intrinsic_memory_barrier_atomic_counter: + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: + case nir_intrinsic_memory_barrier_shared: + emit_intrinsic_barrier(ctx, intr); + /* note that blk ptr no longer valid, make that obvious: */ + b = NULL; + break; + case nir_intrinsic_store_output: + idx = nir_intrinsic_base(intr); + comp = nir_intrinsic_component(intr); + const_offset = nir_src_as_const_value(intr->src[1]); + compile_assert(ctx, const_offset != NULL); + idx += const_offset->u32[0]; + + src = get_src(ctx, &intr->src[0]); + for (int i = 0; i < intr->num_components; i++) { + unsigned n = idx * 4 + i + comp; + ctx->ir->outputs[n] = src[i]; + } + break; + case nir_intrinsic_load_base_vertex: + case nir_intrinsic_load_first_vertex: + if (!ctx->basevertex) { + ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE); + add_sysval_input(ctx, SYSTEM_VALUE_FIRST_VERTEX, ctx->basevertex); + } + dst[0] = ctx->basevertex; + break; + case nir_intrinsic_load_vertex_id_zero_base: + case nir_intrinsic_load_vertex_id: + if (!ctx->vertex_id) { + gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id) ? + SYSTEM_VALUE_VERTEX_ID : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE; + ctx->vertex_id = create_input(ctx, 0); + add_sysval_input(ctx, sv, ctx->vertex_id); + } + dst[0] = ctx->vertex_id; + break; + case nir_intrinsic_load_instance_id: + if (!ctx->instance_id) { + ctx->instance_id = create_input(ctx, 0); + add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, + ctx->instance_id); + } + dst[0] = ctx->instance_id; + break; + case nir_intrinsic_load_sample_id: + case nir_intrinsic_load_sample_id_no_per_sample: + if (!ctx->samp_id) { + ctx->samp_id = create_input(ctx, 0); + ctx->samp_id->regs[0]->flags |= IR3_REG_HALF; + add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID, + ctx->samp_id); + } + dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32); + break; + case nir_intrinsic_load_sample_mask_in: + if (!ctx->samp_mask_in) { + ctx->samp_mask_in = create_input(ctx, 0); + add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, + ctx->samp_mask_in); + } + dst[0] = ctx->samp_mask_in; + break; + case nir_intrinsic_load_user_clip_plane: + idx = nir_intrinsic_ucp_id(intr); + for (int i = 0; i < intr->num_components; i++) { + unsigned n = idx * 4 + i; + dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n); + } + break; + case nir_intrinsic_load_front_face: + if (!ctx->frag_face) { + ctx->so->frag_face = true; + ctx->frag_face = create_input(ctx, 0); + add_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, ctx->frag_face); + ctx->frag_face->regs[0]->flags |= IR3_REG_HALF; + } + /* for fragface, we get -1 for back and 0 for front. However this is + * the inverse of what nir expects (where ~0 is true). 
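+		 *
+		 * ie. roughly, the two instructions below compute:
+		 *
+		 *    face = (int32)(int16)raw;   // cov.s16s32: -1=back, 0=front
+		 *    face = ~face;               // not.b:       0=back, ~0=front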
+ */ + dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32); + dst[0] = ir3_NOT_B(b, dst[0], 0); + break; + case nir_intrinsic_load_local_invocation_id: + if (!ctx->local_invocation_id) { + ctx->local_invocation_id = create_input_compmask(ctx, 0, 0x7); + add_sysval_input_compmask(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID, + 0x7, ctx->local_invocation_id); + } + split_dest(b, dst, ctx->local_invocation_id, 0, 3); + break; + case nir_intrinsic_load_work_group_id: + if (!ctx->work_group_id) { + ctx->work_group_id = create_input_compmask(ctx, 0, 0x7); + add_sysval_input_compmask(ctx, SYSTEM_VALUE_WORK_GROUP_ID, + 0x7, ctx->work_group_id); + ctx->work_group_id->regs[0]->flags |= IR3_REG_HIGH; + } + split_dest(b, dst, ctx->work_group_id, 0, 3); + break; + case nir_intrinsic_load_num_work_groups: + for (int i = 0; i < intr->num_components; i++) { + dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i); + } + break; + case nir_intrinsic_load_local_group_size: + for (int i = 0; i < intr->num_components; i++) { + dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i); + } + break; + case nir_intrinsic_discard_if: + case nir_intrinsic_discard: { + struct ir3_instruction *cond, *kill; + + if (intr->intrinsic == nir_intrinsic_discard_if) { + /* conditional discard: */ + src = get_src(ctx, &intr->src[0]); + cond = ir3_b2n(b, src[0]); + } else { + /* unconditional discard: */ + cond = create_immed(b, 1); + } + + /* NOTE: only cmps.*.* can write p0.x: */ + cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0); + cond->cat2.condition = IR3_COND_NE; + + /* condition always goes in predicate register: */ + cond->regs[0]->num = regid(REG_P0, 0); + + kill = ir3_KILL(b, cond, 0); + array_insert(ctx->ir, ctx->ir->predicates, kill); + + array_insert(b, b->keeps, kill); + ctx->so->has_kill = true; + + break; + } + default: + compile_error(ctx, "Unhandled intrinsic type: %s\n", + nir_intrinsic_infos[intr->intrinsic].name); + break; + } + + if (info->has_dest) + put_dst(ctx, &intr->dest); +} + +static void +emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr) +{ + struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def, + instr->def.num_components); + type_t type = (instr->def.bit_size < 32) ? TYPE_U16 : TYPE_U32; + + for (int i = 0; i < instr->def.num_components; i++) + dst[i] = create_immed_typed(ctx->block, instr->value.u32[i], type); +} + +static void +emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef) +{ + struct ir3_instruction **dst = get_dst_ssa(ctx, &undef->def, + undef->def.num_components); + type_t type = (undef->def.bit_size < 32) ? TYPE_U16 : TYPE_U32; + + /* backend doesn't want undefined instructions, so just plug + * in 0.0.. + */ + for (int i = 0; i < undef->def.num_components; i++) + dst[i] = create_immed_typed(ctx->block, fui(0.0), type); +} + +/* + * texture fetch/sample instructions: + */ + +static void +tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp) +{ + unsigned coords, flags = 0; + + /* note: would use tex->coord_components.. except txs.. 
also, + * since array index goes after shadow ref, we don't want to + * count it: + */ + switch (tex->sampler_dim) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_BUF: + coords = 1; + break; + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_EXTERNAL: + case GLSL_SAMPLER_DIM_MS: + coords = 2; + break; + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + coords = 3; + flags |= IR3_INSTR_3D; + break; + default: + unreachable("bad sampler_dim"); + } + + if (tex->is_shadow && tex->op != nir_texop_lod) + flags |= IR3_INSTR_S; + + if (tex->is_array && tex->op != nir_texop_lod) + flags |= IR3_INSTR_A; + + *flagsp = flags; + *coordsp = coords; +} + +static void +emit_tex(struct ir3_context *ctx, nir_tex_instr *tex) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction **dst, *sam, *src0[12], *src1[4]; + struct ir3_instruction * const *coord, * const *off, * const *ddx, * const *ddy; + struct ir3_instruction *lod, *compare, *proj, *sample_index; + bool has_bias = false, has_lod = false, has_proj = false, has_off = false; + unsigned i, coords, flags; + unsigned nsrc0 = 0, nsrc1 = 0; + type_t type; + opc_t opc = 0; + + coord = off = ddx = ddy = NULL; + lod = proj = compare = sample_index = NULL; + + /* TODO: might just be one component for gathers? */ + dst = get_dst(ctx, &tex->dest, 4); + + for (unsigned i = 0; i < tex->num_srcs; i++) { + switch (tex->src[i].src_type) { + case nir_tex_src_coord: + coord = get_src(ctx, &tex->src[i].src); + break; + case nir_tex_src_bias: + lod = get_src(ctx, &tex->src[i].src)[0]; + has_bias = true; + break; + case nir_tex_src_lod: + lod = get_src(ctx, &tex->src[i].src)[0]; + has_lod = true; + break; + case nir_tex_src_comparator: /* shadow comparator */ + compare = get_src(ctx, &tex->src[i].src)[0]; + break; + case nir_tex_src_projector: + proj = get_src(ctx, &tex->src[i].src)[0]; + has_proj = true; + break; + case nir_tex_src_offset: + off = get_src(ctx, &tex->src[i].src); + has_off = true; + break; + case nir_tex_src_ddx: + ddx = get_src(ctx, &tex->src[i].src); + break; + case nir_tex_src_ddy: + ddy = get_src(ctx, &tex->src[i].src); + break; + case nir_tex_src_ms_index: + sample_index = get_src(ctx, &tex->src[i].src)[0]; + break; + default: + compile_error(ctx, "Unhandled NIR tex src type: %d\n", + tex->src[i].src_type); + return; + } + } + + switch (tex->op) { + case nir_texop_tex: opc = has_lod ? OPC_SAML : OPC_SAM; break; + case nir_texop_txb: opc = OPC_SAMB; break; + case nir_texop_txl: opc = OPC_SAML; break; + case nir_texop_txd: opc = OPC_SAMGQ; break; + case nir_texop_txf: opc = OPC_ISAML; break; + case nir_texop_lod: opc = OPC_GETLOD; break; + case nir_texop_tg4: + /* NOTE: a4xx might need to emulate gather w/ txf (this is + * what blob does, seems gather is broken?), and a3xx did + * not support it (but probably could also emulate). 
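+		 *
+		 * (The component select just picks which channel gets
+		 * gathered, eg. textureGather(s, p, 1) maps to gather4g
+		 * below.)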
+ */ + switch (tex->component) { + case 0: opc = OPC_GATHER4R; break; + case 1: opc = OPC_GATHER4G; break; + case 2: opc = OPC_GATHER4B; break; + case 3: opc = OPC_GATHER4A; break; + } + break; + case nir_texop_txf_ms: opc = OPC_ISAMM; break; + case nir_texop_txs: + case nir_texop_query_levels: + case nir_texop_texture_samples: + case nir_texop_samples_identical: + case nir_texop_txf_ms_mcs: + compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op); + return; + } + + tex_info(tex, &flags, &coords); + + /* + * lay out the first argument in the proper order: + * - actual coordinates first + * - shadow reference + * - array index + * - projection w + * - starting at offset 4, dpdx.xy, dpdy.xy + * + * bias/lod go into the second arg + */ + + /* insert tex coords: */ + for (i = 0; i < coords; i++) + src0[i] = coord[i]; + + nsrc0 = i; + + /* NOTE a3xx (and possibly a4xx?) might be different, using isaml + * with scaled x coord according to requested sample: + */ + if (tex->op == nir_texop_txf_ms) { + if (ctx->compiler->txf_ms_with_isaml) { + /* the samples are laid out in x dimension as + * 0 1 2 3 + * x_ms = (x << ms) + sample_index; + */ + struct ir3_instruction *ms; + ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3); + + src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0); + src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0); + + opc = OPC_ISAML; + } else { + src0[nsrc0++] = sample_index; + } + } + + /* scale up integer coords for TXF based on the LOD */ + if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) { + assert(has_lod); + for (i = 0; i < coords; i++) + src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0); + } + + if (coords == 1) { + /* hw doesn't do 1d, so we treat it as 2d with + * height of 1, and patch up the y coord. + * TODO: y coord should be (int)0 in some cases.. 
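+		 *
+		 * ie. conceptually, texture(sampler1D, x) is emitted as
+		 * if it were texture(sampler2D, vec2(x, 0.5)):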
+ */ + src0[nsrc0++] = create_immed(b, fui(0.5)); + } + + if (tex->is_shadow && tex->op != nir_texop_lod) + src0[nsrc0++] = compare; + + if (tex->is_array && tex->op != nir_texop_lod) { + struct ir3_instruction *idx = coord[coords]; + + /* the array coord for cube arrays needs 0.5 added to it */ + if (ctx->compiler->array_index_add_half && (opc != OPC_ISAML)) + idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0); + + src0[nsrc0++] = idx; + } + + if (has_proj) { + src0[nsrc0++] = proj; + flags |= IR3_INSTR_P; + } + + /* pad to 4, then ddx/ddy: */ + if (tex->op == nir_texop_txd) { + while (nsrc0 < 4) + src0[nsrc0++] = create_immed(b, fui(0.0)); + for (i = 0; i < coords; i++) + src0[nsrc0++] = ddx[i]; + if (coords < 2) + src0[nsrc0++] = create_immed(b, fui(0.0)); + for (i = 0; i < coords; i++) + src0[nsrc0++] = ddy[i]; + if (coords < 2) + src0[nsrc0++] = create_immed(b, fui(0.0)); + } + + /* + * second argument (if applicable): + * - offsets + * - lod + * - bias + */ + if (has_off | has_lod | has_bias) { + if (has_off) { + unsigned off_coords = coords; + if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) + off_coords--; + for (i = 0; i < off_coords; i++) + src1[nsrc1++] = off[i]; + if (off_coords < 2) + src1[nsrc1++] = create_immed(b, fui(0.0)); + flags |= IR3_INSTR_O; + } + + if (has_lod | has_bias) + src1[nsrc1++] = lod; + } + + switch (tex->dest_type) { + case nir_type_invalid: + case nir_type_float: + type = TYPE_F32; + break; + case nir_type_int: + type = TYPE_S32; + break; + case nir_type_uint: + case nir_type_bool: + type = TYPE_U32; + break; + default: + unreachable("bad dest_type"); + } + + if (opc == OPC_GETLOD) + type = TYPE_U32; + + unsigned tex_idx = tex->texture_index; + + ctx->max_texture_index = MAX2(ctx->max_texture_index, tex_idx); + + struct ir3_instruction *col0 = create_collect(ctx, src0, nsrc0); + struct ir3_instruction *col1 = create_collect(ctx, src1, nsrc1); + + sam = ir3_SAM(b, opc, type, 0b1111, flags, + tex_idx, tex_idx, col0, col1); + + if ((ctx->astc_srgb & (1 << tex_idx)) && !nir_tex_instr_is_query(tex)) { + /* only need first 3 components: */ + sam->regs[0]->wrmask = 0x7; + split_dest(b, dst, sam, 0, 3); + + /* we need to sample the alpha separately with a non-ASTC + * texture state: + */ + sam = ir3_SAM(b, opc, type, 0b1000, flags, + tex_idx, tex_idx, col0, col1); + + array_insert(ctx->ir, ctx->ir->astc_srgb, sam); + + /* fixup .w component: */ + split_dest(b, &dst[3], sam, 3, 1); + } else { + /* normal (non-workaround) case: */ + split_dest(b, dst, sam, 0, 4); + } + + /* GETLOD returns results in 4.8 fixed point */ + if (opc == OPC_GETLOD) { + struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256)); + + compile_assert(ctx, tex->dest_type == nir_type_float); + for (i = 0; i < 2; i++) { + dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_U32, TYPE_F32), 0, + factor, 0); + } + } + + put_dst(ctx, &tex->dest); +} + +static void +emit_tex_query_levels(struct ir3_context *ctx, nir_tex_instr *tex) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction **dst, *sam; + + dst = get_dst(ctx, &tex->dest, 1); + + sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, 0b0100, 0, + tex->texture_index, tex->texture_index, NULL, NULL); + + /* even though there is only one component, since it ends + * up in .z rather than .x, we need a split_dest() + */ + split_dest(b, dst, sam, 0, 3); + + /* The # of levels comes from getinfo.z. We need to add 1 to it, since + * the value in TEX_CONST_0 is zero-based. 
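+	 *
+	 * ie. a texture with N mip levels reads back N-1 from getinfo.z,
+	 * so (on gens where levels_add_one is set) the result below is
+	 * computed roughly as:
+	 *
+	 *    levels = getinfo.z + 1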
+ */ + if (ctx->compiler->levels_add_one) + dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0); + + put_dst(ctx, &tex->dest); +} + +static void +emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction **dst, *sam; + struct ir3_instruction *lod; + unsigned flags, coords; + + tex_info(tex, &flags, &coords); + + /* Actually we want the number of dimensions, not coordinates. This + * distinction only matters for cubes. + */ + if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) + coords = 2; + + dst = get_dst(ctx, &tex->dest, 4); + + compile_assert(ctx, tex->num_srcs == 1); + compile_assert(ctx, tex->src[0].src_type == nir_tex_src_lod); + + lod = get_src(ctx, &tex->src[0].src)[0]; + + sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, 0b1111, flags, + tex->texture_index, tex->texture_index, lod, NULL); + + split_dest(b, dst, sam, 0, 4); + + /* Array size actually ends up in .w rather than .z. This doesn't + * matter for miplevel 0, but for higher mips the value in z is + * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is + * returned, which means that we have to add 1 to it for arrays. + */ + if (tex->is_array) { + if (ctx->compiler->levels_add_one) { + dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0); + } else { + dst[coords] = ir3_MOV(b, dst[3], TYPE_U32); + } + } + + put_dst(ctx, &tex->dest); +} + +static void +emit_jump(struct ir3_context *ctx, nir_jump_instr *jump) +{ + switch (jump->type) { + case nir_jump_break: + case nir_jump_continue: + case nir_jump_return: + /* I *think* we can simply just ignore this, and use the + * successor block link to figure out where we need to + * jump to for break/continue + */ + break; + default: + compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type); + break; + } +} + +static void +emit_instr(struct ir3_context *ctx, nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_alu: + emit_alu(ctx, nir_instr_as_alu(instr)); + break; + case nir_instr_type_deref: + /* ignored, handled as part of the intrinsic they are src to */ + break; + case nir_instr_type_intrinsic: + emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_load_const: + emit_load_const(ctx, nir_instr_as_load_const(instr)); + break; + case nir_instr_type_ssa_undef: + emit_undef(ctx, nir_instr_as_ssa_undef(instr)); + break; + case nir_instr_type_tex: { + nir_tex_instr *tex = nir_instr_as_tex(instr); + /* couple tex instructions get special-cased: + */ + switch (tex->op) { + case nir_texop_txs: + emit_tex_txs(ctx, tex); + break; + case nir_texop_query_levels: + emit_tex_query_levels(ctx, tex); + break; + default: + emit_tex(ctx, tex); + break; + } + break; + } + case nir_instr_type_jump: + emit_jump(ctx, nir_instr_as_jump(instr)); + break; + case nir_instr_type_phi: + /* we have converted phi webs to regs in NIR by now */ + compile_error(ctx, "Unexpected NIR instruction type: %d\n", instr->type); + break; + case nir_instr_type_call: + case nir_instr_type_parallel_copy: + compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type); + break; + } +} + +static struct ir3_block * +get_block(struct ir3_context *ctx, const nir_block *nblock) +{ + struct ir3_block *block; + struct hash_entry *hentry; + unsigned i; + + hentry = _mesa_hash_table_search(ctx->block_ht, nblock); + if (hentry) + return hentry->data; + + block = ir3_block_create(ctx->ir); + block->nblock = nblock; + _mesa_hash_table_insert(ctx->block_ht, nblock, block); + + block->predecessors_count 
= nblock->predecessors->entries; + block->predecessors = ralloc_array_size(block, + sizeof(block->predecessors[0]), block->predecessors_count); + i = 0; + set_foreach(nblock->predecessors, sentry) { + block->predecessors[i++] = get_block(ctx, sentry->key); + } + + return block; +} + +static void +emit_block(struct ir3_context *ctx, nir_block *nblock) +{ + struct ir3_block *block = get_block(ctx, nblock); + + for (int i = 0; i < ARRAY_SIZE(block->successors); i++) { + if (nblock->successors[i]) { + block->successors[i] = + get_block(ctx, nblock->successors[i]); + } + } + + ctx->block = block; + list_addtail(&block->node, &ctx->ir->block_list); + + /* re-emit addr register in each block if needed: */ + for (int i = 0; i < ARRAY_SIZE(ctx->addr_ht); i++) { + _mesa_hash_table_destroy(ctx->addr_ht[i], NULL); + ctx->addr_ht[i] = NULL; + } + + nir_foreach_instr(instr, nblock) { + ctx->cur_instr = instr; + emit_instr(ctx, instr); + ctx->cur_instr = NULL; + if (ctx->error) + return; + } +} + +static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list); + +static void +emit_if(struct ir3_context *ctx, nir_if *nif) +{ + struct ir3_instruction *condition = get_src(ctx, &nif->condition)[0]; + + ctx->block->condition = + get_predicate(ctx, ir3_b2n(condition->block, condition)); + + emit_cf_list(ctx, &nif->then_list); + emit_cf_list(ctx, &nif->else_list); +} + +static void +emit_loop(struct ir3_context *ctx, nir_loop *nloop) +{ + emit_cf_list(ctx, &nloop->body); +} + +static void +emit_cf_list(struct ir3_context *ctx, struct exec_list *list) +{ + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: + emit_block(ctx, nir_cf_node_as_block(node)); + break; + case nir_cf_node_if: + emit_if(ctx, nir_cf_node_as_if(node)); + break; + case nir_cf_node_loop: + emit_loop(ctx, nir_cf_node_as_loop(node)); + break; + case nir_cf_node_function: + compile_error(ctx, "TODO\n"); + break; + } + } +} + +/* emit stream-out code. At this point, the current block is the original + * (nir) end block, and nir ensures that all flow control paths terminate + * into the end block. We re-purpose the original end block to generate + * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional + * block holding stream-out write instructions, followed by the new end + * block: + * + * blockOrigEnd { + * p0.x = (vtxcnt < maxvtxcnt) + * // succs: blockStreamOut, blockNewEnd + * } + * blockStreamOut { + * ... stream-out instructions ... + * // succs: blockNewEnd + * } + * blockNewEnd { + * } + */ +static void +emit_stream_out(struct ir3_context *ctx) +{ + struct ir3_shader_variant *v = ctx->so; + struct ir3 *ir = ctx->ir; + struct ir3_stream_output_info *strmout = + &ctx->so->shader->stream_output; + struct ir3_block *orig_end_block, *stream_out_block, *new_end_block; + struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond; + struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS]; + + /* create vtxcnt input in input block at top of shader, + * so that it is seen as live over the entire duration + * of the shader: + */ + vtxcnt = create_input(ctx, 0); + add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, vtxcnt); + + maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX); + + /* at this point, we are at the original 'end' block, + * re-purpose this block to stream-out condition, then + * append stream-out block and new-end block + */ + orig_end_block = ctx->block; + +// TODO these blocks need to update predecessors.. 
+// maybe w/ store_global intrinsic, we could do this +// stuff in nir->nir pass + + stream_out_block = ir3_block_create(ir); + list_addtail(&stream_out_block->node, &ir->block_list); + + new_end_block = ir3_block_create(ir); + list_addtail(&new_end_block->node, &ir->block_list); + + orig_end_block->successors[0] = stream_out_block; + orig_end_block->successors[1] = new_end_block; + stream_out_block->successors[0] = new_end_block; + + /* setup 'if (vtxcnt < maxvtxcnt)' condition: */ + cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0); + cond->regs[0]->num = regid(REG_P0, 0); + cond->cat2.condition = IR3_COND_LT; + + /* condition goes on previous block to the conditional, + * since it is used to pick which of the two successor + * paths to take: + */ + orig_end_block->condition = cond; + + /* switch to stream_out_block to generate the stream-out + * instructions: + */ + ctx->block = stream_out_block; + + /* Calculate base addresses based on vtxcnt. Instructions + * generated for bases not used in following loop will be + * stripped out in the backend. + */ + for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) { + unsigned stride = strmout->stride[i]; + struct ir3_instruction *base, *off; + + base = create_uniform(ctx, regid(v->constbase.tfbo, i)); + + /* 24-bit should be enough: */ + off = ir3_MUL_U(ctx->block, vtxcnt, 0, + create_immed(ctx->block, stride * 4), 0); + + bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0); + } + + /* Generate the per-output store instructions: */ + for (unsigned i = 0; i < strmout->num_outputs; i++) { + for (unsigned j = 0; j < strmout->output[i].num_components; j++) { + unsigned c = j + strmout->output[i].start_component; + struct ir3_instruction *base, *out, *stg; + + base = bases[strmout->output[i].output_buffer]; + out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)]; + + stg = ir3_STG(ctx->block, base, 0, out, 0, + create_immed(ctx->block, 1), 0); + stg->cat6.type = TYPE_U32; + stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4; + + array_insert(ctx->block, ctx->block->keeps, stg); + } + } + + /* and finally switch to the new_end_block: */ + ctx->block = new_end_block; +} + +static void +emit_function(struct ir3_context *ctx, nir_function_impl *impl) +{ + nir_metadata_require(impl, nir_metadata_block_index); + + emit_cf_list(ctx, &impl->body); + emit_block(ctx, impl->end_block); + + /* at this point, we should have a single empty block, + * into which we emit the 'end' instruction. + */ + compile_assert(ctx, list_empty(&ctx->block->instr_list)); + + /* If stream-out (aka transform-feedback) enabled, emit the + * stream-out instructions, followed by a new empty block (into + * which the 'end' instruction lands). + * + * NOTE: it is done in this order, rather than inserting before + * we emit end_block, because NIR guarantees that all blocks + * flow into end_block, and that end_block has no successors. + * So by re-purposing end_block as the first block of stream- + * out, we guarantee that all exit paths flow into the stream- + * out instructions. 
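+	 *
+	 * (ie. we end up with the CFG sketched above emit_stream_out():
+	 * blockOrigEnd -> { blockStreamOut, blockNewEnd }, with the 'end'
+	 * instruction landing in blockNewEnd.)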
+ */ + if ((ctx->compiler->gpu_id < 500) && + (ctx->so->shader->stream_output.num_outputs > 0) && + !ctx->so->binning_pass) { + debug_assert(ctx->so->type == MESA_SHADER_VERTEX); + emit_stream_out(ctx); + } + + ir3_END(ctx->block); +} + +static struct ir3_instruction * +create_frag_coord(struct ir3_context *ctx, unsigned comp) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *instr; + + if (!ctx->frag_coord) { + ctx->frag_coord = create_input_compmask(ctx, 0, 0xf); + /* defer add_sysval_input() until after all inputs created */ + } + + split_dest(block, &instr, ctx->frag_coord, comp, 1); + + switch (comp) { + case 0: /* .x */ + case 1: /* .y */ + /* for frag_coord, we get unsigned values.. we need + * to subtract (integer) 8 and divide by 16 (right- + * shift by 4) then convert to float: + * + * sub.s tmp, src, 8 + * shr.b tmp, tmp, 4 + * mov.u32f32 dst, tmp + * + */ + instr = ir3_SUB_S(block, instr, 0, + create_immed(block, 8), 0); + instr = ir3_SHR_B(block, instr, 0, + create_immed(block, 4), 0); + instr = ir3_COV(block, instr, TYPE_U32, TYPE_F32); + + return instr; + case 2: /* .z */ + case 3: /* .w */ + default: + /* seems that we can use these as-is: */ + return instr; + } +} + +static void +setup_input(struct ir3_context *ctx, nir_variable *in) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned ncomp = glsl_get_components(in->type); + unsigned n = in->data.driver_location; + unsigned slot = in->data.location; + + /* let's pretend things other than vec4 don't exist: */ + ncomp = MAX2(ncomp, 4); + + /* skip unread inputs, we could end up with (for example), unsplit + * matrix/etc inputs in the case they are not read, so just silently + * skip these. + */ + if (ncomp > 4) + return; + + compile_assert(ctx, ncomp == 4); + + so->inputs[n].slot = slot; + so->inputs[n].compmask = (1 << ncomp) - 1; + so->inputs_count = MAX2(so->inputs_count, n + 1); + so->inputs[n].interpolate = in->data.interpolation; + + if (ctx->so->type == MESA_SHADER_FRAGMENT) { + for (int i = 0; i < ncomp; i++) { + struct ir3_instruction *instr = NULL; + unsigned idx = (n * 4) + i; + + if (slot == VARYING_SLOT_POS) { + so->inputs[n].bary = false; + so->frag_coord = true; + instr = create_frag_coord(ctx, i); + } else if (slot == VARYING_SLOT_PNTC) { + /* see for example st_nir_fixup_varying_slots().. this is + * maybe a bit mesa/st specific. But we need things to line + * up for this in fdN_program: + * unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0); + * if (emit->sprite_coord_enable & texmask) { + * ... 
+ * } + */ + so->inputs[n].slot = VARYING_SLOT_VAR8; + so->inputs[n].bary = true; + instr = create_frag_input(ctx, false); + } else { + bool use_ldlv = false; + + /* detect the special case for front/back colors where + * we need to do flat vs smooth shading depending on + * rast state: + */ + if (in->data.interpolation == INTERP_MODE_NONE) { + switch (slot) { + case VARYING_SLOT_COL0: + case VARYING_SLOT_COL1: + case VARYING_SLOT_BFC0: + case VARYING_SLOT_BFC1: + so->inputs[n].rasterflat = true; + break; + default: + break; + } + } + + if (ctx->compiler->flat_bypass) { + if ((so->inputs[n].interpolate == INTERP_MODE_FLAT) || + (so->inputs[n].rasterflat && ctx->so->key.rasterflat)) + use_ldlv = true; + } + + so->inputs[n].bary = true; + + instr = create_frag_input(ctx, use_ldlv); + } + + compile_assert(ctx, idx < ctx->ir->ninputs); + + ctx->ir->inputs[idx] = instr; + } + } else if (ctx->so->type == MESA_SHADER_VERTEX) { + for (int i = 0; i < ncomp; i++) { + unsigned idx = (n * 4) + i; + compile_assert(ctx, idx < ctx->ir->ninputs); + ctx->ir->inputs[idx] = create_input(ctx, idx); + } + } else { + compile_error(ctx, "unknown shader type: %d\n", ctx->so->type); + } + + if (so->inputs[n].bary || (ctx->so->type == MESA_SHADER_VERTEX)) { + so->total_in += ncomp; + } +} + +static void +setup_output(struct ir3_context *ctx, nir_variable *out) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned ncomp = glsl_get_components(out->type); + unsigned n = out->data.driver_location; + unsigned slot = out->data.location; + unsigned comp = 0; + + /* let's pretend things other than vec4 don't exist: */ + ncomp = MAX2(ncomp, 4); + compile_assert(ctx, ncomp == 4); + + if (ctx->so->type == MESA_SHADER_FRAGMENT) { + switch (slot) { + case FRAG_RESULT_DEPTH: + comp = 2; /* tgsi will write to .z component */ + so->writes_pos = true; + break; + case FRAG_RESULT_COLOR: + so->color0_mrt = 1; + break; + default: + if (slot >= FRAG_RESULT_DATA0) + break; + compile_error(ctx, "unknown FS output name: %s\n", + gl_frag_result_name(slot)); + } + } else if (ctx->so->type == MESA_SHADER_VERTEX) { + switch (slot) { + case VARYING_SLOT_POS: + so->writes_pos = true; + break; + case VARYING_SLOT_PSIZ: + so->writes_psize = true; + break; + case VARYING_SLOT_COL0: + case VARYING_SLOT_COL1: + case VARYING_SLOT_BFC0: + case VARYING_SLOT_BFC1: + case VARYING_SLOT_FOGC: + case VARYING_SLOT_CLIP_DIST0: + case VARYING_SLOT_CLIP_DIST1: + case VARYING_SLOT_CLIP_VERTEX: + break; + default: + if (slot >= VARYING_SLOT_VAR0) + break; + if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7)) + break; + compile_error(ctx, "unknown VS output name: %s\n", + gl_varying_slot_name(slot)); + } + } else { + compile_error(ctx, "unknown shader type: %d\n", ctx->so->type); + } + + compile_assert(ctx, n < ARRAY_SIZE(so->outputs)); + + so->outputs[n].slot = slot; + so->outputs[n].regid = regid(n, comp); + so->outputs_count = MAX2(so->outputs_count, n + 1); + + for (int i = 0; i < ncomp; i++) { + unsigned idx = (n * 4) + i; + compile_assert(ctx, idx < ctx->ir->noutputs); + ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0)); + } +} + +static int +max_drvloc(struct exec_list *vars) +{ + int drvloc = -1; + nir_foreach_variable(var, vars) { + drvloc = MAX2(drvloc, (int)var->data.driver_location); + } + return drvloc; +} + +static const unsigned max_sysvals[] = { + [MESA_SHADER_FRAGMENT] = 24, // TODO + [MESA_SHADER_VERTEX] = 16, + [MESA_SHADER_COMPUTE] = 16, // TODO how many do we actually need? 
+}; + +static void +emit_instructions(struct ir3_context *ctx) +{ + unsigned ninputs, noutputs; + nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s); + + ninputs = (max_drvloc(&ctx->s->inputs) + 1) * 4; + noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4; + + /* we need to leave room for sysvals: + */ + ninputs += max_sysvals[ctx->so->type]; + + ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs); + + /* Create inputs in first block: */ + ctx->block = get_block(ctx, nir_start_block(fxn)); + ctx->in_block = ctx->block; + list_addtail(&ctx->block->node, &ctx->ir->block_list); + + ninputs -= max_sysvals[ctx->so->type]; + + /* for fragment shader, the vcoord input register is used as the + * base for bary.f varying fetch instrs: + */ + struct ir3_instruction *vcoord = NULL; + if (ctx->so->type == MESA_SHADER_FRAGMENT) { + struct ir3_instruction *xy[2]; + + vcoord = create_input_compmask(ctx, 0, 0x3); + split_dest(ctx->block, xy, vcoord, 0, 2); + + ctx->frag_vcoord = create_collect(ctx, xy, 2); + } + + /* Setup inputs: */ + nir_foreach_variable(var, &ctx->s->inputs) { + setup_input(ctx, var); + } + + /* Defer add_sysval_input() stuff until after setup_inputs(), + * because sysvals need to be appended after varyings: + */ + if (vcoord) { + add_sysval_input_compmask(ctx, SYSTEM_VALUE_VARYING_COORD, + 0x3, vcoord); + } + + if (ctx->frag_coord) { + add_sysval_input_compmask(ctx, SYSTEM_VALUE_FRAG_COORD, + 0xf, ctx->frag_coord); + } + + /* Setup outputs: */ + nir_foreach_variable(var, &ctx->s->outputs) { + setup_output(ctx, var); + } + + /* Setup registers (which should only be arrays): */ + nir_foreach_register(reg, &ctx->s->registers) { + declare_array(ctx, reg); + } + + /* NOTE: need to do something more clever when we support >1 fxn */ + nir_foreach_register(reg, &fxn->registers) { + declare_array(ctx, reg); + } + /* And emit the body: */ + ctx->impl = fxn; + emit_function(ctx, fxn); +} + +/* from NIR perspective, we actually have varying inputs. But the varying + * inputs, from an IR standpoint, are just bary.f/ldlv instructions. The + * only actual inputs are the sysvals. + */ +static void +fixup_frag_inputs(struct ir3_context *ctx) +{ + struct ir3_shader_variant *so = ctx->so; + struct ir3 *ir = ctx->ir; + unsigned i = 0; + + /* sysvals should appear at the end of the inputs, drop everything else: */ + while ((i < so->inputs_count) && !so->inputs[i].sysval) + i++; + + /* at IR level, inputs are always blocks of 4 scalars: */ + i *= 4; + + ir->inputs = &ir->inputs[i]; + ir->ninputs -= i; +} + +/* Fixup tex sampler state for astc/srgb workaround instructions. We + * need to assign the tex state indexes for these after we know the + * max tex index. + */ +static void +fixup_astc_srgb(struct ir3_context *ctx) +{ + struct ir3_shader_variant *so = ctx->so; + /* indexed by original tex idx, value is newly assigned alpha sampler + * state tex idx. Zero is invalid since there is at least one sampler + * if we get here. 
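+	 *
+	 * eg. (hypothetical numbers): with max_texture_index == 3 and the
+	 * workaround SAM's referencing tex idx 1 and 2, base becomes 4 and
+	 * the two alternate state slots assigned are 4 and 5.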
+ */ + unsigned alt_tex_state[16] = {0}; + unsigned tex_idx = ctx->max_texture_index + 1; + unsigned idx = 0; + + so->astc_srgb.base = tex_idx; + + for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) { + struct ir3_instruction *sam = ctx->ir->astc_srgb[i]; + + compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state)); + + if (alt_tex_state[sam->cat5.tex] == 0) { + /* assign new alternate/alpha tex state slot: */ + alt_tex_state[sam->cat5.tex] = tex_idx++; + so->astc_srgb.orig_idx[idx++] = sam->cat5.tex; + so->astc_srgb.count++; + } + + sam->cat5.tex = alt_tex_state[sam->cat5.tex]; + } +} + +static void +fixup_binning_pass(struct ir3_context *ctx) +{ + struct ir3_shader_variant *so = ctx->so; + struct ir3 *ir = ctx->ir; + unsigned i, j; + + for (i = 0, j = 0; i < so->outputs_count; i++) { + unsigned slot = so->outputs[i].slot; + + /* throw away everything but first position/psize */ + if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) { + if (i != j) { + so->outputs[j] = so->outputs[i]; + ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0]; + ir->outputs[(j*4)+1] = ir->outputs[(i*4)+1]; + ir->outputs[(j*4)+2] = ir->outputs[(i*4)+2]; + ir->outputs[(j*4)+3] = ir->outputs[(i*4)+3]; + } + j++; + } + } + so->outputs_count = j; + ir->noutputs = j * 4; +} + +int +ir3_compile_shader_nir(struct ir3_compiler *compiler, + struct ir3_shader_variant *so) +{ + struct ir3_context *ctx; + struct ir3 *ir; + struct ir3_instruction **inputs; + unsigned i, actual_in, inloc; + int ret = 0, max_bary; + + assert(!so->ir); + + ctx = compile_init(compiler, so); + if (!ctx) { + DBG("INIT failed!"); + ret = -1; + goto out; + } + + emit_instructions(ctx); + + if (ctx->error) { + DBG("EMIT failed!"); + ret = -1; + goto out; + } + + ir = so->ir = ctx->ir; + + /* keep track of the inputs from TGSI perspective.. */ + inputs = ir->inputs; + + /* but fixup actual inputs for frag shader: */ + if (so->type == MESA_SHADER_FRAGMENT) + fixup_frag_inputs(ctx); + + /* at this point, for binning pass, throw away unneeded outputs: */ + if (so->binning_pass && (ctx->compiler->gpu_id < 600)) + fixup_binning_pass(ctx); + + /* if we want half-precision outputs, mark the output registers + * as half: + */ + if (so->key.half_precision) { + for (i = 0; i < ir->noutputs; i++) { + struct ir3_instruction *out = ir->outputs[i]; + + if (!out) + continue; + + /* if frag shader writes z, that needs to be full precision: */ + if (so->outputs[i/4].slot == FRAG_RESULT_DEPTH) + continue; + + out->regs[0]->flags |= IR3_REG_HALF; + /* output could be a fanout (ie. texture fetch output) + * in which case we need to propagate the half-reg flag + * up to the definer so that RA sees it: + */ + if (out->opc == OPC_META_FO) { + out = out->regs[1]->instr; + out->regs[0]->flags |= IR3_REG_HALF; + } + + if (out->opc == OPC_MOV) { + out->cat1.dst_type = half_type(out->cat1.dst_type); + } + } + } + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + printf("BEFORE CP:\n"); + ir3_print(ir); + } + + ir3_cp(ir, so); + + /* at this point, for binning pass, throw away unneeded outputs: + * Note that for a6xx and later, we do this after ir3_cp to ensure + * that the uniform/constant layout for BS and VS matches, so that + * we can re-use same VS_CONST state group. + */ + if (so->binning_pass && (ctx->compiler->gpu_id >= 600)) + fixup_binning_pass(ctx); + + /* Insert mov if there's same instruction for each output. + * eg. 
dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_expression.vertex.sampler2dshadow + */ + for (int i = ir->noutputs - 1; i >= 0; i--) { + if (!ir->outputs[i]) + continue; + for (unsigned j = 0; j < i; j++) { + if (ir->outputs[i] == ir->outputs[j]) { + ir->outputs[i] = + ir3_MOV(ir->outputs[i]->block, ir->outputs[i], TYPE_F32); + } + } + } + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + printf("BEFORE GROUPING:\n"); + ir3_print(ir); + } + + ir3_sched_add_deps(ir); + + /* Group left/right neighbors, inserting mov's where needed to + * solve conflicts: + */ + ir3_group(ir); + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + printf("AFTER GROUPING:\n"); + ir3_print(ir); + } + + ir3_depth(ir); + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + printf("AFTER DEPTH:\n"); + ir3_print(ir); + } + + ret = ir3_sched(ir); + if (ret) { + DBG("SCHED failed!"); + goto out; + } + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + printf("AFTER SCHED:\n"); + ir3_print(ir); + } + + ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face); + if (ret) { + DBG("RA failed!"); + goto out; + } + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + printf("AFTER RA:\n"); + ir3_print(ir); + } + + /* fixup input/outputs: */ + for (i = 0; i < so->outputs_count; i++) { + so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num; + } + + /* Note that some or all channels of an input may be unused: */ + actual_in = 0; + inloc = 0; + for (i = 0; i < so->inputs_count; i++) { + unsigned j, reg = regid(63,0), compmask = 0, maxcomp = 0; + so->inputs[i].ncomp = 0; + so->inputs[i].inloc = inloc; + for (j = 0; j < 4; j++) { + struct ir3_instruction *in = inputs[(i*4) + j]; + if (in && !(in->flags & IR3_INSTR_UNUSED)) { + compmask |= (1 << j); + reg = in->regs[0]->num - j; + actual_in++; + so->inputs[i].ncomp++; + if ((so->type == MESA_SHADER_FRAGMENT) && so->inputs[i].bary) { + /* assign inloc: */ + assert(in->regs[1]->flags & IR3_REG_IMMED); + in->regs[1]->iim_val = inloc + j; + maxcomp = j + 1; + } + } + } + if ((so->type == MESA_SHADER_FRAGMENT) && compmask && so->inputs[i].bary) { + so->varying_in++; + so->inputs[i].compmask = (1 << maxcomp) - 1; + inloc += maxcomp; + } else if (!so->inputs[i].sysval) { + so->inputs[i].compmask = compmask; + } + so->inputs[i].regid = reg; + } + + if (ctx->astc_srgb) + fixup_astc_srgb(ctx); + + /* We need to do legalize after (for frag shader's) the "bary.f" + * offsets (inloc) have been assigned. 
+ */ + ir3_legalize(ir, &so->num_samp, &so->has_ssbo, &max_bary); + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + printf("AFTER LEGALIZE:\n"); + ir3_print(ir); + } + + /* Note that actual_in counts inputs that are not bary.f'd for FS: */ + if (so->type == MESA_SHADER_VERTEX) + so->total_in = actual_in; + else + so->total_in = max_bary + 1; + +out: + if (ret) { + if (so->ir) + ir3_destroy(so->ir); + so->ir = NULL; + } + compile_free(ctx); + + return ret; +} diff --git a/src/freedreno/ir3/ir3_cp.c b/src/freedreno/ir3/ir3_cp.c new file mode 100644 index 00000000000..e8e8cc311e3 --- /dev/null +++ b/src/freedreno/ir3/ir3_cp.c @@ -0,0 +1,653 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include <math.h> + +#include "ir3.h" +#include "ir3_shader.h" + +/* + * Copy Propagate: + */ + +struct ir3_cp_ctx { + struct ir3 *shader; + struct ir3_shader_variant *so; + unsigned immediate_idx; +}; + +/* is it a type preserving mov, with ok flags? */ +static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags) +{ + if (is_same_type_mov(instr)) { + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + struct ir3_instruction *src_instr = ssa(src); + + /* only if mov src is SSA (not const/immed): */ + if (!src_instr) + return false; + + /* no indirect: */ + if (dst->flags & IR3_REG_RELATIV) + return false; + if (src->flags & IR3_REG_RELATIV) + return false; + + if (src->flags & IR3_REG_ARRAY) + return false; + + if (!allow_flags) + if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG | + IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT)) + return false; + + /* TODO: remove this hack: */ + if (src_instr->opc == OPC_META_FO) + return false; + + return true; + } + return false; +} + +static unsigned cp_flags(unsigned flags) +{ + /* only considering these flags (at least for now): */ + flags &= (IR3_REG_CONST | IR3_REG_IMMED | + IR3_REG_FNEG | IR3_REG_FABS | + IR3_REG_SNEG | IR3_REG_SABS | + IR3_REG_BNOT | IR3_REG_RELATIV); + return flags; +} + +static bool valid_flags(struct ir3_instruction *instr, unsigned n, + unsigned flags) +{ + unsigned valid_flags; + flags = cp_flags(flags); + + /* If destination is indirect, then source cannot be.. at least + * I don't think so.. 
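+	 *
+	 * ie. something like a mov with both an a0.x-relative dst and an
+	 * a0.x-relative src is assumed to not be encodable.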
+ */ + if ((instr->regs[0]->flags & IR3_REG_RELATIV) && + (flags & IR3_REG_RELATIV)) + return false; + + /* TODO it seems to *mostly* work to cp RELATIV, except we get some + * intermittent piglit variable-indexing fails. Newer blob driver + * doesn't seem to cp these. Possibly this is hw workaround? Not + * sure, but until that is understood better, lets just switch off + * cp for indirect src's: + */ + if (flags & IR3_REG_RELATIV) + return false; + + switch (opc_cat(instr->opc)) { + case 1: + valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV; + if (flags & ~valid_flags) + return false; + break; + case 2: + valid_flags = ir3_cat2_absneg(instr->opc) | + IR3_REG_CONST | IR3_REG_RELATIV; + + if (ir3_cat2_int(instr->opc)) + valid_flags |= IR3_REG_IMMED; + + if (flags & ~valid_flags) + return false; + + if (flags & (IR3_REG_CONST | IR3_REG_IMMED)) { + unsigned m = (n ^ 1) + 1; + /* cannot deal w/ const in both srcs: + * (note that some cat2 actually only have a single src) + */ + if (m < instr->regs_count) { + struct ir3_register *reg = instr->regs[m]; + if ((flags & IR3_REG_CONST) && (reg->flags & IR3_REG_CONST)) + return false; + if ((flags & IR3_REG_IMMED) && (reg->flags & IR3_REG_IMMED)) + return false; + } + /* cannot be const + ABS|NEG: */ + if (flags & (IR3_REG_FABS | IR3_REG_FNEG | + IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT)) + return false; + } + break; + case 3: + valid_flags = ir3_cat3_absneg(instr->opc) | + IR3_REG_CONST | IR3_REG_RELATIV; + + if (flags & ~valid_flags) + return false; + + if (flags & (IR3_REG_CONST | IR3_REG_RELATIV)) { + /* cannot deal w/ const/relativ in 2nd src: */ + if (n == 1) + return false; + } + + if (flags & IR3_REG_CONST) { + /* cannot be const + ABS|NEG: */ + if (flags & (IR3_REG_FABS | IR3_REG_FNEG | + IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT)) + return false; + } + break; + case 4: + /* seems like blob compiler avoids const as src.. */ + /* TODO double check if this is still the case on a4xx */ + if (flags & (IR3_REG_CONST | IR3_REG_IMMED)) + return false; + if (flags & (IR3_REG_SABS | IR3_REG_SNEG)) + return false; + break; + case 5: + /* no flags allowed */ + if (flags) + return false; + break; + case 6: + valid_flags = IR3_REG_IMMED; + if (flags & ~valid_flags) + return false; + + if (flags & IR3_REG_IMMED) { + /* doesn't seem like we can have immediate src for store + * instructions: + * + * TODO this restriction could also apply to load instructions, + * but for load instructions this arg is the address (and not + * really sure any good way to test a hard-coded immed addr src) + */ + if (is_store(instr) && (n == 1)) + return false; + + if ((instr->opc == OPC_LDL) && (n != 1)) + return false; + + if ((instr->opc == OPC_STL) && (n != 2)) + return false; + + /* disallow CP into anything but the SSBO slot argument for + * atomics: + */ + if (is_atomic(instr->opc) && (n != 0)) + return false; + + if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G)) + return false; + } + + break; + } + + return true; +} + +/* propagate register flags from src to dst.. negates need special + * handling to cancel each other out. 
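+ *
+ * eg. (neg) applied to a src that already carries (neg) yields the
+ * original value, which is why FNEG/SNEG/BNOT are xor'd (rather than
+ * or'd) below:
+ *
+ *    (neg)(neg)x == x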
+ */ +static void combine_flags(unsigned *dstflags, struct ir3_instruction *src) +{ + unsigned srcflags = src->regs[1]->flags; + + /* if what we are combining into already has (abs) flags, + * we can drop (neg) from src: + */ + if (*dstflags & IR3_REG_FABS) + srcflags &= ~IR3_REG_FNEG; + if (*dstflags & IR3_REG_SABS) + srcflags &= ~IR3_REG_SNEG; + + if (srcflags & IR3_REG_FABS) + *dstflags |= IR3_REG_FABS; + if (srcflags & IR3_REG_SABS) + *dstflags |= IR3_REG_SABS; + if (srcflags & IR3_REG_FNEG) + *dstflags ^= IR3_REG_FNEG; + if (srcflags & IR3_REG_SNEG) + *dstflags ^= IR3_REG_SNEG; + if (srcflags & IR3_REG_BNOT) + *dstflags ^= IR3_REG_BNOT; + + *dstflags &= ~IR3_REG_SSA; + *dstflags |= srcflags & IR3_REG_SSA; + *dstflags |= srcflags & IR3_REG_CONST; + *dstflags |= srcflags & IR3_REG_IMMED; + *dstflags |= srcflags & IR3_REG_RELATIV; + *dstflags |= srcflags & IR3_REG_ARRAY; + + /* if src of the src is boolean we can drop the (abs) since we know + * the source value is already a postitive integer. This cleans + * up the absnegs that get inserted when converting between nir and + * native boolean (see ir3_b2n/n2b) + */ + struct ir3_instruction *srcsrc = ssa(src->regs[1]); + if (srcsrc && is_bool(srcsrc)) + *dstflags &= ~IR3_REG_SABS; +} + +static struct ir3_register * +lower_immed(struct ir3_cp_ctx *ctx, struct ir3_register *reg, unsigned new_flags) +{ + unsigned swiz, idx, i; + + reg = ir3_reg_clone(ctx->shader, reg); + + /* in some cases, there are restrictions on (abs)/(neg) plus const.. + * so just evaluate those and clear the flags: + */ + if (new_flags & IR3_REG_SABS) { + reg->iim_val = abs(reg->iim_val); + new_flags &= ~IR3_REG_SABS; + } + + if (new_flags & IR3_REG_FABS) { + reg->fim_val = fabs(reg->fim_val); + new_flags &= ~IR3_REG_FABS; + } + + if (new_flags & IR3_REG_SNEG) { + reg->iim_val = -reg->iim_val; + new_flags &= ~IR3_REG_SNEG; + } + + if (new_flags & IR3_REG_FNEG) { + reg->fim_val = -reg->fim_val; + new_flags &= ~IR3_REG_FNEG; + } + + /* Reallocate for 4 more elements whenever it's necessary */ + if (ctx->immediate_idx == ctx->so->immediates_size * 4) { + ctx->so->immediates_size += 4; + ctx->so->immediates = realloc (ctx->so->immediates, + ctx->so->immediates_size * sizeof (ctx->so->immediates[0])); + } + + for (i = 0; i < ctx->immediate_idx; i++) { + swiz = i % 4; + idx = i / 4; + + if (ctx->so->immediates[idx].val[swiz] == reg->uim_val) { + break; + } + } + + if (i == ctx->immediate_idx) { + /* need to generate a new immediate: */ + swiz = i % 4; + idx = i / 4; + ctx->so->immediates[idx].val[swiz] = reg->uim_val; + ctx->so->immediates_count = idx + 1; + ctx->immediate_idx++; + } + + new_flags &= ~IR3_REG_IMMED; + new_flags |= IR3_REG_CONST; + reg->flags = new_flags; + reg->num = i + (4 * ctx->so->constbase.immediate); + + return reg; +} + +static void +unuse(struct ir3_instruction *instr) +{ + debug_assert(instr->use_count > 0); + + if (--instr->use_count == 0) { + struct ir3_block *block = instr->block; + + instr->barrier_class = 0; + instr->barrier_conflict = 0; + + /* we don't want to remove anything in keeps (which could + * be things like array store's) + */ + for (unsigned i = 0; i < block->keeps_count; i++) { + debug_assert(block->keeps[i] != instr); + } + } +} + +/** + * Handle cp for a given src register. This additionally handles + * the cases of collapsing immedate/const (which replace the src + * register with a non-ssa src) or collapsing mov's from relative + * src (which needs to also fixup the address src reference by the + * instruction). 
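+ *
+ * A hypothetical before/after sketch (register numbers made up):
+ *
+ *    mov.f32f32 r0.x, c0.x
+ *    add.f r1.x, r0.x, r2.x
+ *
+ * becomes, assuming valid_flags() accepts a const in that src slot:
+ *
+ *    add.f r1.x, c0.x, r2.x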
+ */ +static void +reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *reg, unsigned n) +{ + struct ir3_instruction *src = ssa(reg); + + if (is_eligible_mov(src, true)) { + /* simple case, no immed/const/relativ, only mov's w/ ssa src: */ + struct ir3_register *src_reg = src->regs[1]; + unsigned new_flags = reg->flags; + + combine_flags(&new_flags, src); + + if (valid_flags(instr, n, new_flags)) { + if (new_flags & IR3_REG_ARRAY) { + debug_assert(!(reg->flags & IR3_REG_ARRAY)); + reg->array = src_reg->array; + } + reg->flags = new_flags; + reg->instr = ssa(src_reg); + + instr->barrier_class |= src->barrier_class; + instr->barrier_conflict |= src->barrier_conflict; + + unuse(src); + reg->instr->use_count++; + } + + } else if (is_same_type_mov(src) && + /* cannot collapse const/immed/etc into meta instrs: */ + !is_meta(instr)) { + /* immed/const/etc cases, which require some special handling: */ + struct ir3_register *src_reg = src->regs[1]; + unsigned new_flags = reg->flags; + + combine_flags(&new_flags, src); + + if (!valid_flags(instr, n, new_flags)) { + /* See if lowering an immediate to const would help. */ + if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) { + debug_assert(new_flags & IR3_REG_IMMED); + instr->regs[n + 1] = lower_immed(ctx, src_reg, new_flags); + return; + } + + /* special case for "normal" mad instructions, we can + * try swapping the first two args if that fits better. + * + * the "plain" MAD's (ie. the ones that don't shift first + * src prior to multiply) can swap their first two srcs if + * src[0] is !CONST and src[1] is CONST: + */ + if ((n == 1) && is_mad(instr->opc) && + !(instr->regs[0 + 1]->flags & (IR3_REG_CONST | IR3_REG_RELATIV)) && + valid_flags(instr, 0, new_flags & ~IR3_REG_IMMED)) { + /* swap src[0] and src[1]: */ + struct ir3_register *tmp; + tmp = instr->regs[0 + 1]; + instr->regs[0 + 1] = instr->regs[1 + 1]; + instr->regs[1 + 1] = tmp; + + n = 0; + } else { + return; + } + } + + /* Here we handle the special case of mov from + * CONST and/or RELATIV. These need to be handled + * specially, because in the case of move from CONST + * there is no src ir3_instruction so we need to + * replace the ir3_register. And in the case of + * RELATIV we need to handle the address register + * dependency. + */ + if (src_reg->flags & IR3_REG_CONST) { + /* an instruction cannot reference two different + * address registers: + */ + if ((src_reg->flags & IR3_REG_RELATIV) && + conflicts(instr->address, reg->instr->address)) + return; + + /* This seems to be a hw bug, or something where the timings + * just somehow don't work out. This restriction may only + * apply if the first src is also CONST. + */ + if ((opc_cat(instr->opc) == 3) && (n == 2) && + (src_reg->flags & IR3_REG_RELATIV) && + (src_reg->array.offset == 0)) + return; + + src_reg = ir3_reg_clone(instr->block->shader, src_reg); + src_reg->flags = new_flags; + instr->regs[n+1] = src_reg; + + if (src_reg->flags & IR3_REG_RELATIV) + ir3_instr_set_address(instr, reg->instr->address); + + return; + } + + if ((src_reg->flags & IR3_REG_RELATIV) && + !conflicts(instr->address, reg->instr->address)) { + src_reg = ir3_reg_clone(instr->block->shader, src_reg); + src_reg->flags = new_flags; + instr->regs[n+1] = src_reg; + ir3_instr_set_address(instr, reg->instr->address); + + return; + } + + /* NOTE: seems we can only do immed integers, so don't + * need to care about float. 
But we do need to handle + * abs/neg *before* checking that the immediate requires + * few enough bits to encode: + * + * TODO: do we need to do something to avoid accidentally + * catching a float immed? + */ + if (src_reg->flags & IR3_REG_IMMED) { + int32_t iim_val = src_reg->iim_val; + + debug_assert((opc_cat(instr->opc) == 1) || + (opc_cat(instr->opc) == 6) || + ir3_cat2_int(instr->opc) || + (is_mad(instr->opc) && (n == 0))); + + if (new_flags & IR3_REG_SABS) + iim_val = abs(iim_val); + + if (new_flags & IR3_REG_SNEG) + iim_val = -iim_val; + + if (new_flags & IR3_REG_BNOT) + iim_val = ~iim_val; + + /* other than category 1 (mov) we can only encode up to 10 bits: */ + if ((instr->opc == OPC_MOV) || + !((iim_val & ~0x3ff) && (-iim_val & ~0x3ff))) { + new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT); + src_reg = ir3_reg_clone(instr->block->shader, src_reg); + src_reg->flags = new_flags; + src_reg->iim_val = iim_val; + instr->regs[n+1] = src_reg; + } else if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) { + /* See if lowering an immediate to const would help. */ + instr->regs[n+1] = lower_immed(ctx, src_reg, new_flags); + } + + return; + } + } +} + +/* Handle special case of eliminating output mov, and similar cases where + * there isn't a normal "consuming" instruction. In this case we cannot + * collapse flags (ie. output mov from const, or w/ abs/neg flags, cannot + * be eliminated) + */ +static struct ir3_instruction * +eliminate_output_mov(struct ir3_instruction *instr) +{ + if (is_eligible_mov(instr, false)) { + struct ir3_register *reg = instr->regs[1]; + if (!(reg->flags & IR3_REG_ARRAY)) { + struct ir3_instruction *src_instr = ssa(reg); + debug_assert(src_instr); + return src_instr; + } + } + return instr; +} + +/** + * Find instruction src's which are mov's that can be collapsed, replacing + * the mov dst with the mov src + */ +static void +instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr) +{ + struct ir3_register *reg; + + if (instr->regs_count == 0) + return; + + if (ir3_instr_check_mark(instr)) + return; + + /* walk down the graph from each src: */ + foreach_src_n(reg, n, instr) { + struct ir3_instruction *src = ssa(reg); + + if (!src) + continue; + + instr_cp(ctx, src); + + /* TODO non-indirect access we could figure out which register + * we actually want and allow cp.. + */ + if (reg->flags & IR3_REG_ARRAY) + continue; + + /* Don't CP absneg into meta instructions, that won't end well: */ + if (is_meta(instr) && (src->opc != OPC_MOV)) + continue; + + reg_cp(ctx, instr, reg, n); + } + + if (instr->regs[0]->flags & IR3_REG_ARRAY) { + struct ir3_instruction *src = ssa(instr->regs[0]); + if (src) + instr_cp(ctx, src); + } + + if (instr->address) { + instr_cp(ctx, instr->address); + ir3_instr_set_address(instr, eliminate_output_mov(instr->address)); + } + + /* we can end up with extra cmps.s from frontend, which uses a + * + * cmps.s p0.x, cond, 0 + * + * as a way to mov into the predicate register. But frequently 'cond' + * is itself a cmps.s/cmps.f/cmps.u. So detect this special case and + * just re-write the instruction writing predicate register to get rid + * of the double cmps. 
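+ *
+ * i.e. (hypothetical registers):
+ *
+ *    cmps.f.lt r0.x, r1.x, r2.x
+ *    cmps.s p0.x, r0.x, 0
+ *
+ * gets rewritten into a single:
+ *
+ *    cmps.f.lt p0.x, r1.x, r2.x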
+ */ + if ((instr->opc == OPC_CMPS_S) && + (instr->regs[0]->num == regid(REG_P0, 0)) && + ssa(instr->regs[1]) && + (instr->regs[2]->flags & IR3_REG_IMMED) && + (instr->regs[2]->iim_val == 0)) { + struct ir3_instruction *cond = ssa(instr->regs[1]); + switch (cond->opc) { + case OPC_CMPS_S: + case OPC_CMPS_F: + case OPC_CMPS_U: + instr->opc = cond->opc; + instr->flags = cond->flags; + instr->cat2 = cond->cat2; + instr->address = cond->address; + instr->regs[1] = cond->regs[1]; + instr->regs[2] = cond->regs[2]; + instr->barrier_class |= cond->barrier_class; + instr->barrier_conflict |= cond->barrier_conflict; + unuse(cond); + break; + default: + break; + } + } +} + +void +ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so) +{ + struct ir3_cp_ctx ctx = { + .shader = ir, + .so = so, + }; + + /* This is a bit annoying, and probably wouldn't be necessary if we + * tracked a reverse link from producing instruction to consumer. + * But we need to know when we've eliminated the last consumer of + * a mov, so we need to do a pass to first count consumers of a + * mov. + */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + struct ir3_instruction *src; + + /* by the way, we don't account for false-dep's, so the CP + * pass should always happen before false-dep's are inserted + */ + debug_assert(instr->deps_count == 0); + + foreach_ssa_src(src, instr) { + src->use_count++; + } + } + } + + ir3_clear_mark(ir); + + for (unsigned i = 0; i < ir->noutputs; i++) { + if (ir->outputs[i]) { + instr_cp(&ctx, ir->outputs[i]); + ir->outputs[i] = eliminate_output_mov(ir->outputs[i]); + } + } + + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + if (block->condition) { + instr_cp(&ctx, block->condition); + block->condition = eliminate_output_mov(block->condition); + } + + for (unsigned i = 0; i < block->keeps_count; i++) { + instr_cp(&ctx, block->keeps[i]); + block->keeps[i] = eliminate_output_mov(block->keeps[i]); + } + } +} diff --git a/src/freedreno/ir3/ir3_depth.c b/src/freedreno/ir3/ir3_depth.c new file mode 100644 index 00000000000..73bf5e19926 --- /dev/null +++ b/src/freedreno/ir3/ir3_depth.c @@ -0,0 +1,245 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Rob Clark <[email protected]> + */ + +#include "util/u_math.h" + +#include "ir3.h" + +/* + * Instruction Depth: + * + * Calculates weighted instruction depth, ie. the sum of # of needed + * instructions plus delay slots back to original input (ie INPUT or + * CONST). That is to say, an instructions depth is: + * + * depth(instr) { + * d = 0; + * // for each src register: + * foreach (src in instr->regs[1..n]) + * d = max(d, delayslots(src->instr, n) + depth(src->instr)); + * return d + 1; + * } + * + * After an instruction's depth is calculated, it is inserted into the + * blocks depth sorted list, which is used by the scheduling pass. + */ + +/* generally don't count false dependencies, since this can just be + * something like a barrier, or SSBO store. The exception is array + * dependencies if the assigner is an array write and the consumer + * reads the same array. + */ +static bool +ignore_dep(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned n) +{ + if (!__is_false_dep(consumer, n)) + return false; + + if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) { + struct ir3_register *dst = assigner->regs[0]; + struct ir3_register *src; + + debug_assert(dst->flags & IR3_REG_ARRAY); + + foreach_src(src, consumer) { + if ((src->flags & IR3_REG_ARRAY) && + (dst->array.id == src->array.id)) { + return false; + } + } + } + + return true; +} + +/* calculate required # of delay slots between the instruction that + * assigns a value and the one that consumes + */ +int ir3_delayslots(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned n) +{ + if (ignore_dep(assigner, consumer, n)) + return 0; + + /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal + * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch + * handled with sync bits + */ + + if (is_meta(assigner)) + return 0; + + if (writes_addr(assigner)) + return 6; + + /* handled via sync flags: */ + if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner)) + return 0; + + /* assigner must be alu: */ + if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) || + is_mem(consumer)) { + return 6; + } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && + (n == 3)) { + /* special case, 3rd src to cat3 not required on first cycle */ + return 1; + } else { + return 3; + } +} + +void +ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list) +{ + /* remove from existing spot in list: */ + list_delinit(&instr->node); + + /* find where to re-insert instruction: */ + list_for_each_entry (struct ir3_instruction, pos, list, node) { + if (pos->depth > instr->depth) { + list_add(&instr->node, &pos->node); + return; + } + } + /* if we get here, we didn't find an insertion spot: */ + list_addtail(&instr->node, list); +} + +static void +ir3_instr_depth(struct ir3_instruction *instr, unsigned boost, bool falsedep) +{ + struct ir3_instruction *src; + + /* don't mark falsedep's as used, but otherwise process them normally: */ + if (!falsedep) + instr->flags &= ~IR3_INSTR_UNUSED; + + if (ir3_instr_check_mark(instr)) + return; + + instr->depth = 0; + + foreach_ssa_src_n(src, i, instr) { + unsigned sd; + + /* visit child to compute it's depth: */ + ir3_instr_depth(src, boost, __is_false_dep(instr, i)); + + /* for array writes, no need to delay on previous write: */ + if (i == 0) + continue; + + sd = ir3_delayslots(src, instr, i) + src->depth; + sd += boost; + + instr->depth = MAX2(instr->depth, sd); + } + + if (!is_meta(instr)) + 
instr->depth++; + + ir3_insert_by_depth(instr, &instr->block->instr_list); +} + +static bool +remove_unused_by_block(struct ir3_block *block) +{ + bool progress = false; + list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) { + if (instr->opc == OPC_END) + continue; + if (instr->flags & IR3_INSTR_UNUSED) { + list_delinit(&instr->node); + progress = true; + } + } + return progress; +} + +static bool +compute_depth_and_remove_unused(struct ir3 *ir) +{ + unsigned i; + bool progress = false; + + ir3_clear_mark(ir); + + /* initially mark everything as unused, we'll clear the flag as we + * visit the instructions: + */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + instr->flags |= IR3_INSTR_UNUSED; + } + } + + for (i = 0; i < ir->noutputs; i++) + if (ir->outputs[i]) + ir3_instr_depth(ir->outputs[i], 0, false); + + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + for (i = 0; i < block->keeps_count; i++) + ir3_instr_depth(block->keeps[i], 0, false); + + /* We also need to account for if-condition: */ + if (block->condition) + ir3_instr_depth(block->condition, 6, false); + } + + /* mark un-used instructions: */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + progress |= remove_unused_by_block(block); + } + + /* note that we can end up with unused indirects, but we should + * not end up with unused predicates. + */ + for (i = 0; i < ir->indirects_count; i++) { + struct ir3_instruction *instr = ir->indirects[i]; + if (instr && (instr->flags & IR3_INSTR_UNUSED)) + ir->indirects[i] = NULL; + } + + /* cleanup unused inputs: */ + for (i = 0; i < ir->ninputs; i++) { + struct ir3_instruction *in = ir->inputs[i]; + if (in && (in->flags & IR3_INSTR_UNUSED)) + ir->inputs[i] = NULL; + } + + return progress; +} + +void +ir3_depth(struct ir3 *ir) +{ + bool progress; + do { + progress = compute_depth_and_remove_unused(ir); + } while (progress); +} diff --git a/src/freedreno/ir3/ir3_group.c b/src/freedreno/ir3/ir3_group.c new file mode 100644 index 00000000000..570055973e8 --- /dev/null +++ b/src/freedreno/ir3/ir3_group.c @@ -0,0 +1,274 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Rob Clark <[email protected]> + */ + +#include "ir3.h" + +/* + * Find/group instruction neighbors: + */ + +/* bleh.. we need to do the same group_n() thing for both inputs/outputs + * (where we have a simple instr[] array), and fanin nodes (where we have + * an extra indirection via reg->instr). + */ +struct group_ops { + struct ir3_instruction *(*get)(void *arr, int idx); + void (*insert_mov)(void *arr, int idx, struct ir3_instruction *instr); +}; + +static struct ir3_instruction *arr_get(void *arr, int idx) +{ + return ((struct ir3_instruction **)arr)[idx]; +} +static void arr_insert_mov_out(void *arr, int idx, struct ir3_instruction *instr) +{ + ((struct ir3_instruction **)arr)[idx] = + ir3_MOV(instr->block, instr, TYPE_F32); +} +static void arr_insert_mov_in(void *arr, int idx, struct ir3_instruction *instr) +{ + /* so, we can't insert a mov in front of a meta:in.. and the downstream + * instruction already has a pointer to 'instr'. So we cheat a bit and + * morph the meta:in instruction into a mov and insert a new meta:in + * in front. + */ + struct ir3_instruction *in; + + debug_assert(instr->regs_count == 1); + + in = ir3_instr_create(instr->block, OPC_META_INPUT); + in->inout.block = instr->block; + ir3_reg_create(in, instr->regs[0]->num, 0); + + /* create src reg for meta:in and fixup to now be a mov: */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = in; + instr->opc = OPC_MOV; + instr->cat1.src_type = TYPE_F32; + instr->cat1.dst_type = TYPE_F32; + + ((struct ir3_instruction **)arr)[idx] = in; +} +static struct group_ops arr_ops_out = { arr_get, arr_insert_mov_out }; +static struct group_ops arr_ops_in = { arr_get, arr_insert_mov_in }; + +static struct ir3_instruction *instr_get(void *arr, int idx) +{ + return ssa(((struct ir3_instruction *)arr)->regs[idx+1]); +} +static void +instr_insert_mov(void *arr, int idx, struct ir3_instruction *instr) +{ + ((struct ir3_instruction *)arr)->regs[idx+1]->instr = + ir3_MOV(instr->block, instr, TYPE_F32); +} +static struct group_ops instr_ops = { instr_get, instr_insert_mov }; + +/* verify that cur != instr, but cur is also not in instr's neighbor-list: */ +static bool +in_neighbor_list(struct ir3_instruction *instr, struct ir3_instruction *cur, int pos) +{ + int idx = 0; + + if (!instr) + return false; + + if (instr == cur) + return true; + + for (instr = ir3_neighbor_first(instr); instr; instr = instr->cp.right) + if ((idx++ != pos) && (instr == cur)) + return true; + + return false; +} + +static void +group_n(struct group_ops *ops, void *arr, unsigned n) +{ + unsigned i, j; + + /* first pass, figure out what has conflicts and needs a mov + * inserted. Do this up front, before starting to setup + * left/right neighbor pointers. Trying to do it in a single + * pass could result in a situation where we can't even setup + * the mov's right neighbor ptr if the next instr also needs + * a mov. + */ +restart: + for (i = 0; i < n; i++) { + struct ir3_instruction *instr = ops->get(arr, i); + if (instr) { + struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL; + struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL; + bool conflict; + + /* check for left/right neighbor conflicts: */ + conflict = conflicts(instr->cp.left, left) || + conflicts(instr->cp.right, right); + + /* Mixing array elements and higher register classes + * (ie. groups) doesn't really work out in RA. 
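(Presumably the array pins its elements to consecutive registers, which fights the group's own contiguity constraint.)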
See: + * + * https://trello.com/c/DqeDkeVf/156-bug-with-stk-70frag + */ + if (instr->regs[0]->flags & IR3_REG_ARRAY) + conflict = true; + + /* we also can't have an instr twice in the group: */ + for (j = i + 1; (j < n) && !conflict; j++) + if (in_neighbor_list(ops->get(arr, j), instr, i)) + conflict = true; + + if (conflict) { + ops->insert_mov(arr, i, instr); + /* inserting the mov may have caused a conflict + * against the previous: + */ + goto restart; + } + } + } + + /* second pass, now that we've inserted mov's, fixup left/right + * neighbors. This is guaranteed to succeed, since by definition + * the newly inserted mov's cannot conflict with anything. + */ + for (i = 0; i < n; i++) { + struct ir3_instruction *instr = ops->get(arr, i); + if (instr) { + struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL; + struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL; + + debug_assert(!conflicts(instr->cp.left, left)); + if (left) { + instr->cp.left_cnt++; + instr->cp.left = left; + } + + debug_assert(!conflicts(instr->cp.right, right)); + if (right) { + instr->cp.right_cnt++; + instr->cp.right = right; + } + } + } +} + +static void +instr_find_neighbors(struct ir3_instruction *instr) +{ + struct ir3_instruction *src; + + if (ir3_instr_check_mark(instr)) + return; + + if (instr->opc == OPC_META_FI) + group_n(&instr_ops, instr, instr->regs_count - 1); + + foreach_ssa_src(src, instr) + instr_find_neighbors(src); +} + +/* a bit of sadness.. we can't have "holes" in inputs from PoV of + * register assignment, they still need to be grouped together. So + * we need to insert dummy/padding instruction for grouping, and + * then take it back out again before anyone notices. + */ +static void +pad_and_group_input(struct ir3_instruction **input, unsigned n) +{ + int i, mask = 0; + struct ir3_block *block = NULL; + + for (i = n - 1; i >= 0; i--) { + struct ir3_instruction *instr = input[i]; + if (instr) { + block = instr->block; + } else if (block) { + instr = ir3_NOP(block); + ir3_reg_create(instr, 0, IR3_REG_SSA); /* dummy dst */ + input[i] = instr; + mask |= (1 << i); + } + } + + group_n(&arr_ops_in, input, n); + + for (i = 0; i < n; i++) { + if (mask & (1 << i)) + input[i] = NULL; + } +} + +static void +find_neighbors(struct ir3 *ir) +{ + unsigned i; + + /* shader inputs/outputs themselves must be contiguous as well: + * + * NOTE: group inputs first, since we only insert mov's + * *before* the conflicted instr (and that would go badly + * for inputs). By doing inputs first, we should never + * have a conflict on inputs.. pushing any conflict to + * resolve to the outputs, for stuff like: + * + * MOV OUT[n], IN[m].wzyx + * + * NOTE: we assume here inputs/outputs are grouped in vec4. 
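+ * (hence the loops below step by 4, padding inputs out to full
+ * vec4 slots.)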
+ * This logic won't quite cut it if we don't align smaller + * on vec4 boundaries + */ + for (i = 0; i < ir->ninputs; i += 4) + pad_and_group_input(&ir->inputs[i], 4); + for (i = 0; i < ir->noutputs; i += 4) + group_n(&arr_ops_out, &ir->outputs[i], 4); + + for (i = 0; i < ir->noutputs; i++) { + if (ir->outputs[i]) { + struct ir3_instruction *instr = ir->outputs[i]; + instr_find_neighbors(instr); + } + } + + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + for (i = 0; i < block->keeps_count; i++) { + struct ir3_instruction *instr = block->keeps[i]; + instr_find_neighbors(instr); + } + + /* We also need to account for if-condition: */ + if (block->condition) + instr_find_neighbors(block->condition); + } +} + +void +ir3_group(struct ir3 *ir) +{ + ir3_clear_mark(ir); + find_neighbors(ir); +} diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c new file mode 100644 index 00000000000..ff4c644eab5 --- /dev/null +++ b/src/freedreno/ir3/ir3_legalize.c @@ -0,0 +1,496 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include "util/ralloc.h" +#include "util/u_math.h" + +#include "ir3.h" + +/* + * Legalize: + * + * We currently require that scheduling ensures that we have enough nop's + * in all the right places. The legalize step mostly handles fixing up + * instruction flags ((ss)/(sy)/(ei)), and collapses sequences of nop's + * into fewer nop's w/ rpt flag. + */ + +struct ir3_legalize_ctx { + int num_samp; + bool has_ssbo; + int max_bary; +}; + +struct ir3_legalize_state { + regmask_t needs_ss; + regmask_t needs_ss_war; /* write after read */ + regmask_t needs_sy; +}; + +struct ir3_legalize_block_data { + bool valid; + struct ir3_legalize_state state; +}; + +/* We want to evaluate each block from the position of any other + * predecessor block, in order that the flags set are the union of + * all possible program paths. + * + * To do this, we need to know the output state (needs_ss/ss_war/sy) + * of all predecessor blocks. The tricky thing is loops, which mean + * that we can't simply recursively process each predecessor block + * before legalizing the current block. + * + * How we handle that is by looping over all the blocks until the + * results converge. 
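(Each block's per-block data carries a 'valid' flag, so a block is only re-processed after a predecessor's output state changes.)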
If the output state of a given block changes + * in a given pass, this means that all successor blocks are not + * yet fully legalized. + */ + +static bool +legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) +{ + struct ir3_legalize_block_data *bd = block->data; + + if (bd->valid) + return false; + + struct ir3_instruction *last_input = NULL; + struct ir3_instruction *last_rel = NULL; + struct ir3_instruction *last_n = NULL; + struct list_head instr_list; + struct ir3_legalize_state prev_state = bd->state; + struct ir3_legalize_state *state = &bd->state; + + /* our input state is the OR of all predecessor blocks' state: */ + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_legalize_block_data *pbd = block->predecessors[i]->data; + struct ir3_legalize_state *pstate = &pbd->state; + + /* Our input (ss)/(sy) state is based on OR'ing the output + * state of all our predecessor blocks + */ + regmask_or(&state->needs_ss, + &state->needs_ss, &pstate->needs_ss); + regmask_or(&state->needs_ss_war, + &state->needs_ss_war, &pstate->needs_ss_war); + regmask_or(&state->needs_sy, + &state->needs_sy, &pstate->needs_sy); + } + + /* remove all the instructions from the list, we'll be adding + * them back in as we go + */ + list_replace(&block->instr_list, &instr_list); + list_inithead(&block->instr_list); + + list_for_each_entry_safe (struct ir3_instruction, n, &instr_list, node) { + struct ir3_register *reg; + unsigned i; + + n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY); + + if (is_meta(n)) + continue; + + if (is_input(n)) { + struct ir3_register *inloc = n->regs[1]; + assert(inloc->flags & IR3_REG_IMMED); + ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val); + } + + if (last_n && is_barrier(last_n)) + n->flags |= IR3_INSTR_SS | IR3_INSTR_SY; + + /* NOTE: consider dst register too.. it could happen that + * texture sample instruction (for example) writes some + * components which are unused. A subsequent instruction + * that writes the same register can race w/ the sam instr + * resulting in undefined results: + */ + for (i = 0; i < n->regs_count; i++) { + reg = n->regs[i]; + + if (reg_gpr(reg)) { + + /* TODO: we probably only need (ss) for alu + * instr consuming sfu result.. need to make + * some tests for both this and (sy).. + */ + if (regmask_get(&state->needs_ss, reg)) { + n->flags |= IR3_INSTR_SS; + regmask_init(&state->needs_ss_war); + regmask_init(&state->needs_ss); + } + + if (regmask_get(&state->needs_sy, reg)) { + n->flags |= IR3_INSTR_SY; + regmask_init(&state->needs_sy); + } + } + + /* TODO: is it valid to have address reg loaded from a + * relative src (ie. mova a0, c<a0.x+4>)? If so, the + * last_rel check below should be moved ahead of this: + */ + if (reg->flags & IR3_REG_RELATIV) + last_rel = n; + } + + if (n->regs_count > 0) { + reg = n->regs[0]; + if (regmask_get(&state->needs_ss_war, reg)) { + n->flags |= IR3_INSTR_SS; + regmask_init(&state->needs_ss_war); + regmask_init(&state->needs_ss); + } + + if (last_rel && (reg->num == regid(REG_A0, 0))) { + last_rel->flags |= IR3_INSTR_UL; + last_rel = NULL; + } + } + + /* cat5+ does not have an (ss) bit, if needed we need to + * insert a nop to carry the sync flag. 
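For example, an "(ss)sam" ends up emitted as an "(ss)nop" followed by a plain "sam".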
Would be kinda + * clever if we were aware of this during scheduling, but + * this should be a pretty rare case: + */ + if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) { + struct ir3_instruction *nop; + nop = ir3_NOP(block); + nop->flags |= IR3_INSTR_SS; + n->flags &= ~IR3_INSTR_SS; + } + + /* need to be able to set (ss) on first instruction: */ + if (list_empty(&block->instr_list) && (opc_cat(n->opc) >= 5)) + ir3_NOP(block); + + if (is_nop(n) && !list_empty(&block->instr_list)) { + struct ir3_instruction *last = list_last_entry(&block->instr_list, + struct ir3_instruction, node); + if (is_nop(last) && (last->repeat < 5)) { + last->repeat++; + last->flags |= n->flags; + continue; + } + } + + list_addtail(&n->node, &block->instr_list); + + if (is_sfu(n)) + regmask_set(&state->needs_ss, n->regs[0]); + + if (is_tex(n)) { + /* this ends up being the # of samp instructions.. but that + * is ok, everything else only cares whether it is zero or + * not. We do this here, rather than when we encounter a + * SAMP decl, because (especially in binning pass shader) + * the samp instruction(s) could get eliminated if the + * result is not used. + */ + ctx->num_samp = MAX2(ctx->num_samp, n->cat5.samp + 1); + regmask_set(&state->needs_sy, n->regs[0]); + } else if (n->opc == OPC_RESINFO) { + regmask_set(&state->needs_ss, n->regs[0]); + ir3_NOP(block)->flags |= IR3_INSTR_SS; + } else if (is_load(n)) { + /* seems like ldlv needs (ss) bit instead?? which is odd but + * makes a bunch of flat-varying tests start working on a4xx. + */ + if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL)) + regmask_set(&state->needs_ss, n->regs[0]); + else + regmask_set(&state->needs_sy, n->regs[0]); + } else if (is_atomic(n->opc)) { + if (n->flags & IR3_INSTR_G) + regmask_set(&state->needs_sy, n->regs[0]); + else + regmask_set(&state->needs_ss, n->regs[0]); + } + + if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G))) + ctx->has_ssbo = true; + + /* both tex/sfu appear to not always immediately consume + * their src register(s): + */ + if (is_tex(n) || is_sfu(n) || is_mem(n)) { + foreach_src(reg, n) { + if (reg_gpr(reg)) + regmask_set(&state->needs_ss_war, reg); + } + } + + if (is_input(n)) + last_input = n; + + last_n = n; + } + + if (last_input) { + /* special hack.. if using ldlv to bypass interpolation, + * we need to insert a dummy bary.f on which we can set + * the (ei) flag: + */ + if (is_mem(last_input) && (last_input->opc == OPC_LDLV)) { + struct ir3_instruction *baryf; + + /* (ss)bary.f (ei)r63.x, 0, r0.x */ + baryf = ir3_instr_create(block, OPC_BARY_F); + baryf->flags |= IR3_INSTR_SS; + ir3_reg_create(baryf, regid(63, 0), 0); + ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0; + ir3_reg_create(baryf, regid(0, 0), 0); + + /* insert the dummy bary.f after last_input: */ + list_delinit(&baryf->node); + list_add(&baryf->node, &last_input->node); + + last_input = baryf; + } + last_input->regs[0]->flags |= IR3_REG_EI; + } + + if (last_rel) + last_rel->flags |= IR3_INSTR_UL; + + bd->valid = true; + + if (memcmp(&prev_state, state, sizeof(*state))) { + /* our output state changed, this invalidates all of our + * successors: + */ + for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) { + if (!block->successors[i]) + break; + struct ir3_legalize_block_data *pbd = block->successors[i]->data; + pbd->valid = false; + } + } + + return true; +} + +/* NOTE: branch instructions are always the last instruction(s) + * in the block. 
We take advantage of this as we resolve the + * branches, since "if (foo) break;" constructs turn into + * something like: + * + * block3 { + * ... + * 0029:021: mov.s32s32 r62.x, r1.y + * 0082:022: br !p0.x, target=block5 + * 0083:023: br p0.x, target=block4 + * // succs: if _[0029:021: mov.s32s32] block4; else block5; + * } + * block4 { + * 0084:024: jump, target=block6 + * // succs: block6; + * } + * block5 { + * 0085:025: jump, target=block7 + * // succs: block7; + * } + * + * ie. only instruction in block4/block5 is a jump, so when + * resolving branches we can easily detect this by checking + * that the first instruction in the target block is itself + * a jump, and setup the br directly to the jump's target + * (and strip back out the now unreached jump) + * + * TODO sometimes we end up with things like: + * + * br !p0.x, #2 + * br p0.x, #12 + * add.u r0.y, r0.y, 1 + * + * If we swapped the order of the branches, we could drop one. + */ +static struct ir3_block * +resolve_dest_block(struct ir3_block *block) +{ + /* special case for last block: */ + if (!block->successors[0]) + return block; + + /* NOTE that we may or may not have inserted the jump + * in the target block yet, so conditions to resolve + * the dest to the dest block's successor are: + * + * (1) successor[1] == NULL && + * (2) (block-is-empty || only-instr-is-jump) + */ + if (block->successors[1] == NULL) { + if (list_empty(&block->instr_list)) { + return block->successors[0]; + } else if (list_length(&block->instr_list) == 1) { + struct ir3_instruction *instr = list_first_entry( + &block->instr_list, struct ir3_instruction, node); + if (instr->opc == OPC_JUMP) + return block->successors[0]; + } + } + return block; +} + +static bool +resolve_jump(struct ir3_instruction *instr) +{ + struct ir3_block *tblock = + resolve_dest_block(instr->cat0.target); + struct ir3_instruction *target; + + if (tblock != instr->cat0.target) { + list_delinit(&instr->cat0.target->node); + instr->cat0.target = tblock; + return true; + } + + target = list_first_entry(&tblock->instr_list, + struct ir3_instruction, node); + + /* TODO maybe a less fragile way to do this. But we are expecting + * a pattern from sched_block() that looks like: + * + * br !p0.x, #else-block + * br p0.x, #if-block + * + * if the first branch target is +2, or if 2nd branch target is +1 + * then we can just drop the jump. + */ + unsigned next_block; + if (instr->cat0.inv == true) + next_block = 2; + else + next_block = 1; + + if ((!target) || (target->ip == (instr->ip + next_block))) { + list_delinit(&instr->node); + return true; + } else { + instr->cat0.immed = + (int)target->ip - (int)instr->ip; + } + return false; +} + +/* resolve jumps, removing jumps/branches to immediately following + * instruction which we end up with from earlier stages. Since + * removing an instruction can invalidate earlier instruction's + * branch offsets, we need to do this iteratively until no more + * branches are removed. + */ +static bool +resolve_jumps(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) + if (is_flow(instr) && instr->cat0.target) + if (resolve_jump(instr)) + return true; + + return false; +} + +/* we want to mark points where divergent flow control re-converges + * with (jp) flags. For now, since we don't do any optimization for + * things that start out as a 'do {} while()', re-convergence points + * will always be a branch or jump target. 
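(Any path that diverged did so at a branch, and can only rejoin the common flow by branching or jumping back into it.)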
Note that this is overly + * conservative, since unconditional jump targets are not convergence + * points, we are just assuming that the other path to reach the jump + * target was divergent. If we were clever enough to optimize the + * jump at end of a loop back to a conditional branch into a single + * conditional branch, ie. like: + * + * add.f r1.w, r0.x, (neg)(r)c2.x <= loop start + * mul.f r1.z, r1.z, r0.x + * mul.f r1.y, r1.y, r0.x + * mul.f r0.z, r1.x, r0.x + * mul.f r0.w, r0.y, r0.x + * cmps.f.ge r0.x, (r)c2.y, (r)r1.w + * add.s r0.x, (r)r0.x, (r)-1 + * sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x + * cmps.f.eq p0.x, r0.x, c3.y + * mov.f32f32 r0.x, r1.w + * mov.f32f32 r0.y, r0.w + * mov.f32f32 r1.x, r0.z + * (rpt2)nop + * br !p0.x, #-13 + * (jp)mul.f r0.x, c263.y, r1.y + * + * Then we'd have to be more clever, as the convergence point is no + * longer a branch or jump target. + */ +static void +mark_convergence_points(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + if (is_flow(instr) && instr->cat0.target) { + struct ir3_instruction *target = + list_first_entry(&instr->cat0.target->instr_list, + struct ir3_instruction, node); + target->flags |= IR3_INSTR_JP; + } + } + } +} + +void +ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary) +{ + struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx); + bool progress; + + ctx->max_bary = -1; + + /* allocate per-block data: */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + block->data = rzalloc(ctx, struct ir3_legalize_block_data); + } + + /* process each block: */ + do { + progress = false; + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + progress |= legalize_block(ctx, block); + } + } while (progress); + + *num_samp = ctx->num_samp; + *has_ssbo = ctx->has_ssbo; + *max_bary = ctx->max_bary; + + do { + ir3_count_instructions(ir); + } while(resolve_jumps(ir)); + + mark_convergence_points(ir); + + ralloc_free(ctx); +} diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c new file mode 100644 index 00000000000..70c01ee0593 --- /dev/null +++ b/src/freedreno/ir3/ir3_nir.c @@ -0,0 +1,263 @@ +/* + * Copyright (C) 2015 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Rob Clark <[email protected]> + */ + + +#include "util/debug.h" + +#include "ir3_nir.h" +#include "ir3_compiler.h" +#include "ir3_shader.h" + +static const nir_shader_compiler_options options = { + .lower_fpow = true, + .lower_scmp = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_ffract = true, + .lower_fmod32 = true, + .lower_fmod64 = true, + .lower_fdiv = true, + .lower_ldexp = true, + .fuse_ffma = true, + .native_integers = true, + .vertex_id_zero_based = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_all_io_to_temps = true, + .lower_helper_invocation = true, +}; + +const nir_shader_compiler_options * +ir3_get_compiler_options(struct ir3_compiler *compiler) +{ + return &options; +} + +/* for given shader key, are any steps handled in nir? */ +bool +ir3_key_lowers_nir(const struct ir3_shader_key *key) +{ + return key->fsaturate_s | key->fsaturate_t | key->fsaturate_r | + key->vsaturate_s | key->vsaturate_t | key->vsaturate_r | + key->ucp_enables | key->color_two_side | + key->fclamp_color | key->vclamp_color; +} + +#define OPT(nir, pass, ...) ({ \ + bool this_progress = false; \ + NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ + this_progress; \ +}) + +#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__) + +static void +ir3_optimize_loop(nir_shader *s) +{ + bool progress; + do { + progress = false; + + OPT_V(s, nir_lower_vars_to_ssa); + progress |= OPT(s, nir_opt_copy_prop_vars); + progress |= OPT(s, nir_opt_dead_write_vars); + progress |= OPT(s, nir_lower_alu_to_scalar); + progress |= OPT(s, nir_lower_phis_to_scalar); + + progress |= OPT(s, nir_copy_prop); + progress |= OPT(s, nir_opt_dce); + progress |= OPT(s, nir_opt_cse); + static int gcm = -1; + if (gcm == -1) + gcm = env_var_as_unsigned("GCM", 0); + if (gcm == 1) + progress |= OPT(s, nir_opt_gcm, true); + else if (gcm == 2) + progress |= OPT(s, nir_opt_gcm, false); + progress |= OPT(s, nir_opt_peephole_select, 16); + progress |= OPT(s, nir_opt_intrinsics); + progress |= OPT(s, nir_opt_algebraic); + progress |= OPT(s, nir_opt_constant_folding); + progress |= OPT(s, nir_opt_dead_cf); + if (OPT(s, nir_opt_trivial_continues)) { + progress |= true; + /* If nir_opt_trivial_continues makes progress, then we need to clean + * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll + * to make progress. 
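+ * (note that the copy-prop and DCE just below don't feed into
+ * 'progress', presumably to avoid re-looping purely on their own
+ * cleanups.)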
+ */ + OPT(s, nir_copy_prop); + OPT(s, nir_opt_dce); + } + progress |= OPT(s, nir_opt_if); + progress |= OPT(s, nir_opt_remove_phis); + progress |= OPT(s, nir_opt_undef); + + } while (progress); +} + +struct nir_shader * +ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s, + const struct ir3_shader_key *key) +{ + struct nir_lower_tex_options tex_options = { + .lower_rect = 0, + }; + + if (key) { + switch (shader->type) { + case MESA_SHADER_FRAGMENT: + tex_options.saturate_s = key->fsaturate_s; + tex_options.saturate_t = key->fsaturate_t; + tex_options.saturate_r = key->fsaturate_r; + break; + case MESA_SHADER_VERTEX: + tex_options.saturate_s = key->vsaturate_s; + tex_options.saturate_t = key->vsaturate_t; + tex_options.saturate_r = key->vsaturate_r; + break; + default: + /* TODO */ + break; + } + } + + if (shader->compiler->gpu_id >= 400) { + /* a4xx seems to have *no* sam.p */ + tex_options.lower_txp = ~0; /* lower all txp */ + } else { + /* a3xx just needs to avoid sam.p for 3d tex */ + tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D); + } + + if (ir3_shader_debug & IR3_DBG_DISASM) { + debug_printf("----------------------\n"); + nir_print_shader(s, stdout); + debug_printf("----------------------\n"); + } + + OPT_V(s, nir_opt_global_to_local); + OPT_V(s, nir_lower_regs_to_ssa); + + if (key) { + if (s->info.stage == MESA_SHADER_VERTEX) { + OPT_V(s, nir_lower_clip_vs, key->ucp_enables, false); + if (key->vclamp_color) + OPT_V(s, nir_lower_clamp_color_outputs); + } else if (s->info.stage == MESA_SHADER_FRAGMENT) { + OPT_V(s, nir_lower_clip_fs, key->ucp_enables); + if (key->fclamp_color) + OPT_V(s, nir_lower_clamp_color_outputs); + } + if (key->color_two_side) { + OPT_V(s, nir_lower_two_sided_color); + } + } else { + /* only want to do this the first time (when key is null) + * and not again on any potential 2nd variant lowering pass: + */ + OPT_V(s, ir3_nir_apply_trig_workarounds); + } + + OPT_V(s, nir_lower_tex, &tex_options); + OPT_V(s, nir_lower_load_const_to_scalar); + if (shader->compiler->gpu_id < 500) + OPT_V(s, ir3_nir_lower_tg4_to_tex); + + ir3_optimize_loop(s); + + /* do idiv lowering after first opt loop to give a chance for + * divide by immed power-of-two to be caught first: + */ + if (OPT(s, nir_lower_idiv)) + ir3_optimize_loop(s); + + OPT_V(s, nir_remove_dead_variables, nir_var_local); + + OPT_V(s, nir_move_load_const); + + if (ir3_shader_debug & IR3_DBG_DISASM) { + debug_printf("----------------------\n"); + nir_print_shader(s, stdout); + debug_printf("----------------------\n"); + } + + nir_sweep(s); + + return s; +} + +void +ir3_nir_scan_driver_consts(nir_shader *shader, + struct ir3_driver_const_layout *layout) +{ + nir_foreach_function(function, shader) { + if (!function->impl) + continue; + + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = + nir_instr_as_intrinsic(instr); + unsigned idx; + + switch (intr->intrinsic) { + case nir_intrinsic_get_buffer_size: + idx = nir_src_as_const_value(intr->src[0])->u32[0]; + if (layout->ssbo_size.mask & (1 << idx)) + break; + layout->ssbo_size.mask |= (1 << idx); + layout->ssbo_size.off[idx] = + layout->ssbo_size.count; + layout->ssbo_size.count += 1; /* one const per */ + break; + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_min: + case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case 
nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_size: + idx = nir_intrinsic_get_var(intr, 0)->data.driver_location; + if (layout->image_dims.mask & (1 << idx)) + break; + layout->image_dims.mask |= (1 << idx); + layout->image_dims.off[idx] = + layout->image_dims.count; + layout->image_dims.count += 3; /* three const per */ + break; + default: + break; + } + } + } + } +} diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h new file mode 100644 index 00000000000..74201d34160 --- /dev/null +++ b/src/freedreno/ir3/ir3_nir.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2015 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#ifndef IR3_NIR_H_ +#define IR3_NIR_H_ + +#include "compiler/nir/nir.h" +#include "compiler/shader_enums.h" + +#include "ir3_shader.h" + +void ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_driver_const_layout *layout); + +bool ir3_nir_apply_trig_workarounds(nir_shader *shader); +bool ir3_nir_lower_tg4_to_tex(nir_shader *shader); + +const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler); +bool ir3_key_lowers_nir(const struct ir3_shader_key *key); +struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s, + const struct ir3_shader_key *key); + +#endif /* IR3_NIR_H_ */ diff --git a/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c b/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c new file mode 100644 index 00000000000..37a3dcb26f8 --- /dev/null +++ b/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c @@ -0,0 +1,138 @@ +/* + * Copyright © 2017 Ilia Mirkin + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "ir3_nir.h" +#include "compiler/nir/nir_builder.h" + +/* A4XX has a broken GATHER4 operation. It performs the texture swizzle on the + * gather results, rather than before. As a result, it must be emulated with + * direct texture calls. + */ + +static bool +lower_tg4(nir_block *block, nir_builder *b, void *mem_ctx) +{ + bool progress = false; + + static const int offsets[3][2] = { {0, 1}, {1, 1}, {1, 0} }; + + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_tex) + continue; + + nir_tex_instr *tg4 = (nir_tex_instr *)instr; + + if (tg4->op != nir_texop_tg4) + continue; + + b->cursor = nir_before_instr(&tg4->instr); + + nir_ssa_def *results[4]; + int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset); + for (int i = 0; i < 4; i++) { + int num_srcs = tg4->num_srcs + 1 /* lod */; + if (offset_index < 0 && i < 3) + num_srcs++; + + nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs); + tex->op = nir_texop_txl; + tex->sampler_dim = tg4->sampler_dim; + tex->coord_components = tg4->coord_components; + tex->is_array = tg4->is_array; + tex->is_shadow = tg4->is_shadow; + tex->is_new_style_shadow = tg4->is_new_style_shadow; + tex->texture_index = tg4->texture_index; + tex->sampler_index = tg4->sampler_index; + tex->dest_type = tg4->dest_type; + + for (int j = 0; j < tg4->num_srcs; j++) { + nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex); + tex->src[j].src_type = tg4->src[j].src_type; + } + if (i != 3) { + nir_ssa_def *offset = + nir_vec2(b, nir_imm_int(b, offsets[i][0]), + nir_imm_int(b, offsets[i][1])); + if (offset_index < 0) { + tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset); + tex->src[tg4->num_srcs].src_type = nir_tex_src_offset; + } else { + assert(nir_tex_instr_src_size(tex, offset_index) == 2); + nir_ssa_def *orig = nir_ssa_for_src( + b, tex->src[offset_index].src, 2); + tex->src[offset_index].src = + nir_src_for_ssa(nir_iadd(b, orig, offset)); + } + } + tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0)); + tex->src[num_srcs - 1].src_type = nir_tex_src_lod; + + nir_ssa_dest_init(&tex->instr, &tex->dest, + nir_tex_instr_dest_size(tex), 32, NULL); + nir_builder_instr_insert(b, &tex->instr); + + results[i] = nir_channel(b, &tex->dest.ssa, tg4->component); + } + + nir_ssa_def *result = nir_vec4(b, results[0], results[1], results[2], results[3]); + nir_ssa_def_rewrite_uses(&tg4->dest.ssa, nir_src_for_ssa(result)); + + nir_instr_remove(&tg4->instr); + + progress = true; + } + + return progress; +} + +static bool +lower_tg4_func(nir_function_impl *impl) +{ + void *mem_ctx = ralloc_parent(impl); + nir_builder b; + nir_builder_init(&b, impl); + + bool progress = false; + nir_foreach_block_safe(block, impl) { + progress |= lower_tg4(block, &b, mem_ctx); + } + + if (progress) + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + + return progress; +} + +bool +ir3_nir_lower_tg4_to_tex(nir_shader *shader) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (function->impl) 
+ progress |= lower_tg4_func(function->impl); + } + + return progress; +} diff --git a/src/freedreno/ir3/ir3_nir_trig.py b/src/freedreno/ir3/ir3_nir_trig.py new file mode 100644 index 00000000000..3968aea543c --- /dev/null +++ b/src/freedreno/ir3/ir3_nir_trig.py @@ -0,0 +1,51 @@ +# +# Copyright (C) 2016 Intel Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +from __future__ import print_function + +import argparse +import sys + +trig_workarounds = [ + (('fsin', 'x'), ('fsin', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))), + (('fcos', 'x'), ('fcos', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))), +] + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--import-path', required=True) + args = parser.parse_args() + sys.path.insert(0, args.import_path) + run() + + +def run(): + import nir_algebraic # pylint: disable=import-error + + print('#include "ir3_nir.h"') + print(nir_algebraic.AlgebraicPass("ir3_nir_apply_trig_workarounds", + trig_workarounds).render()) + + +if __name__ == '__main__': + main() diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c new file mode 100644 index 00000000000..b6ef6e4b5a7 --- /dev/null +++ b/src/freedreno/ir3/ir3_print.c @@ -0,0 +1,264 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include <stdarg.h> +#include <stdio.h> + +#include "ir3.h" + +#define PTRID(x) ((unsigned long)(x)) + +static void print_instr_name(struct ir3_instruction *instr) +{ + if (!instr) + return; +#ifdef DEBUG + printf("%04u:", instr->serialno); +#endif + printf("%04u:", instr->name); + printf("%04u:", instr->ip); + printf("%03u: ", instr->depth); + + if (instr->flags & IR3_INSTR_SY) + printf("(sy)"); + if (instr->flags & IR3_INSTR_SS) + printf("(ss)"); + + if (is_meta(instr)) { + switch (instr->opc) { + case OPC_META_INPUT: printf("_meta:in"); break; + case OPC_META_FO: printf("_meta:fo"); break; + case OPC_META_FI: printf("_meta:fi"); break; + + /* shouldn't hit here.. just for debugging: */ + default: printf("_meta:%d", instr->opc); break; + } + } else if (instr->opc == OPC_MOV) { + static const char *type[] = { + [TYPE_F16] = "f16", + [TYPE_F32] = "f32", + [TYPE_U16] = "u16", + [TYPE_U32] = "u32", + [TYPE_S16] = "s16", + [TYPE_S32] = "s32", + [TYPE_U8] = "u8", + [TYPE_S8] = "s8", + }; + if (instr->cat1.src_type == instr->cat1.dst_type) + printf("mov"); + else + printf("cov"); + printf(".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]); + } else { + printf("%s", ir3_instr_name(instr)); + if (instr->flags & IR3_INSTR_3D) + printf(".3d"); + if (instr->flags & IR3_INSTR_A) + printf(".a"); + if (instr->flags & IR3_INSTR_O) + printf(".o"); + if (instr->flags & IR3_INSTR_P) + printf(".p"); + if (instr->flags & IR3_INSTR_S) + printf(".s"); + if (instr->flags & IR3_INSTR_S2EN) + printf(".s2en"); + } +} + +static void print_reg_name(struct ir3_register *reg) +{ + if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) && + (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))) + printf("(absneg)"); + else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)) + printf("(neg)"); + else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) + printf("(abs)"); + + if (reg->flags & IR3_REG_IMMED) { + printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); + } else if (reg->flags & IR3_REG_ARRAY) { + printf("arr[id=%u, offset=%d, size=%u", reg->array.id, + reg->array.offset, reg->size); + /* for ARRAY we could have null src, for example first write + * instruction.. 
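+ * (in that case we just print the array id/offset/size below
+ * without a defining instruction)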
+ */ + if (reg->instr) { + printf(", _["); + print_instr_name(reg->instr); + printf("]"); + } + printf("]"); + } else if (reg->flags & IR3_REG_SSA) { + printf("_["); + print_instr_name(reg->instr); + printf("]"); + } else if (reg->flags & IR3_REG_RELATIV) { + if (reg->flags & IR3_REG_HALF) + printf("h"); + if (reg->flags & IR3_REG_CONST) + printf("c<a0.x + %d>", reg->array.offset); + else + printf("\x1b[0;31mr<a0.x + %d>\x1b[0m (%u)", reg->array.offset, reg->size); + } else { + if (reg->flags & IR3_REG_HALF) + printf("h"); + if (reg->flags & IR3_REG_CONST) + printf("c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]); + else + printf("\x1b[0;31mr%u.%c\x1b[0m", reg_num(reg), "xyzw"[reg_comp(reg)]); + } +} + +static void +tab(int lvl) +{ + for (int i = 0; i < lvl; i++) + printf("\t"); +} + +static void +print_instr(struct ir3_instruction *instr, int lvl) +{ + unsigned i; + + tab(lvl); + + print_instr_name(instr); + for (i = 0; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + printf(i ? ", " : " "); + print_reg_name(reg); + } + + if (instr->address) { + printf(", address=_"); + printf("["); + print_instr_name(instr->address); + printf("]"); + } + + if (instr->cp.left) { + printf(", left=_"); + printf("["); + print_instr_name(instr->cp.left); + printf("]"); + } + + if (instr->cp.right) { + printf(", right=_"); + printf("["); + print_instr_name(instr->cp.right); + printf("]"); + } + + if (instr->opc == OPC_META_FO) { + printf(", off=%d", instr->fo.off); + } + + if (is_flow(instr) && instr->cat0.target) { + /* the predicate register src is implied: */ + if (instr->opc == OPC_BR) { + printf(" %sp0.x", instr->cat0.inv ? "!" : ""); + } + printf(", target=block%u", block_id(instr->cat0.target)); + } + + if (instr->deps_count) { + printf(", false-deps:"); + for (unsigned i = 0; i < instr->deps_count; i++) { + if (i > 0) + printf(", "); + printf("_["); + print_instr_name(instr->deps[i]); + printf("]"); + } + } + + printf("\n"); +} + +void ir3_print_instr(struct ir3_instruction *instr) +{ + print_instr(instr, 0); +} + +static void +print_block(struct ir3_block *block, int lvl) +{ + tab(lvl); printf("block%u {\n", block_id(block)); + + if (block->predecessors_count > 0) { + tab(lvl+1); + printf("pred: "); + for (unsigned i = 0; i < block->predecessors_count; i++) { + if (i) + printf(", "); + printf("block%u", block_id(block->predecessors[i])); + } + printf("\n"); + } + + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + print_instr(instr, lvl+1); + } + + tab(lvl+1); printf("/* keeps:\n"); + for (unsigned i = 0; i < block->keeps_count; i++) { + print_instr(block->keeps[i], lvl+2); + } + tab(lvl+1); printf(" */\n"); + + if (block->successors[1]) { + /* leading into if/else: */ + tab(lvl+1); + printf("/* succs: if _["); + print_instr_name(block->condition); + printf("] block%u; else block%u; */\n", + block_id(block->successors[0]), + block_id(block->successors[1])); + } else if (block->successors[0]) { + tab(lvl+1); + printf("/* succs: block%u; */\n", + block_id(block->successors[0])); + } + tab(lvl); printf("}\n"); +} + +void +ir3_print(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) + print_block(block, 0); + + for (unsigned i = 0; i < ir->noutputs; i++) { + if (!ir->outputs[i]) + continue; + printf("out%d: ", i); + print_instr(ir->outputs[i], 0); + } +} diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c new file mode 100644 index 00000000000..ad09c4018d3 --- /dev/null +++ 
b/src/freedreno/ir3/ir3_ra.c @@ -0,0 +1,1124 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include "util/u_math.h" +#include "util/register_allocate.h" +#include "util/ralloc.h" +#include "util/bitset.h" + +#include "ir3.h" +#include "ir3_compiler.h" + +/* + * Register Assignment: + * + * Uses the register_allocate util, which implements graph coloring + * algo with interference classes. To handle the cases where we need + * consecutive registers (for example, texture sample instructions), + * we model these as larger (double/quad/etc) registers which conflict + * with the corresponding registers in other classes. + * + * Additionally we create additional classes for half-regs, which + * do not conflict with the full-reg classes. We do need at least + * sizes 1-4 (to deal w/ texture sample instructions output to half- + * reg). At the moment we don't create the higher order half-reg + * classes as half-reg frequently does not have enough precision + * for texture coords at higher resolutions. + * + * There are some additional cases that we need to handle specially, + * as the graph coloring algo doesn't understand "partial writes". + * For example, a sequence like: + * + * add r0.z, ... + * sam (f32)(xy)r0.x, ... + * ... + * sam (f32)(xyzw)r0.w, r0.x, ... ; 3d texture, so r0.xyz are coord + * + * In this scenario, we treat r0.xyz as class size 3, which is written + * (from a use/def perspective) at the 'add' instruction and ignore the + * subsequent partial writes to r0.xy. So the 'add r0.z, ...' is the + * defining instruction, as it is the first to partially write r0.xyz. + * + * Note i965 has a similar scenario, which they solve with a virtual + * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after + * register assignment. But for us that is horrible from a scheduling + * standpoint. Instead what we do is use idea of 'definer' instruction. + * Ie. the first instruction (lowest ip) to write to the variable is the + * one we consider from use/def perspective when building interference + * graph. (Other instructions which write other variable components + * just define the variable some more.) + * + * Arrays of arbitrary size are handled via pre-coloring a consecutive + * sequence of registers. 
Additional scalar (single component) reg + * names are allocated starting at ctx->class_base[total_class_count] + * (see arr->base), which are pre-colored. In the use/def graph direct + * access is treated as a single element use/def, and indirect access + * is treated as use or def of all array elements. (Only the first + * def is tracked, in case of multiple indirect writes, etc.) + * + * TODO arrays that fit in one of the pre-defined class sizes should + * not need to be pre-colored, but instead could be given a normal + * vreg name. (Ignoring this for now since it is a good way to work + * out the kinks with arbitrary sized arrays.) + * + * TODO might be easier for debugging to split this into two passes, + * the first assigning vreg names in a way that we could ir3_print() + * the result. + */ + +static const unsigned class_sizes[] = { + 1, 2, 3, 4, + 4 + 4, /* txd + 1d/2d */ + 4 + 6, /* txd + 3d */ +}; +#define class_count ARRAY_SIZE(class_sizes) + +static const unsigned half_class_sizes[] = { + 1, 2, 3, 4, +}; +#define half_class_count ARRAY_SIZE(half_class_sizes) + +/* seems to just be used for compute shaders? Seems like vec1 and vec3 + * are sufficient (for now?) + */ +static const unsigned high_class_sizes[] = { + 1, 3, +}; +#define high_class_count ARRAY_SIZE(high_class_sizes) + +#define total_class_count (class_count + half_class_count + high_class_count) + +/* Below a0.x are normal regs. RA doesn't need to assign a0.x/p0.x. */ +#define NUM_REGS (4 * 48) /* r0 to r47 */ +#define NUM_HIGH_REGS (4 * 8) /* r48 to r55 */ +#define FIRST_HIGH_REG (4 * 48) +/* Number of virtual regs in a given class: */ +#define CLASS_REGS(i) (NUM_REGS - (class_sizes[i] - 1)) +#define HALF_CLASS_REGS(i) (NUM_REGS - (half_class_sizes[i] - 1)) +#define HIGH_CLASS_REGS(i) (NUM_HIGH_REGS - (high_class_sizes[i] - 1)) + +#define HALF_OFFSET (class_count) +#define HIGH_OFFSET (class_count + half_class_count) + +/* register-set, created one time, used for all shaders: */ +struct ir3_ra_reg_set { + struct ra_regs *regs; + unsigned int classes[class_count]; + unsigned int half_classes[half_class_count]; + unsigned int high_classes[high_class_count]; + /* maps flat virtual register space to base gpr: */ + uint16_t *ra_reg_to_gpr; + /* maps cls,gpr to flat virtual register space: */ + uint16_t **gpr_to_ra_reg; +}; + +static void +build_q_values(unsigned int **q_values, unsigned off, + const unsigned *sizes, unsigned count) +{ + for (unsigned i = 0; i < count; i++) { + q_values[i + off] = rzalloc_array(q_values, unsigned, total_class_count); + + /* From register_allocate.c: + * + * q(B,C) (indexed by C, B is this register class) in + * Runeson/Nyström paper. This is "how many registers of B could + * the worst choice register from C conflict with". + * + * If we just let the register allocation algorithm compute these + * values, is extremely expensive. However, since all of our + * registers are laid out, we can very easily compute them + * ourselves. View the register from C as fixed starting at GRF n + * somewhere in the middle, and the register from B as sliding back + * and forth. Then the first register to conflict from B is the + * one starting at n - class_size[B] + 1 and the last register to + * conflict will start at n + class_size[B] - 1. Therefore, the + * number of conflicts from B is class_size[B] + class_size[C] - 1. 
+ * + * +-+-+-+-+-+-+ +-+-+-+-+-+-+ + * B | | | | | |n| --> | | | | | | | + * +-+-+-+-+-+-+ +-+-+-+-+-+-+ + * +-+-+-+-+-+ + * C |n| | | | | + * +-+-+-+-+-+ + * + * (Idea copied from brw_fs_reg_allocate.cpp) + */ + for (unsigned j = 0; j < count; j++) + q_values[i + off][j + off] = sizes[i] + sizes[j] - 1; + } +} + +/* One-time setup of RA register-set, which describes all the possible + * "virtual" registers and their interferences. Ie. double register + * occupies (and conflicts with) two single registers, and so forth. + * Since registers do not need to be aligned to their class size, they + * can conflict with other registers in the same class too. Ie: + * + * Single (base) | Double + * --------------+--------------- + * R0 | D0 + * R1 | D0 D1 + * R2 | D1 D2 + * R3 | D2 + * .. and so on.. + * + * (NOTE the disassembler uses notation like r0.x/y/z/w but those are + * really just four scalar registers. Don't let that confuse you.) + */ +struct ir3_ra_reg_set * +ir3_ra_alloc_reg_set(struct ir3_compiler *compiler) +{ + struct ir3_ra_reg_set *set = rzalloc(compiler, struct ir3_ra_reg_set); + unsigned ra_reg_count, reg, first_half_reg, first_high_reg, base; + unsigned int **q_values; + + /* calculate # of regs across all classes: */ + ra_reg_count = 0; + for (unsigned i = 0; i < class_count; i++) + ra_reg_count += CLASS_REGS(i); + for (unsigned i = 0; i < half_class_count; i++) + ra_reg_count += HALF_CLASS_REGS(i); + for (unsigned i = 0; i < high_class_count; i++) + ra_reg_count += HIGH_CLASS_REGS(i); + + /* allocate and populate q_values: */ + q_values = ralloc_array(set, unsigned *, total_class_count); + + build_q_values(q_values, 0, class_sizes, class_count); + build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count); + build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count); + + /* allocate the reg-set.. */ + set->regs = ra_alloc_reg_set(set, ra_reg_count, true); + set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count); + set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count); + + /* .. 
and classes */ + reg = 0; + for (unsigned i = 0; i < class_count; i++) { + set->classes[i] = ra_alloc_reg_class(set->regs); + + set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i)); + + for (unsigned j = 0; j < CLASS_REGS(i); j++) { + ra_class_add_reg(set->regs, set->classes[i], reg); + + set->ra_reg_to_gpr[reg] = j; + set->gpr_to_ra_reg[i][j] = reg; + + for (unsigned br = j; br < j + class_sizes[i]; br++) + ra_add_transitive_reg_conflict(set->regs, br, reg); + + reg++; + } + } + + first_half_reg = reg; + base = HALF_OFFSET; + + for (unsigned i = 0; i < half_class_count; i++) { + set->half_classes[i] = ra_alloc_reg_class(set->regs); + + set->gpr_to_ra_reg[base + i] = + ralloc_array(set, uint16_t, HALF_CLASS_REGS(i)); + + for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) { + ra_class_add_reg(set->regs, set->half_classes[i], reg); + + set->ra_reg_to_gpr[reg] = j; + set->gpr_to_ra_reg[base + i][j] = reg; + + for (unsigned br = j; br < j + half_class_sizes[i]; br++) + ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg); + + reg++; + } + } + + first_high_reg = reg; + base = HIGH_OFFSET; + + for (unsigned i = 0; i < high_class_count; i++) { + set->high_classes[i] = ra_alloc_reg_class(set->regs); + + set->gpr_to_ra_reg[base + i] = + ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i)); + + for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) { + ra_class_add_reg(set->regs, set->high_classes[i], reg); + + set->ra_reg_to_gpr[reg] = j; + set->gpr_to_ra_reg[base + i][j] = reg; + + for (unsigned br = j; br < j + high_class_sizes[i]; br++) + ra_add_transitive_reg_conflict(set->regs, br + first_high_reg, reg); + + reg++; + } + } + + /* starting a6xx, half precision regs conflict w/ full precision regs: */ + if (compiler->gpu_id >= 600) { + /* because of transitivity, we can get away with just setting up + * conflicts between the first class of full and half regs: + */ + for (unsigned j = 0; j < CLASS_REGS(0) / 2; j++) { + unsigned freg = set->gpr_to_ra_reg[0][j]; + unsigned hreg0 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 0]; + unsigned hreg1 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 1]; + + ra_add_transitive_reg_conflict(set->regs, freg, hreg0); + ra_add_transitive_reg_conflict(set->regs, freg, hreg1); + } + + // TODO also need to update q_values, but for now: + ra_set_finalize(set->regs, NULL); + } else { + ra_set_finalize(set->regs, q_values); + } + + ralloc_free(q_values); + + return set; +} + +/* additional block-data (per-block) */ +struct ir3_ra_block_data { + BITSET_WORD *def; /* variables defined before used in block */ + BITSET_WORD *use; /* variables used before defined in block */ + BITSET_WORD *livein; /* which defs reach entry point of block */ + BITSET_WORD *liveout; /* which defs reach exit point of block */ +}; + +/* additional instruction-data (per-instruction) */ +struct ir3_ra_instr_data { + /* cached instruction 'definer' info: */ + struct ir3_instruction *defn; + int off, sz, cls; +}; + +/* register-assign context, per-shader */ +struct ir3_ra_ctx { + struct ir3 *ir; + gl_shader_stage type; + bool frag_face; + + struct ir3_ra_reg_set *set; + struct ra_graph *g; + unsigned alloc_count; + /* one per class, plus one slot for arrays: */ + unsigned class_alloc_count[total_class_count + 1]; + unsigned class_base[total_class_count + 1]; + unsigned instr_cnt; + unsigned *def, *use; /* def/use table */ + struct ir3_ra_instr_data *instrd; +}; + +/* does it conflict? 
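+ * (ie. do the half-open ranges [a_start, a_end) and
+ * [b_start, b_end) overlap?)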
*/ +static inline bool +intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end) +{ + return !((a_start >= b_end) || (b_start >= a_end)); +} + +static bool +is_half(struct ir3_instruction *instr) +{ + return !!(instr->regs[0]->flags & IR3_REG_HALF); +} + +static bool +is_high(struct ir3_instruction *instr) +{ + return !!(instr->regs[0]->flags & IR3_REG_HIGH); +} + +static int +size_to_class(unsigned sz, bool half, bool high) +{ + if (high) { + for (unsigned i = 0; i < high_class_count; i++) + if (high_class_sizes[i] >= sz) + return i + HIGH_OFFSET; + } else if (half) { + for (unsigned i = 0; i < half_class_count; i++) + if (half_class_sizes[i] >= sz) + return i + HALF_OFFSET; + } else { + for (unsigned i = 0; i < class_count; i++) + if (class_sizes[i] >= sz) + return i; + } + debug_assert(0); + return -1; +} + +static bool +writes_gpr(struct ir3_instruction *instr) +{ + if (is_store(instr)) + return false; + /* is dest a normal temp register: */ + struct ir3_register *reg = instr->regs[0]; + if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) + return false; + if ((reg->num == regid(REG_A0, 0)) || + (reg->num == regid(REG_P0, 0))) + return false; + return true; +} + +static bool +instr_before(struct ir3_instruction *a, struct ir3_instruction *b) +{ + if (a->flags & IR3_INSTR_UNUSED) + return false; + return (a->ip < b->ip); +} + +static struct ir3_instruction * +get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, + int *sz, int *off) +{ + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + struct ir3_instruction *d = NULL; + + if (id->defn) { + *sz = id->sz; + *off = id->off; + return id->defn; + } + + if (instr->opc == OPC_META_FI) { + /* What about the case where collect is subset of array, we + * need to find the distance between where actual array starts + * and fanin.. that probably doesn't happen currently. + */ + struct ir3_register *src; + int dsz, doff; + + /* note: don't use foreach_ssa_src as this gets called once + * while assigning regs (which clears SSA flag) + */ + foreach_src_n(src, n, instr) { + struct ir3_instruction *dd; + if (!src->instr) + continue; + + dd = get_definer(ctx, src->instr, &dsz, &doff); + + if ((!d) || instr_before(dd, d)) { + d = dd; + *sz = dsz; + *off = doff - n; + } + } + + } else if (instr->cp.right || instr->cp.left) { + /* covers also the meta:fo case, which ends up w/ single + * scalar instructions for each component: + */ + struct ir3_instruction *f = ir3_neighbor_first(instr); + + /* by definition, the entire sequence forms one linked list + * of single scalar register nodes (even if some of them may + * be fanouts from a texture sample (for example) instr. We + * just need to walk the list finding the first element of + * the group defined (lowest ip) + */ + int cnt = 0; + + /* need to skip over unused in the group: */ + while (f && (f->flags & IR3_INSTR_UNUSED)) { + f = f->cp.right; + cnt++; + } + + while (f) { + if ((!d) || instr_before(f, d)) + d = f; + if (f == instr) + *off = cnt; + f = f->cp.right; + cnt++; + } + + *sz = cnt; + + } else { + /* second case is looking directly at the instruction which + * produces multiple values (eg, texture sample), rather + * than the fanout nodes that point back to that instruction. + * This isn't quite right, because it may be part of a larger + * group, such as: + * + * sam (f32)(xyzw)r0.x, ... + * add r1.x, ... + * add r1.y, ... + * sam (f32)(xyzw)r2.x, r0.w <-- (r0.w, r1.x, r1.y) + * + * need to come up with a better way to handle that case. 
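+ *
+ * (for an indirect dst we can't derive the size from wrmask,
+ * so use the declared register size instead)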
+ */ + if (instr->address) { + *sz = instr->regs[0]->size; + } else { + *sz = util_last_bit(instr->regs[0]->wrmask); + } + *off = 0; + d = instr; + } + + if (d->opc == OPC_META_FO) { + struct ir3_instruction *dd; + int dsz, doff; + + dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff); + + /* by definition, should come before: */ + debug_assert(instr_before(dd, d)); + + *sz = MAX2(*sz, dsz); + + debug_assert(instr->opc == OPC_META_FO); + *off = MAX2(*off, instr->fo.off); + + d = dd; + } + + id->defn = d; + id->sz = *sz; + id->off = *off; + + return d; +} + +static void +ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + if (instr->regs_count == 0) + continue; + /* couple special cases: */ + if (writes_addr(instr) || writes_pred(instr)) { + id->cls = -1; + } else if (instr->regs[0]->flags & IR3_REG_ARRAY) { + id->cls = total_class_count; + } else { + id->defn = get_definer(ctx, instr, &id->sz, &id->off); + id->cls = size_to_class(id->sz, is_half(id->defn), is_high(id->defn)); + } + } +} + +/* give each instruction a name (and ip), and count up the # of names + * of each class + */ +static void +ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + +#ifdef DEBUG + instr->name = ~0; +#endif + + ctx->instr_cnt++; + + if (instr->regs_count == 0) + continue; + + if (!writes_gpr(instr)) + continue; + + if (id->defn != instr) + continue; + + /* arrays which don't fit in one of the pre-defined class + * sizes are pre-colored: + */ + if ((id->cls >= 0) && (id->cls < total_class_count)) { + instr->name = ctx->class_alloc_count[id->cls]++; + ctx->alloc_count++; + } + } +} + +static void +ra_init(struct ir3_ra_ctx *ctx) +{ + unsigned n, base; + + ir3_clear_mark(ctx->ir); + n = ir3_count_instructions(ctx->ir); + + ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n); + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + ra_block_find_definers(ctx, block); + } + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + ra_block_name_instructions(ctx, block); + } + + /* figure out the base register name for each class. The + * actual ra name is class_base[cls] + instr->name; + */ + ctx->class_base[0] = 0; + for (unsigned i = 1; i <= total_class_count; i++) { + ctx->class_base[i] = ctx->class_base[i-1] + + ctx->class_alloc_count[i-1]; + } + + /* and vreg names for array elements: */ + base = ctx->class_base[total_class_count]; + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + arr->base = base; + ctx->class_alloc_count[total_class_count] += arr->length; + base += arr->length; + } + ctx->alloc_count += ctx->class_alloc_count[total_class_count]; + + ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count); + ralloc_steal(ctx->g, ctx->instrd); + ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); + ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); +} + +static unsigned +__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn) +{ + unsigned name; + debug_assert(cls >= 0); + debug_assert(cls < total_class_count); /* we shouldn't get arrays here.. 
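+ * (array elements get their names directly from arr->base
+ * instead, see ra_init())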
*/ + name = ctx->class_base[cls] + defn->name; + debug_assert(name < ctx->alloc_count); + return name; +} + +static int +ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id) +{ + /* TODO handle name mapping for arrays */ + return __ra_name(ctx, id->cls, id->defn); +} + +static void +ra_destroy(struct ir3_ra_ctx *ctx) +{ + ralloc_free(ctx->g); +} + +static void +ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + struct ir3_ra_block_data *bd; + unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + +#define def(name, instr) \ + do { \ + /* defined on first write: */ \ + if (!ctx->def[name]) \ + ctx->def[name] = instr->ip; \ + ctx->use[name] = instr->ip; \ + BITSET_SET(bd->def, name); \ + } while(0); + +#define use(name, instr) \ + do { \ + ctx->use[name] = MAX2(ctx->use[name], instr->ip); \ + if (!BITSET_TEST(bd->def, name)) \ + BITSET_SET(bd->use, name); \ + } while(0); + + bd = rzalloc(ctx->g, struct ir3_ra_block_data); + + bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words); + + block->data = bd; + + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + struct ir3_instruction *src; + struct ir3_register *reg; + + if (instr->regs_count == 0) + continue; + + /* There are a couple special cases to deal with here: + * + * fanout: used to split values from a higher class to a lower + * class, for example split the results of a texture fetch + * into individual scalar values; We skip over these from + * a 'def' perspective, and for a 'use' we walk the chain + * up to the defining instruction. + * + * fanin: used to collect values from lower class and assemble + * them together into a higher class, for example arguments + * to texture sample instructions; We consider these to be + * defined at the earliest fanin source. + * + * Most of this is handled in the get_definer() helper. + * + * In either case, we trace the instruction back to the original + * definer and consider that as the def/use ip. + */ + + if (writes_gpr(instr)) { + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + struct ir3_register *dst = instr->regs[0]; + + if (dst->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, dst->array.id); + unsigned i; + + arr->start_ip = MIN2(arr->start_ip, instr->ip); + arr->end_ip = MAX2(arr->end_ip, instr->ip); + + /* set the node class now.. in case we don't encounter + * this array dst again. 
From register_alloc algo's + * perspective, these are all single/scalar regs: + */ + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + ra_set_node_class(ctx->g, name, ctx->set->classes[0]); + } + + /* indirect write is treated like a write to all array + * elements, since we don't know which one is actually + * written: + */ + if (dst->flags & IR3_REG_RELATIV) { + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + def(name, instr); + } + } else { + unsigned name = arr->base + dst->array.offset; + def(name, instr); + } + + } else if (id->defn == instr) { + unsigned name = ra_name(ctx, id); + + /* since we are in SSA at this point: */ + debug_assert(!BITSET_TEST(bd->use, name)); + + def(name, id->defn); + + if (is_high(id->defn)) { + ra_set_node_class(ctx->g, name, + ctx->set->high_classes[id->cls - HIGH_OFFSET]); + } else if (is_half(id->defn)) { + ra_set_node_class(ctx->g, name, + ctx->set->half_classes[id->cls - HALF_OFFSET]); + } else { + ra_set_node_class(ctx->g, name, + ctx->set->classes[id->cls]); + } + } + } + + foreach_src(reg, instr) { + if (reg->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, reg->array.id); + arr->start_ip = MIN2(arr->start_ip, instr->ip); + arr->end_ip = MAX2(arr->end_ip, instr->ip); + + /* indirect read is treated like a read fromall array + * elements, since we don't know which one is actually + * read: + */ + if (reg->flags & IR3_REG_RELATIV) { + unsigned i; + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + use(name, instr); + } + } else { + unsigned name = arr->base + reg->array.offset; + use(name, instr); + /* NOTE: arrays are not SSA so unconditionally + * set use bit: + */ + BITSET_SET(bd->use, name); + debug_assert(reg->array.offset < arr->length); + } + } else if ((src = ssa(reg)) && writes_gpr(src)) { + unsigned name = ra_name(ctx, &ctx->instrd[src->ip]); + use(name, instr); + } + } + } +} + +static bool +ra_compute_livein_liveout(struct ir3_ra_ctx *ctx) +{ + unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + bool progress = false; + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + struct ir3_ra_block_data *bd = block->data; + + /* update livein: */ + for (unsigned i = 0; i < bitset_words; i++) { + BITSET_WORD new_livein = + (bd->use[i] | (bd->liveout[i] & ~bd->def[i])); + + if (new_livein & ~bd->livein[i]) { + bd->livein[i] |= new_livein; + progress = true; + } + } + + /* update liveout: */ + for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) { + struct ir3_block *succ = block->successors[j]; + struct ir3_ra_block_data *succ_bd; + + if (!succ) + continue; + + succ_bd = succ->data; + + for (unsigned i = 0; i < bitset_words; i++) { + BITSET_WORD new_liveout = + (succ_bd->livein[i] & ~bd->liveout[i]); + + if (new_liveout) { + bd->liveout[i] |= new_liveout; + progress = true; + } + } + } + } + + return progress; +} + +static void +print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt) +{ + bool first = true; + debug_printf(" %s:", name); + for (unsigned i = 0; i < cnt; i++) { + if (BITSET_TEST(bs, i)) { + if (!first) + debug_printf(","); + debug_printf(" %04u", i); + first = false; + } + } + debug_printf("\n"); +} + +static void +ra_add_interference(struct ir3_ra_ctx *ctx) +{ + struct ir3 *ir = ctx->ir; + + /* initialize array live ranges: */ + list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) { + arr->start_ip = ~0; + arr->end_ip = 0; + } + + /* compute live ranges (use/def) on a block 
level, also updating + * block's def/use bitmasks (used below to calculate per-block + * livein/liveout): + */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + ra_block_compute_live_ranges(ctx, block); + } + + /* update per-block livein/liveout: */ + while (ra_compute_livein_liveout(ctx)) {} + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + debug_printf("AFTER LIVEIN/OUT:\n"); + ir3_print(ir); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + struct ir3_ra_block_data *bd = block->data; + debug_printf("block%u:\n", block_id(block)); + print_bitset(" def", bd->def, ctx->alloc_count); + print_bitset(" use", bd->use, ctx->alloc_count); + print_bitset(" l/i", bd->livein, ctx->alloc_count); + print_bitset(" l/o", bd->liveout, ctx->alloc_count); + } + list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) { + debug_printf("array%u:\n", arr->id); + debug_printf(" length: %u\n", arr->length); + debug_printf(" start_ip: %u\n", arr->start_ip); + debug_printf(" end_ip: %u\n", arr->end_ip); + } + } + + /* extend start/end ranges based on livein/liveout info from cfg: */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + struct ir3_ra_block_data *bd = block->data; + + for (unsigned i = 0; i < ctx->alloc_count; i++) { + if (BITSET_TEST(bd->livein, i)) { + ctx->def[i] = MIN2(ctx->def[i], block->start_ip); + ctx->use[i] = MAX2(ctx->use[i], block->start_ip); + } + + if (BITSET_TEST(bd->liveout, i)) { + ctx->def[i] = MIN2(ctx->def[i], block->end_ip); + ctx->use[i] = MAX2(ctx->use[i], block->end_ip); + } + } + + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + for (unsigned i = 0; i < arr->length; i++) { + if (BITSET_TEST(bd->livein, i + arr->base)) { + arr->start_ip = MIN2(arr->start_ip, block->start_ip); + } + if (BITSET_TEST(bd->livein, i + arr->base)) { + arr->end_ip = MAX2(arr->end_ip, block->end_ip); + } + } + } + } + + /* need to fix things up to keep outputs live: */ + for (unsigned i = 0; i < ir->noutputs; i++) { + struct ir3_instruction *instr = ir->outputs[i]; + unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]); + ctx->use[name] = ctx->instr_cnt; + } + + for (unsigned i = 0; i < ctx->alloc_count; i++) { + for (unsigned j = 0; j < ctx->alloc_count; j++) { + if (intersects(ctx->def[i], ctx->use[i], + ctx->def[j], ctx->use[j])) { + ra_add_node_interference(ctx->g, i, j); + } + } + } +} + +/* some instructions need fix-up if dst register is half precision: */ +static void fixup_half_instr_dst(struct ir3_instruction *instr) +{ + switch (opc_cat(instr->opc)) { + case 1: /* move instructions */ + instr->cat1.dst_type = half_type(instr->cat1.dst_type); + break; + case 3: + switch (instr->opc) { + case OPC_MAD_F32: + instr->opc = OPC_MAD_F16; + break; + case OPC_SEL_B32: + instr->opc = OPC_SEL_B16; + break; + case OPC_SEL_S32: + instr->opc = OPC_SEL_S16; + break; + case OPC_SEL_F32: + instr->opc = OPC_SEL_F16; + break; + case OPC_SAD_S32: + instr->opc = OPC_SAD_S16; + break; + /* instructions may already be fixed up: */ + case OPC_MAD_F16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + break; + default: + assert(0); + break; + } + break; + case 5: + instr->cat5.type = half_type(instr->cat5.type); + break; + } +} +/* some instructions need fix-up if src register is half precision: */ +static void fixup_half_instr_src(struct ir3_instruction *instr) +{ + switch (instr->opc) { + case OPC_MOV: + instr->cat1.src_type = half_type(instr->cat1.src_type); + break; + 
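+	/* for other opcodes the half flag on the src register
+	 * itself should be enough, so no fixup: */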
default: + break; + } +} + +/* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first + * array access(es) which do not have any previous access to depend + * on from scheduling point of view + */ +static void +reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, + struct ir3_instruction *instr) +{ + struct ir3_ra_instr_data *id; + + if (reg->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, reg->array.id); + unsigned name = arr->base + reg->array.offset; + unsigned r = ra_get_node_reg(ctx->g, name); + unsigned num = ctx->set->ra_reg_to_gpr[r]; + + if (reg->flags & IR3_REG_RELATIV) { + reg->array.offset = num; + } else { + reg->num = num; + reg->flags &= ~IR3_REG_SSA; + } + + reg->flags &= ~IR3_REG_ARRAY; + } else if ((id = &ctx->instrd[instr->ip]) && id->defn) { + unsigned name = ra_name(ctx, id); + unsigned r = ra_get_node_reg(ctx->g, name); + unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off; + + debug_assert(!(reg->flags & IR3_REG_RELATIV)); + + if (is_high(id->defn)) + num += FIRST_HIGH_REG; + + reg->num = num; + reg->flags &= ~IR3_REG_SSA; + + if (is_half(id->defn)) + reg->flags |= IR3_REG_HALF; + } +} + +static void +ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + struct ir3_register *reg; + + if (instr->regs_count == 0) + continue; + + if (writes_gpr(instr)) { + reg_assign(ctx, instr->regs[0], instr); + if (instr->regs[0]->flags & IR3_REG_HALF) + fixup_half_instr_dst(instr); + } + + foreach_src_n(reg, n, instr) { + struct ir3_instruction *src = reg->instr; + /* Note: reg->instr could be null for IR3_REG_ARRAY */ + if (!(src || (reg->flags & IR3_REG_ARRAY))) + continue; + reg_assign(ctx, instr->regs[n+1], src); + if (instr->regs[n+1]->flags & IR3_REG_HALF) + fixup_half_instr_src(instr); + } + } +} + +static int +ra_alloc(struct ir3_ra_ctx *ctx) +{ + /* pre-assign array elements: + */ + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + unsigned base = 0; + + if (arr->end_ip == 0) + continue; + + /* figure out what else we conflict with which has already + * been assigned: + */ +retry: + list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) { + if (arr2 == arr) + break; + if (arr2->end_ip == 0) + continue; + /* if it intersects with liverange AND register range.. 
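+ * (if so, bump base past arr2 and rescan from the first array,
+ * since the new base may now overlap one we already checked)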
*/ + if (intersects(arr->start_ip, arr->end_ip, + arr2->start_ip, arr2->end_ip) && + intersects(base, base + arr->length, + arr2->reg, arr2->reg + arr2->length)) { + base = MAX2(base, arr2->reg + arr2->length); + goto retry; + } + } + + arr->reg = base; + + for (unsigned i = 0; i < arr->length; i++) { + unsigned name, reg; + + name = arr->base + i; + reg = ctx->set->gpr_to_ra_reg[0][base++]; + + ra_set_node_reg(ctx->g, name, reg); + } + } + + if (!ra_allocate(ctx->g)) + return -1; + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + ra_block_alloc(ctx, block); + } + + return 0; +} + +int ir3_ra(struct ir3 *ir, gl_shader_stage type, + bool frag_coord, bool frag_face) +{ + struct ir3_ra_ctx ctx = { + .ir = ir, + .type = type, + .frag_face = frag_face, + .set = ir->compiler->set, + }; + int ret; + + ra_init(&ctx); + ra_add_interference(&ctx); + ret = ra_alloc(&ctx); + ra_destroy(&ctx); + + return ret; +} diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c new file mode 100644 index 00000000000..6552980d90c --- /dev/null +++ b/src/freedreno/ir3/ir3_sched.c @@ -0,0 +1,818 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + + +#include "util/u_math.h" + +#include "ir3.h" + +/* + * Instruction Scheduling: + * + * A recursive depth based scheduling algo. Recursively find an eligible + * instruction to schedule from the deepest instruction (recursing through + * it's unscheduled src instructions). Normally this would result in a + * lot of re-traversal of the same instructions, so we cache results in + * instr->data (and clear cached results that would be no longer valid + * after scheduling an instruction). + * + * There are a few special cases that need to be handled, since sched + * is currently independent of register allocation. Usages of address + * register (a0.x) or predicate register (p0.x) must be serialized. Ie. + * if you have two pairs of instructions that write the same special + * register and then read it, then those pairs cannot be interleaved. + * To solve this, when we are in such a scheduling "critical section", + * and we encounter a conflicting write to a special register, we try + * to schedule any remaining instructions that use that value first. 
+ */ + +struct ir3_sched_ctx { + struct ir3_block *block; /* the current block */ + struct list_head depth_list; /* depth sorted unscheduled instrs */ + struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/ + struct ir3_instruction *addr; /* current a0.x user, if any */ + struct ir3_instruction *pred; /* current p0.x user, if any */ + bool error; +}; + +static bool is_sfu_or_mem(struct ir3_instruction *instr) +{ + return is_sfu(instr) || is_mem(instr); +} + +#define NULL_INSTR ((void *)~0) + +static void +clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) +{ + list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) { + if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr) + instr2->data = NULL; + } +} + +static void +schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) +{ + debug_assert(ctx->block == instr->block); + + /* maybe there is a better way to handle this than just stuffing + * a nop.. ideally we'd know about this constraint in the + * scheduling and depth calculation.. + */ + if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr)) + ir3_NOP(ctx->block); + + /* remove from depth list: + */ + list_delinit(&instr->node); + + if (writes_addr(instr)) { + debug_assert(ctx->addr == NULL); + ctx->addr = instr; + } + + if (writes_pred(instr)) { + debug_assert(ctx->pred == NULL); + ctx->pred = instr; + } + + instr->flags |= IR3_INSTR_MARK; + + list_addtail(&instr->node, &instr->block->instr_list); + ctx->scheduled = instr; + + if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) { + clear_cache(ctx, NULL); + } else { + /* invalidate only the necessary entries.. */ + clear_cache(ctx, instr); + } +} + +static struct ir3_instruction * +deepest(struct ir3_instruction **srcs, unsigned nsrcs) +{ + struct ir3_instruction *d = NULL; + unsigned i = 0, id = 0; + + while ((i < nsrcs) && !(d = srcs[id = i])) + i++; + + if (!d) + return NULL; + + for (; i < nsrcs; i++) + if (srcs[i] && (srcs[i]->depth > d->depth)) + d = srcs[id = i]; + + srcs[id] = NULL; + + return d; +} + +/** + * @block: the block to search in, starting from end; in first pass, + * this will be the block the instruction would be inserted into + * (but has not yet, ie. it only contains already scheduled + * instructions). For intra-block scheduling (second pass), this + * would be one of the predecessor blocks. + * @instr: the instruction to search for + * @maxd: max distance, bail after searching this # of instruction + * slots, since it means the instruction we are looking for is + * far enough away + * @pred: if true, recursively search into predecessor blocks to + * find the worst case (shortest) distance (only possible after + * individual blocks are all scheduled + */ +static unsigned +distance(struct ir3_block *block, struct ir3_instruction *instr, + unsigned maxd, bool pred) +{ + unsigned d = 0; + + list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) { + if ((n == instr) || (d >= maxd)) + return d; + /* NOTE: don't count branch/jump since we don't know yet if they will + * be eliminated later in resolve_jumps().. really should do that + * earlier so we don't have this constraint. + */ + if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR))) + d++; + } + + /* if coming from a predecessor block, assume it is assigned far + * enough away.. we'll fix up later. 
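+ * (sched_intra_block() stuffs in any nop's still missing once
+ * all blocks have been scheduled)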
+ */ + if (!pred) + return maxd; + + if (pred && (block->data != block)) { + /* Search into predecessor blocks, finding the one with the + * shortest distance, since that will be the worst case + */ + unsigned min = maxd - d; + + /* (ab)use block->data to prevent recursion: */ + block->data = block; + + for (unsigned i = 0; i < block->predecessors_count; i++) { + unsigned n; + + n = distance(block->predecessors[i], instr, min, pred); + + min = MIN2(min, n); + } + + block->data = NULL; + d += min; + } + + return d; +} + +/* calculate delay for specified src: */ +static unsigned +delay_calc_srcn(struct ir3_block *block, + struct ir3_instruction *assigner, + struct ir3_instruction *consumer, + unsigned srcn, bool soft, bool pred) +{ + unsigned delay = 0; + + if (is_meta(assigner)) { + struct ir3_instruction *src; + foreach_ssa_src(src, assigner) { + unsigned d; + d = delay_calc_srcn(block, src, consumer, srcn, soft, pred); + delay = MAX2(delay, d); + } + } else { + if (soft) { + if (is_sfu(assigner)) { + delay = 4; + } else { + delay = ir3_delayslots(assigner, consumer, srcn); + } + } else { + delay = ir3_delayslots(assigner, consumer, srcn); + } + delay -= distance(block, assigner, delay, pred); + } + + return delay; +} + +/* calculate delay for instruction (maximum of delay for all srcs): */ +static unsigned +delay_calc(struct ir3_block *block, struct ir3_instruction *instr, + bool soft, bool pred) +{ + unsigned delay = 0; + struct ir3_instruction *src; + + foreach_ssa_src_n(src, i, instr) { + unsigned d; + d = delay_calc_srcn(block, src, instr, i, soft, pred); + delay = MAX2(delay, d); + } + + return delay; +} + +struct ir3_sched_notes { + /* there is at least one kill which could be scheduled, except + * for unscheduled bary.f's: + */ + bool blocked_kill; + /* there is at least one instruction that could be scheduled, + * except for conflicting address/predicate register usage: + */ + bool addr_conflict, pred_conflict; +}; + +static bool is_scheduled(struct ir3_instruction *instr) +{ + return !!(instr->flags & IR3_INSTR_MARK); +} + +/* could an instruction be scheduled if specified ssa src was scheduled? */ +static bool +could_sched(struct ir3_instruction *instr, struct ir3_instruction *src) +{ + struct ir3_instruction *other_src; + foreach_ssa_src(other_src, instr) { + /* if dependency not scheduled, we aren't ready yet: */ + if ((src != other_src) && !is_scheduled(other_src)) { + return false; + } + } + return true; +} + +/* Check if instruction is ok to schedule. Make sure it is not blocked + * by use of addr/predicate register, etc. + */ +static bool +check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, + struct ir3_instruction *instr) +{ + /* For instructions that write address register we need to + * make sure there is at least one instruction that uses the + * addr value which is otherwise ready. + * + * TODO if any instructions use pred register and have other + * src args, we would need to do the same for writes_pred().. 
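+ *
+ * As a made-up example: if the only unscheduled indirect using
+ * this a0.x value is itself still waiting on another unscheduled
+ * src, scheduling the a0.x write now would tie up the address
+ * register without enabling any progress.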
+ */ + if (writes_addr(instr)) { + struct ir3 *ir = instr->block->shader; + bool ready = false; + for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) { + struct ir3_instruction *indirect = ir->indirects[i]; + if (!indirect) + continue; + if (indirect->address != instr) + continue; + ready = could_sched(indirect, instr); + } + + /* nothing could be scheduled, so keep looking: */ + if (!ready) + return false; + } + + /* if this is a write to address/predicate register, and that + * register is currently in use, we need to defer until it is + * free: + */ + if (writes_addr(instr) && ctx->addr) { + debug_assert(ctx->addr != instr); + notes->addr_conflict = true; + return false; + } + + if (writes_pred(instr) && ctx->pred) { + debug_assert(ctx->pred != instr); + notes->pred_conflict = true; + return false; + } + + /* if the instruction is a kill, we need to ensure *every* + * bary.f is scheduled. The hw seems unhappy if the thread + * gets killed before the end-input (ei) flag is hit. + * + * We could do this by adding each bary.f instruction as + * virtual ssa src for the kill instruction. But we have + * fixed length instr->regs[]. + * + * TODO this wouldn't be quite right if we had multiple + * basic blocks, if any block was conditional. We'd need + * to schedule the bary.f's outside of any block which + * was conditional that contained a kill.. I think.. + */ + if (is_kill(instr)) { + struct ir3 *ir = instr->block->shader; + + for (unsigned i = 0; i < ir->baryfs_count; i++) { + struct ir3_instruction *baryf = ir->baryfs[i]; + if (baryf->flags & IR3_INSTR_UNUSED) + continue; + if (!is_scheduled(baryf)) { + notes->blocked_kill = true; + return false; + } + } + } + + return true; +} + +/* Find the best instruction to schedule from specified instruction or + * recursively it's ssa sources. + */ +static struct ir3_instruction * +find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, + struct ir3_instruction *instr) +{ + struct ir3_instruction *srcs[__ssa_src_cnt(instr)]; + struct ir3_instruction *src; + unsigned nsrcs = 0; + + if (is_scheduled(instr)) + return NULL; + + /* use instr->data to cache the results of recursing up the + * instr src's. Otherwise the recursive algo can scale quite + * badly w/ shader size. But this takes some care to clear + * the cache appropriately when instructions are scheduled. + */ + if (instr->data) { + if (instr->data == NULL_INSTR) + return NULL; + return instr->data; + } + + /* find unscheduled srcs: */ + foreach_ssa_src(src, instr) { + if (!is_scheduled(src)) { + debug_assert(nsrcs < ARRAY_SIZE(srcs)); + srcs[nsrcs++] = src; + } + } + + /* if all our src's are already scheduled: */ + if (nsrcs == 0) { + if (check_instr(ctx, notes, instr)) { + instr->data = instr; + return instr; + } + return NULL; + } + + while ((src = deepest(srcs, nsrcs))) { + struct ir3_instruction *candidate; + + candidate = find_instr_recursive(ctx, notes, src); + if (!candidate) + continue; + + if (check_instr(ctx, notes, candidate)) { + instr->data = candidate; + return candidate; + } + } + + instr->data = NULL_INSTR; + return NULL; +} + +/* find instruction to schedule: */ +static struct ir3_instruction * +find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, + bool soft) +{ + struct ir3_instruction *best_instr = NULL; + unsigned min_delay = ~0; + + /* TODO we'd really rather use the list/array of block outputs. But we + * don't have such a thing. 
Recursing *every* instruction in the list + * will result in a lot of repeated traversal, since instructions will + * get traversed both when they appear as ssa src to a later instruction + * as well as where they appear in the depth_list. + */ + list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) { + struct ir3_instruction *candidate; + unsigned delay; + + candidate = find_instr_recursive(ctx, notes, instr); + if (!candidate) + continue; + + delay = delay_calc(ctx->block, candidate, soft, false); + if (delay < min_delay) { + best_instr = candidate; + min_delay = delay; + } + + if (min_delay == 0) + break; + } + + return best_instr; +} + +/* "spill" the address register by remapping any unscheduled + * instructions which depend on the current address register + * to a clone of the instruction which wrote the address reg. + */ +static struct ir3_instruction * +split_addr(struct ir3_sched_ctx *ctx) +{ + struct ir3 *ir; + struct ir3_instruction *new_addr = NULL; + unsigned i; + + debug_assert(ctx->addr); + + ir = ctx->addr->block->shader; + + for (i = 0; i < ir->indirects_count; i++) { + struct ir3_instruction *indirect = ir->indirects[i]; + + if (!indirect) + continue; + + /* skip instructions already scheduled: */ + if (is_scheduled(indirect)) + continue; + + /* remap remaining instructions using current addr + * to new addr: + */ + if (indirect->address == ctx->addr) { + if (!new_addr) { + new_addr = ir3_instr_clone(ctx->addr); + /* original addr is scheduled, but new one isn't: */ + new_addr->flags &= ~IR3_INSTR_MARK; + } + ir3_instr_set_address(indirect, new_addr); + } + } + + /* all remaining indirects remapped to new addr: */ + ctx->addr = NULL; + + return new_addr; +} + +/* "spill" the predicate register by remapping any unscheduled + * instructions which depend on the current predicate register + * to a clone of the instruction which wrote the address reg. + */ +static struct ir3_instruction * +split_pred(struct ir3_sched_ctx *ctx) +{ + struct ir3 *ir; + struct ir3_instruction *new_pred = NULL; + unsigned i; + + debug_assert(ctx->pred); + + ir = ctx->pred->block->shader; + + for (i = 0; i < ir->predicates_count; i++) { + struct ir3_instruction *predicated = ir->predicates[i]; + + /* skip instructions already scheduled: */ + if (is_scheduled(predicated)) + continue; + + /* remap remaining instructions using current pred + * to new pred: + * + * TODO is there ever a case when pred isn't first + * (and only) src? + */ + if (ssa(predicated->regs[1]) == ctx->pred) { + if (!new_pred) { + new_pred = ir3_instr_clone(ctx->pred); + /* original pred is scheduled, but new one isn't: */ + new_pred->flags &= ~IR3_INSTR_MARK; + } + predicated->regs[1]->instr = new_pred; + } + } + + /* all remaining predicated remapped to new pred: */ + ctx->pred = NULL; + + return new_pred; +} + +static void +sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) +{ + struct list_head unscheduled_list; + + ctx->block = block; + + /* addr/pred writes are per-block: */ + ctx->addr = NULL; + ctx->pred = NULL; + + /* move all instructions to the unscheduled list, and + * empty the block's instruction list (to which we will + * be inserting). 
+ */ + list_replace(&block->instr_list, &unscheduled_list); + list_inithead(&block->instr_list); + list_inithead(&ctx->depth_list); + + /* first a pre-pass to schedule all meta:input instructions + * (which need to appear first so that RA knows the register is + * occupied), and move remaining to depth sorted list: + */ + list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) { + if (instr->opc == OPC_META_INPUT) { + schedule(ctx, instr); + } else { + ir3_insert_by_depth(instr, &ctx->depth_list); + } + } + + while (!list_empty(&ctx->depth_list)) { + struct ir3_sched_notes notes = {0}; + struct ir3_instruction *instr; + + instr = find_eligible_instr(ctx, ¬es, true); + if (!instr) + instr = find_eligible_instr(ctx, ¬es, false); + + if (instr) { + unsigned delay = delay_calc(ctx->block, instr, false, false); + + /* and if we run out of instructions that can be scheduled, + * then it is time for nop's: + */ + debug_assert(delay <= 6); + while (delay > 0) { + ir3_NOP(block); + delay--; + } + + schedule(ctx, instr); + } else { + struct ir3_instruction *new_instr = NULL; + + /* nothing available to schedule.. if we are blocked on + * address/predicate register conflict, then break the + * deadlock by cloning the instruction that wrote that + * reg: + */ + if (notes.addr_conflict) { + new_instr = split_addr(ctx); + } else if (notes.pred_conflict) { + new_instr = split_pred(ctx); + } else { + debug_assert(0); + ctx->error = true; + return; + } + + if (new_instr) { + /* clearing current addr/pred can change what is + * available to schedule, so clear cache.. + */ + clear_cache(ctx, NULL); + + ir3_insert_by_depth(new_instr, &ctx->depth_list); + /* the original instr that wrote addr/pred may have + * originated from a different block: + */ + new_instr->block = block; + } + } + } + + /* And lastly, insert branch/jump instructions to take us to + * the next block. Later we'll strip back out the branches + * that simply jump to next instruction. + */ + if (block->successors[1]) { + /* if/else, conditional branches to "then" or "else": */ + struct ir3_instruction *br; + unsigned delay = 6; + + debug_assert(ctx->pred); + debug_assert(block->condition); + + delay -= distance(ctx->block, ctx->pred, delay, false); + + while (delay > 0) { + ir3_NOP(block); + delay--; + } + + /* create "else" branch first (since "then" block should + * frequently/always end up being a fall-thru): + */ + br = ir3_BR(block); + br->cat0.inv = true; + br->cat0.target = block->successors[1]; + + /* NOTE: we have to hard code delay of 6 above, since + * we want to insert the nop's before constructing the + * branch. Throw in an assert so we notice if this + * ever breaks on future generation: + */ + debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6); + + br = ir3_BR(block); + br->cat0.target = block->successors[0]; + + } else if (block->successors[0]) { + /* otherwise unconditional jump to next block: */ + struct ir3_instruction *jmp; + + jmp = ir3_JUMP(block); + jmp->cat0.target = block->successors[0]; + } + + /* NOTE: if we kept track of the predecessors, we could do a better + * job w/ (jp) flags.. every node w/ > predecessor is a join point. + * Note that as we eliminate blocks which contain only an unconditional + * jump we probably need to propagate (jp) flag.. + */ +} + +/* After scheduling individual blocks, we still could have cases where + * one (or more) paths into a block, a value produced by a previous + * has too few delay slots to be legal. 
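+ * (e.g. a value computed at the very tail of one predecessor block and
+ * consumed by the first instruction of this block)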
We can't deal with this in the + * first pass, because loops (ie. we can't ensure all predecessor blocks + * are already scheduled in the first pass). All we can really do at + * this point is stuff in extra nop's until things are legal. + */ +static void +sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) +{ + unsigned n = 0; + + ctx->block = block; + + list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) { + unsigned delay = 0; + + for (unsigned i = 0; i < block->predecessors_count; i++) { + unsigned d = delay_calc(block->predecessors[i], instr, false, true); + delay = MAX2(d, delay); + } + + while (delay > n) { + struct ir3_instruction *nop = ir3_NOP(block); + + /* move to before instr: */ + list_delinit(&nop->node); + list_addtail(&nop->node, &instr->node); + + n++; + } + + /* we can bail once we hit worst case delay: */ + if (++n > 6) + break; + } +} + +int ir3_sched(struct ir3 *ir) +{ + struct ir3_sched_ctx ctx = {0}; + + ir3_clear_mark(ir); + + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + sched_block(&ctx, block); + } + + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + sched_intra_block(&ctx, block); + } + + if (ctx.error) + return -1; + return 0; +} + +/* does instruction 'prior' need to be scheduled before 'instr'? */ +static bool +depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior) +{ + /* TODO for dependencies that are related to a specific object, ie + * a specific SSBO/image/array, we could relax this constraint to + * make accesses to unrelated objects not depend on each other (at + * least as long as not declared coherent) + */ + if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) || + ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class)) + return true; + return !!(instr->barrier_class & prior->barrier_conflict); +} + +static void +add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr) +{ + struct list_head *prev = instr->node.prev; + struct list_head *next = instr->node.next; + + /* add dependencies on previous instructions that must be scheduled + * prior to the current instruction + */ + while (prev != &block->instr_list) { + struct ir3_instruction *pi = + LIST_ENTRY(struct ir3_instruction, prev, node); + + prev = prev->prev; + + if (is_meta(pi)) + continue; + + if (instr->barrier_class == pi->barrier_class) { + ir3_instr_add_dep(instr, pi); + break; + } + + if (depends_on(instr, pi)) + ir3_instr_add_dep(instr, pi); + } + + /* add dependencies on this instruction to following instructions + * that must be scheduled after the current instruction: + */ + while (next != &block->instr_list) { + struct ir3_instruction *ni = + LIST_ENTRY(struct ir3_instruction, next, node); + + next = next->next; + + if (is_meta(ni)) + continue; + + if (instr->barrier_class == ni->barrier_class) { + ir3_instr_add_dep(ni, instr); + break; + } + + if (depends_on(ni, instr)) + ir3_instr_add_dep(ni, instr); + } +} + +/* before scheduling a block, we need to add any necessary false-dependencies + * to ensure that: + * + * (1) barriers are scheduled in the right order wrt instructions related + * to the barrier + * + * (2) reads that come before a write actually get scheduled before the + * write + */ +static void +calculate_deps(struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + if (instr->barrier_class) { + add_barrier_deps(block, instr); + } + } +} + 
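The two entry points that follow are meant to run back-to-back: ir3_sched_add_deps() first, so the false dependencies from calculate_deps() are in place, then the per-block list scheduling in ir3_sched(). A minimal sketch of that calling sequence; the wrapper name is hypothetical and only the two ir3_* calls are from this patch:

#include "ir3.h"

/* hypothetical driver-side wrapper, for illustration only: */
static int
schedule_shader(struct ir3 *ir)
{
	/* add barrier false-deps to each block: */
	ir3_sched_add_deps(ir);

	/* list-schedule each block; returns -1 if the addr/pred
	 * deadlock fallback could not make progress:
	 */
	return ir3_sched(ir);
}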
+void
+ir3_sched_add_deps(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		calculate_deps(block);
+	}
+}
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
new file mode 100644
index 00000000000..8b18e950cca
--- /dev/null
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -0,0 +1,436 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_format.h"
+
+#include "drm/freedreno_drmif.h"
+
+#include "ir3_shader.h"
+#include "ir3_compiler.h"
+#include "ir3_nir.h"
+
+int
+ir3_glsl_type_size(const struct glsl_type *type)
+{
+	return glsl_count_attribute_slots(type, false);
+}
+
+static void
+delete_variant(struct ir3_shader_variant *v)
+{
+	if (v->ir)
+		ir3_destroy(v->ir);
+	if (v->bo)
+		fd_bo_del(v->bo);
+	if (v->immediates)
+		free(v->immediates);
+	free(v);
+}
+
+/* for vertex shaders, the inputs are loaded into registers before the shader
+ * is executed, so max_reg from the shader instructions might not properly
+ * reflect the # of registers actually used, especially in the case of
+ * passthrough varyings.
+ *
+ * Likewise, for fragment shaders, we can have some regs which are passed
+ * input values but never touched by the resulting shader (ie. as a result
+ * of dead code elimination or simply because we don't know how to turn
+ * the reg off).
+ */
+static void
+fixup_regfootprint(struct ir3_shader_variant *v)
+{
+	unsigned i;
+
+	for (i = 0; i < v->inputs_count; i++) {
+		/* skip frag inputs fetched via bary.f since their regs are
+		 * not written by the gpu before the shader starts (and in
+		 * fact the regids might not even be valid)
+		 */
+		if (v->inputs[i].bary)
+			continue;
+
+		/* ignore high regs that are global to all threads in a warp
+		 * (they exist by default) (a5xx+)
+		 */
+		if (v->inputs[i].regid >= regid(48,0))
+			continue;
+
+		if (v->inputs[i].compmask) {
+			unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
+			int32_t regid = (v->inputs[i].regid + n) >> 2;
+			v->info.max_reg = MAX2(v->info.max_reg, regid);
+		}
+	}
+
+	for (i = 0; i < v->outputs_count; i++) {
+		int32_t regid = (v->outputs[i].regid + 3) >> 2;
+		v->info.max_reg = MAX2(v->info.max_reg, regid);
+	}
+}
+
+/* wrapper for ir3_assemble() which does some info fixup based on
+ * shader state.  Non-static since used by ir3_cmdline too.
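+ * (it derives instrlen from the assembled size, clamps constlen against
+ * the hw limit, and calls fixup_regfootprint())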
+ */ +void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id) +{ + void *bin; + + bin = ir3_assemble(v->ir, &v->info, gpu_id); + if (!bin) + return NULL; + + if (gpu_id >= 400) { + v->instrlen = v->info.sizedwords / (2 * 16); + } else { + v->instrlen = v->info.sizedwords / (2 * 4); + } + + /* NOTE: if relative addressing is used, we set constlen in + * the compiler (to worst-case value) since we don't know in + * the assembler what the max addr reg value can be: + */ + v->constlen = MIN2(255, MAX2(v->constlen, v->info.max_const + 1)); + + fixup_regfootprint(v); + + return bin; +} + +static void +assemble_variant(struct ir3_shader_variant *v) +{ + struct ir3_compiler *compiler = v->shader->compiler; + uint32_t gpu_id = compiler->gpu_id; + uint32_t sz, *bin; + + bin = ir3_shader_assemble(v, gpu_id); + sz = v->info.sizedwords * 4; + + v->bo = fd_bo_new(compiler->dev, sz, + DRM_FREEDRENO_GEM_CACHE_WCOMBINE | + DRM_FREEDRENO_GEM_TYPE_KMEM); + + memcpy(fd_bo_map(v->bo), bin, sz); + + if (ir3_shader_debug & IR3_DBG_DISASM) { + struct ir3_shader_key key = v->key; + printf("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type, + v->binning_pass, key.color_two_side, key.half_precision); + ir3_shader_disasm(v, bin, stdout); + } + + if (shader_debug_enabled(v->shader->type)) { + fprintf(stderr, "Native code for unnamed %s shader %s:\n", + _mesa_shader_stage_to_string(v->shader->type), + v->shader->nir->info.name); + if (v->shader->type == MESA_SHADER_FRAGMENT) + fprintf(stderr, "SIMD0\n"); + ir3_shader_disasm(v, bin, stderr); + } + + free(bin); + + /* no need to keep the ir around beyond this point: */ + ir3_destroy(v->ir); + v->ir = NULL; +} + +static struct ir3_shader_variant * +create_variant(struct ir3_shader *shader, struct ir3_shader_key *key, + bool binning_pass) +{ + struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant); + int ret; + + if (!v) + return NULL; + + v->id = ++shader->variant_count; + v->shader = shader; + v->binning_pass = binning_pass; + v->key = *key; + v->type = shader->type; + + ret = ir3_compile_shader_nir(shader->compiler, v); + if (ret) { + debug_error("compile failed!"); + goto fail; + } + + assemble_variant(v); + if (!v->bo) { + debug_error("assemble failed!"); + goto fail; + } + + return v; + +fail: + delete_variant(v); + return NULL; +} + +static inline struct ir3_shader_variant * +shader_variant(struct ir3_shader *shader, struct ir3_shader_key *key, + bool *created) +{ + struct ir3_shader_variant *v; + + *created = false; + + for (v = shader->variants; v; v = v->next) + if (ir3_shader_key_equal(key, &v->key)) + return v; + + /* compile new variant if it doesn't exist already: */ + v = create_variant(shader, key, false); + if (v) { + v->next = shader->variants; + shader->variants = v; + *created = true; + } + + return v; +} + +struct ir3_shader_variant * +ir3_shader_get_variant(struct ir3_shader *shader, struct ir3_shader_key *key, + bool binning_pass, bool *created) +{ + struct ir3_shader_variant *v = + shader_variant(shader, key, created); + + if (binning_pass) { + if (!v->binning) + v->binning = create_variant(shader, key, true); + return v->binning; + } + + return v; +} + +void +ir3_shader_destroy(struct ir3_shader *shader) +{ + struct ir3_shader_variant *v, *t; + for (v = shader->variants; v; ) { + t = v; + v = v->next; + delete_variant(t); + } + ralloc_free(shader->nir); + free(shader); +} + +struct ir3_shader * +ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir) +{ + struct ir3_shader *shader = 
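/* note: CALLOC_STRUCT returns zeroed memory, so the variants list starts out empty */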
CALLOC_STRUCT(ir3_shader); + + shader->compiler = compiler; + shader->id = ++shader->compiler->shader_count; + shader->type = nir->info.stage; + + NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size, + (nir_lower_io_options)0); + + /* do first pass optimization, ignoring the key: */ + shader->nir = ir3_optimize_nir(shader, nir, NULL); + if (ir3_shader_debug & IR3_DBG_DISASM) { + printf("dump nir%d: type=%d", shader->id, shader->type); + nir_print_shader(shader->nir, stdout); + } + + return shader; +} + +static void dump_reg(FILE *out, const char *name, uint32_t r) +{ + if (r != regid(63,0)) + fprintf(out, "; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]); +} + +static void dump_output(FILE *out, struct ir3_shader_variant *so, + unsigned slot, const char *name) +{ + uint32_t regid; + regid = ir3_find_output_regid(so, slot); + dump_reg(out, name, regid); +} + +void +ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out) +{ + struct ir3 *ir = so->ir; + struct ir3_register *reg; + const char *type = ir3_shader_stage(so->shader); + uint8_t regid; + unsigned i; + + for (i = 0; i < ir->ninputs; i++) { + if (!ir->inputs[i]) { + fprintf(out, "; in%d unused\n", i); + continue; + } + reg = ir->inputs[i]->regs[0]; + regid = reg->num; + fprintf(out, "@in(%sr%d.%c)\tin%d\n", + (reg->flags & IR3_REG_HALF) ? "h" : "", + (regid >> 2), "xyzw"[regid & 0x3], i); + } + + for (i = 0; i < ir->noutputs; i++) { + if (!ir->outputs[i]) { + fprintf(out, "; out%d unused\n", i); + continue; + } + /* kill shows up as a virtual output.. skip it! */ + if (is_kill(ir->outputs[i])) + continue; + reg = ir->outputs[i]->regs[0]; + regid = reg->num; + fprintf(out, "@out(%sr%d.%c)\tout%d\n", + (reg->flags & IR3_REG_HALF) ? "h" : "", + (regid >> 2), "xyzw"[regid & 0x3], i); + } + + for (i = 0; i < so->immediates_count; i++) { + fprintf(out, "@const(c%d.x)\t", so->constbase.immediate + i); + fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n", + so->immediates[i].val[0], + so->immediates[i].val[1], + so->immediates[i].val[2], + so->immediates[i].val[3]); + } + + disasm_a3xx(bin, so->info.sizedwords, 0, out); + + switch (so->type) { + case MESA_SHADER_VERTEX: + fprintf(out, "; %s: outputs:", type); + for (i = 0; i < so->outputs_count; i++) { + uint8_t regid = so->outputs[i].regid; + fprintf(out, " r%d.%c (%s)", + (regid >> 2), "xyzw"[regid & 0x3], + gl_varying_slot_name(so->outputs[i].slot)); + } + fprintf(out, "\n"); + fprintf(out, "; %s: inputs:", type); + for (i = 0; i < so->inputs_count; i++) { + uint8_t regid = so->inputs[i].regid; + fprintf(out, " r%d.%c (cm=%x,il=%u,b=%u)", + (regid >> 2), "xyzw"[regid & 0x3], + so->inputs[i].compmask, + so->inputs[i].inloc, + so->inputs[i].bary); + } + fprintf(out, "\n"); + break; + case MESA_SHADER_FRAGMENT: + fprintf(out, "; %s: outputs:", type); + for (i = 0; i < so->outputs_count; i++) { + uint8_t regid = so->outputs[i].regid; + fprintf(out, " r%d.%c (%s)", + (regid >> 2), "xyzw"[regid & 0x3], + gl_frag_result_name(so->outputs[i].slot)); + } + fprintf(out, "\n"); + fprintf(out, "; %s: inputs:", type); + for (i = 0; i < so->inputs_count; i++) { + uint8_t regid = so->inputs[i].regid; + fprintf(out, " r%d.%c (%s,cm=%x,il=%u,b=%u)", + (regid >> 2), "xyzw"[regid & 0x3], + gl_varying_slot_name(so->inputs[i].slot), + so->inputs[i].compmask, + so->inputs[i].inloc, + so->inputs[i].bary); + } + fprintf(out, "\n"); + break; + default: + /* TODO */ + break; + } + + /* print generic shader info: */ + fprintf(out, "; %s prog %d/%d: %u instructions, %d half, %d full\n", + 
type, so->shader->id, so->id, + so->info.instrs_count, + so->info.max_half_reg + 1, + so->info.max_reg + 1); + + fprintf(out, "; %d const, %u constlen\n", + so->info.max_const + 1, + so->constlen); + + fprintf(out, "; %u (ss), %u (sy)\n", so->info.ss, so->info.sy); + + /* print shader type specific info: */ + switch (so->type) { + case MESA_SHADER_VERTEX: + dump_output(out, so, VARYING_SLOT_POS, "pos"); + dump_output(out, so, VARYING_SLOT_PSIZ, "psize"); + break; + case MESA_SHADER_FRAGMENT: + dump_reg(out, "pos (bary)", + ir3_find_sysval_regid(so, SYSTEM_VALUE_VARYING_COORD)); + dump_output(out, so, FRAG_RESULT_DEPTH, "posz"); + if (so->color0_mrt) { + dump_output(out, so, FRAG_RESULT_COLOR, "color"); + } else { + dump_output(out, so, FRAG_RESULT_DATA0, "data0"); + dump_output(out, so, FRAG_RESULT_DATA1, "data1"); + dump_output(out, so, FRAG_RESULT_DATA2, "data2"); + dump_output(out, so, FRAG_RESULT_DATA3, "data3"); + dump_output(out, so, FRAG_RESULT_DATA4, "data4"); + dump_output(out, so, FRAG_RESULT_DATA5, "data5"); + dump_output(out, so, FRAG_RESULT_DATA6, "data6"); + dump_output(out, so, FRAG_RESULT_DATA7, "data7"); + } + /* these two are hard-coded since we don't know how to + * program them to anything but all 0's... + */ + if (so->frag_coord) + fprintf(out, "; fragcoord: r0.x\n"); + if (so->frag_face) + fprintf(out, "; fragface: hr0.x\n"); + break; + default: + /* TODO */ + break; + } + + fprintf(out, "\n"); +} + +uint64_t +ir3_shader_outputs(const struct ir3_shader *so) +{ + return so->nir->info.outputs_written; +} diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h new file mode 100644 index 00000000000..bc47160d6ea --- /dev/null +++ b/src/freedreno/ir3/ir3_shader.h @@ -0,0 +1,587 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Rob Clark <[email protected]> + */ + +#ifndef IR3_SHADER_H_ +#define IR3_SHADER_H_ + +#include <stdio.h> + +#include "compiler/shader_enums.h" +#include "compiler/nir/nir.h" +#include "util/bitscan.h" + +#include "ir3.h" + +struct glsl_type; + +/* driver param indices: */ +enum ir3_driver_param { + /* compute shader driver params: */ + IR3_DP_NUM_WORK_GROUPS_X = 0, + IR3_DP_NUM_WORK_GROUPS_Y = 1, + IR3_DP_NUM_WORK_GROUPS_Z = 2, + IR3_DP_LOCAL_GROUP_SIZE_X = 4, + IR3_DP_LOCAL_GROUP_SIZE_Y = 5, + IR3_DP_LOCAL_GROUP_SIZE_Z = 6, + /* NOTE: gl_NumWorkGroups should be vec4 aligned because + * glDispatchComputeIndirect() needs to load these from + * the info->indirect buffer. Keep that in mind when/if + * adding any addition CS driver params. + */ + IR3_DP_CS_COUNT = 8, /* must be aligned to vec4 */ + + /* vertex shader driver params: */ + IR3_DP_VTXID_BASE = 0, + IR3_DP_VTXCNT_MAX = 1, + /* user-clip-plane components, up to 8x vec4's: */ + IR3_DP_UCP0_X = 4, + /* .... */ + IR3_DP_UCP7_W = 35, + IR3_DP_VS_COUNT = 36 /* must be aligned to vec4 */ +}; + +#define IR3_MAX_SHADER_BUFFERS 32 +#define IR3_MAX_SHADER_IMAGES 32 +#define IR3_MAX_SO_BUFFERS 4 +#define IR3_MAX_SO_OUTPUTS 64 + +/** + * For consts needed to pass internal values to shader which may or may not + * be required, rather than allocating worst-case const space, we scan the + * shader and allocate consts as-needed: + * + * + SSBO sizes: only needed if shader has a get_buffer_size intrinsic + * for a given SSBO + * + * + Image dimensions: needed to calculate pixel offset, but only for + * images that have a image_store intrinsic + */ +struct ir3_driver_const_layout { + struct { + uint32_t mask; /* bitmask of SSBOs that have get_buffer_size */ + uint32_t count; /* number of consts allocated */ + /* one const allocated per SSBO which has get_buffer_size, + * ssbo_sizes.off[ssbo_id] is offset from start of ssbo_sizes + * consts: + */ + uint32_t off[IR3_MAX_SHADER_BUFFERS]; + } ssbo_size; + + struct { + uint32_t mask; /* bitmask of images that have image_store */ + uint32_t count; /* number of consts allocated */ + /* three const allocated per image which has image_store: + * + cpp (bytes per pixel) + * + pitch (y pitch) + * + array_pitch (z pitch) + */ + uint32_t off[IR3_MAX_SHADER_IMAGES]; + } image_dims; +}; + +/** + * A single output for vertex transform feedback. + */ +struct ir3_stream_output { + unsigned register_index:6; /**< 0 to 63 (OUT index) */ + unsigned start_component:2; /** 0 to 3 */ + unsigned num_components:3; /** 1 to 4 */ + unsigned output_buffer:3; /**< 0 to PIPE_MAX_SO_BUFFERS */ + unsigned dst_offset:16; /**< offset into the buffer in dwords */ + unsigned stream:2; /**< 0 to 3 */ +}; + +/** + * Stream output for vertex transform feedback. + */ +struct ir3_stream_output_info { + unsigned num_outputs; + /** stride for an entire vertex for each buffer in dwords */ + uint16_t stride[IR3_MAX_SO_BUFFERS]; + + /** + * Array of stream outputs, in the order they are to be written in. + * Selected components are tightly packed into the output buffer. + */ + struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS]; +}; + +/* Configuration key used to identify a shader variant.. different + * shader variants can be used to implement features not supported + * in hw (two sided color), binning-pass vertex shader, etc. + */ +struct ir3_shader_key { + union { + struct { + /* + * Combined Vertex/Fragment shader parameters: + */ + unsigned ucp_enables : 8; + + /* do we need to check {v,f}saturate_{s,t,r}? 
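+			 * (if set, ir3_shader_key_equal() falls back to a full
+			 * memcmp of the key)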
*/ + unsigned has_per_samp : 1; + + /* + * Vertex shader variant parameters: + */ + unsigned vclamp_color : 1; + + /* + * Fragment shader variant parameters: + */ + unsigned color_two_side : 1; + unsigned half_precision : 1; + /* used when shader needs to handle flat varyings (a4xx) + * for front/back color inputs to frag shader: + */ + unsigned rasterflat : 1; + unsigned fclamp_color : 1; + }; + uint32_t global; + }; + + /* bitmask of sampler which needs coords clamped for vertex + * shader: + */ + uint16_t vsaturate_s, vsaturate_t, vsaturate_r; + + /* bitmask of sampler which needs coords clamped for frag + * shader: + */ + uint16_t fsaturate_s, fsaturate_t, fsaturate_r; + + /* bitmask of ms shifts */ + uint32_t vsamples, fsamples; + + /* bitmask of samplers which need astc srgb workaround: */ + uint16_t vastc_srgb, fastc_srgb; +}; + +static inline bool +ir3_shader_key_equal(struct ir3_shader_key *a, struct ir3_shader_key *b) +{ + /* slow-path if we need to check {v,f}saturate_{s,t,r} */ + if (a->has_per_samp || b->has_per_samp) + return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0; + return a->global == b->global; +} + +/* will the two keys produce different lowering for a fragment shader? */ +static inline bool +ir3_shader_key_changes_fs(struct ir3_shader_key *key, struct ir3_shader_key *last_key) +{ + if (last_key->has_per_samp || key->has_per_samp) { + if ((last_key->fsaturate_s != key->fsaturate_s) || + (last_key->fsaturate_t != key->fsaturate_t) || + (last_key->fsaturate_r != key->fsaturate_r) || + (last_key->fsamples != key->fsamples) || + (last_key->fastc_srgb != key->fastc_srgb)) + return true; + } + + if (last_key->fclamp_color != key->fclamp_color) + return true; + + if (last_key->color_two_side != key->color_two_side) + return true; + + if (last_key->half_precision != key->half_precision) + return true; + + if (last_key->rasterflat != key->rasterflat) + return true; + + if (last_key->ucp_enables != key->ucp_enables) + return true; + + return false; +} + +/* will the two keys produce different lowering for a vertex shader? 
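+ * (same idea as ir3_shader_key_changes_fs above, restricted to the
+ * fields the vertex stage is keyed on)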
*/ +static inline bool +ir3_shader_key_changes_vs(struct ir3_shader_key *key, struct ir3_shader_key *last_key) +{ + if (last_key->has_per_samp || key->has_per_samp) { + if ((last_key->vsaturate_s != key->vsaturate_s) || + (last_key->vsaturate_t != key->vsaturate_t) || + (last_key->vsaturate_r != key->vsaturate_r) || + (last_key->vsamples != key->vsamples) || + (last_key->vastc_srgb != key->vastc_srgb)) + return true; + } + + if (last_key->vclamp_color != key->vclamp_color) + return true; + + if (last_key->ucp_enables != key->ucp_enables) + return true; + + return false; +} + +/* clears shader-key flags which don't apply to the given shader + * stage + */ +static inline void +ir3_normalize_key(struct ir3_shader_key *key, gl_shader_stage type) +{ + switch (type) { + case MESA_SHADER_FRAGMENT: + if (key->has_per_samp) { + key->vsaturate_s = 0; + key->vsaturate_t = 0; + key->vsaturate_r = 0; + key->vastc_srgb = 0; + key->vsamples = 0; + } + break; + case MESA_SHADER_VERTEX: + key->color_two_side = false; + key->half_precision = false; + key->rasterflat = false; + if (key->has_per_samp) { + key->fsaturate_s = 0; + key->fsaturate_t = 0; + key->fsaturate_r = 0; + key->fastc_srgb = 0; + key->fsamples = 0; + } + break; + default: + /* TODO */ + break; + } + +} + +struct ir3_shader_variant { + struct fd_bo *bo; + + /* variant id (for debug) */ + uint32_t id; + + struct ir3_shader_key key; + + /* vertex shaders can have an extra version for hwbinning pass, + * which is pointed to by so->binning: + */ + bool binning_pass; + struct ir3_shader_variant *binning; + + struct ir3_driver_const_layout const_layout; + struct ir3_info info; + struct ir3 *ir; + + /* the instructions length is in units of instruction groups + * (4 instructions for a3xx, 16 instructions for a4xx.. each + * instruction is 2 dwords): + */ + unsigned instrlen; + + /* the constants length is in units of vec4's, and is the sum of + * the uniforms and the built-in compiler constants + */ + unsigned constlen; + + /* number of uniforms (in vec4), not including built-in compiler + * constants, etc. + */ + unsigned num_uniforms; + + unsigned num_ubos; + + /* About Linkage: + * + Let the frag shader determine the position/compmask for the + * varyings, since it is the place where we know if the varying + * is actually used, and if so, which components are used. So + * what the hw calls "outloc" is taken from the "inloc" of the + * frag shader. + * + From the vert shader, we only need the output regid + */ + + bool frag_coord, frag_face, color0_mrt; + + /* NOTE: for input/outputs, slot is: + * gl_vert_attrib - for VS inputs + * gl_varying_slot - for VS output / FS input + * gl_frag_result - for FS output + */ + + /* varyings/outputs: */ + unsigned outputs_count; + struct { + uint8_t slot; + uint8_t regid; + } outputs[16 + 2]; /* +POSITION +PSIZE */ + bool writes_pos, writes_psize; + + /* attributes (VS) / varyings (FS): + * Note that sysval's should come *after* normal inputs. + */ + unsigned inputs_count; + struct { + uint8_t slot; + uint8_t regid; + uint8_t compmask; + uint8_t ncomp; + /* location of input (ie. offset passed to bary.f, etc). 
This + * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx + * have the OUTLOCn value offset by 8, presumably to account + * for gl_Position/gl_PointSize) + */ + uint8_t inloc; + /* vertex shader specific: */ + bool sysval : 1; /* slot is a gl_system_value */ + /* fragment shader specific: */ + bool bary : 1; /* fetched varying (vs one loaded into reg) */ + bool rasterflat : 1; /* special handling for emit->rasterflat */ + enum glsl_interp_mode interpolate; + } inputs[16 + 2]; /* +POSITION +FACE */ + + /* sum of input components (scalar). For frag shaders, it only counts + * the varying inputs: + */ + unsigned total_in; + + /* For frag shaders, the total number of inputs (not scalar, + * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR) + */ + unsigned varying_in; + + /* number of samplers/textures (which are currently 1:1): */ + int num_samp; + + /* do we have one or more SSBO instructions: */ + bool has_ssbo; + + /* do we have kill instructions: */ + bool has_kill; + + /* Layout of constant registers, each section (in vec4). Pointer size + * is 32b (a3xx, a4xx), or 64b (a5xx+), which effects the size of the + * UBO and stream-out consts. + */ + struct { + /* user const start at zero */ + unsigned ubo; + /* NOTE that a3xx might need a section for SSBO addresses too */ + unsigned ssbo_sizes; + unsigned image_dims; + unsigned driver_param; + unsigned tfbo; + unsigned immediate; + } constbase; + + unsigned immediates_count; + unsigned immediates_size; + struct { + uint32_t val[4]; + } *immediates; + + /* for astc srgb workaround, the number/base of additional + * alpha tex states we need, and index of original tex states + */ + struct { + unsigned base, count; + unsigned orig_idx[16]; + } astc_srgb; + + /* shader variants form a linked list: */ + struct ir3_shader_variant *next; + + /* replicated here to avoid passing extra ptrs everywhere: */ + gl_shader_stage type; + struct ir3_shader *shader; +}; + +struct ir3_shader { + gl_shader_stage type; + + /* shader id (for debug): */ + uint32_t id; + uint32_t variant_count; + + /* so we know when we can disable TGSI related hacks: */ + bool from_tgsi; + + struct ir3_compiler *compiler; + + struct nir_shader *nir; + struct ir3_stream_output_info stream_output; + + struct ir3_shader_variant *variants; +}; + +void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id); +struct ir3_shader_variant * ir3_shader_get_variant(struct ir3_shader *shader, + struct ir3_shader_key *key, bool binning_pass, bool *created); +struct ir3_shader * ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir); +void ir3_shader_destroy(struct ir3_shader *shader); +void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out); +uint64_t ir3_shader_outputs(const struct ir3_shader *so); + +int +ir3_glsl_type_size(const struct glsl_type *type); + +static inline const char * +ir3_shader_stage(struct ir3_shader *shader) +{ + switch (shader->type) { + case MESA_SHADER_VERTEX: return "VERT"; + case MESA_SHADER_FRAGMENT: return "FRAG"; + case MESA_SHADER_COMPUTE: return "CL"; + default: + unreachable("invalid type"); + return NULL; + } +} + +/* + * Helper/util: + */ + +static inline int +ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot) +{ + int j; + + for (j = 0; j < so->outputs_count; j++) + if (so->outputs[j].slot == slot) + return j; + + /* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n] + * in the vertex shader.. 
but the fragment shader doesn't know this + * so it will always have both IN.COLOR[n] and IN.BCOLOR[n]. So + * at link time if there is no matching OUT.BCOLOR[n], we must map + * OUT.COLOR[n] to IN.BCOLOR[n]. And visa versa if there is only + * a OUT.BCOLOR[n] but no matching OUT.COLOR[n] + */ + if (slot == VARYING_SLOT_BFC0) { + slot = VARYING_SLOT_COL0; + } else if (slot == VARYING_SLOT_BFC1) { + slot = VARYING_SLOT_COL1; + } else if (slot == VARYING_SLOT_COL0) { + slot = VARYING_SLOT_BFC0; + } else if (slot == VARYING_SLOT_COL1) { + slot = VARYING_SLOT_BFC1; + } else { + return 0; + } + + for (j = 0; j < so->outputs_count; j++) + if (so->outputs[j].slot == slot) + return j; + + debug_assert(0); + + return 0; +} + +static inline int +ir3_next_varying(const struct ir3_shader_variant *so, int i) +{ + while (++i < so->inputs_count) + if (so->inputs[i].compmask && so->inputs[i].bary) + break; + return i; +} + +struct ir3_shader_linkage { + uint8_t max_loc; + uint8_t cnt; + struct { + uint8_t regid; + uint8_t compmask; + uint8_t loc; + } var[32]; +}; + +static inline void +ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid, uint8_t compmask, uint8_t loc) +{ + int i = l->cnt++; + + debug_assert(i < ARRAY_SIZE(l->var)); + + l->var[i].regid = regid; + l->var[i].compmask = compmask; + l->var[i].loc = loc; + l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask)); +} + +static inline void +ir3_link_shaders(struct ir3_shader_linkage *l, + const struct ir3_shader_variant *vs, + const struct ir3_shader_variant *fs) +{ + int j = -1, k; + + while (l->cnt < ARRAY_SIZE(l->var)) { + j = ir3_next_varying(fs, j); + + if (j >= fs->inputs_count) + break; + + if (fs->inputs[j].inloc >= fs->total_in) + continue; + + k = ir3_find_output(vs, fs->inputs[j].slot); + + ir3_link_add(l, vs->outputs[k].regid, + fs->inputs[j].compmask, fs->inputs[j].inloc); + } +} + +static inline uint32_t +ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot) +{ + int j; + for (j = 0; j < so->outputs_count; j++) + if (so->outputs[j].slot == slot) + return so->outputs[j].regid; + return regid(63, 0); +} + +static inline uint32_t +ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot) +{ + int j; + for (j = 0; j < so->inputs_count; j++) + if (so->inputs[j].sysval && (so->inputs[j].slot == slot)) + return so->inputs[j].regid; + return regid(63, 0); +} + +/* calculate register footprint in terms of half-regs (ie. one full + * reg counts as two half-regs). + */ +static inline uint32_t +ir3_shader_halfregs(const struct ir3_shader_variant *v) +{ + return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1); +} + +#endif /* IR3_SHADER_H_ */ diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build new file mode 100644 index 00000000000..07319dff595 --- /dev/null +++ b/src/freedreno/ir3/meson.build @@ -0,0 +1,64 @@ +# Copyright © 2018 Rob Clark + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+ir3_nir_trig_c = custom_target(
+  'ir3_nir_trig.c',
+  input : 'ir3_nir_trig.py',
+  output : 'ir3_nir_trig.c',
+  command : [
+    prog_python, '@INPUT@',
+    '-p', join_paths(meson.source_root(), 'src/compiler/nir/'),
+  ],
+  capture : true,
+  depend_files : nir_algebraic_py,
+)
+
+libfreedreno_ir3_files = files(
+  'disasm-a3xx.c',
+  'instr-a3xx.h',
+  'ir3.c',
+  'ir3_compiler_nir.c',
+  'ir3_compiler.c',
+  'ir3_compiler.h',
+  'ir3_cp.c',
+  'ir3_depth.c',
+  'ir3_group.c',
+  'ir3.h',
+  'ir3_legalize.c',
+  'ir3_nir.c',
+  'ir3_nir.h',
+  'ir3_nir_lower_tg4_to_tex.c',
+  'ir3_print.c',
+  'ir3_ra.c',
+  'ir3_sched.c',
+  'ir3_shader.c',
+  'ir3_shader.h',
+)
+
+libfreedreno_ir3 = static_library(
+  'freedreno_ir3',
+  [libfreedreno_ir3_files, ir3_nir_trig_c],
+  include_directories : [inc_freedreno, inc_common],
+  c_args : [c_vis_args, no_override_init_args],
+  cpp_args : [cpp_vis_args],
+  dependencies : idep_nir_headers,
+  build_by_default : false,
+)
+
diff --git a/src/freedreno/meson.build b/src/freedreno/meson.build
index bb2cb201c0d..26ee6213890 100644
--- a/src/freedreno/meson.build
+++ b/src/freedreno/meson.build
@@ -21,3 +21,4 @@ inc_freedreno = include_directories('.')
 subdir('drm')
+subdir('ir3')
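As a usage sketch of the variant cache in ir3_shader.c from a driver's point of view: ir3_shader_from_nir() wraps and pre-optimizes the nir shader once, and ir3_shader_get_variant() compiles a variant on first use for a given key and returns the cached one on later key matches. The helper below is hypothetical; the compiler and nir objects are assumed to be supplied by the driver:

#include "ir3/ir3_shader.h"

/* hypothetical helper, for illustration only: */
static struct ir3_shader_variant *
get_default_variant(struct ir3_compiler *compiler, nir_shader *nir)
{
	struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir);
	struct ir3_shader_key key = {0};
	bool created = false;

	/* non-binning variant for an all-defaults key; a repeated call
	 * with an equal key returns the cached variant:
	 */
	return ir3_shader_get_variant(shader, &key, false, &created);
}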