aboutsummaryrefslogtreecommitdiffstats
path: root/src/freedreno
diff options
context:
space:
mode:
Diffstat (limited to 'src/freedreno')
-rw-r--r--src/freedreno/Makefile.am19
-rw-r--r--src/freedreno/Makefile.sources24
-rw-r--r--src/freedreno/ir3/disasm-a3xx.c1038
-rw-r--r--src/freedreno/ir3/instr-a3xx.h872
-rw-r--r--src/freedreno/ir3/ir3.c941
-rw-r--r--src/freedreno/ir3/ir3.h1394
-rw-r--r--src/freedreno/ir3/ir3_compiler.c71
-rw-r--r--src/freedreno/ir3/ir3_compiler.h96
-rw-r--r--src/freedreno/ir3/ir3_compiler_nir.c3818
-rw-r--r--src/freedreno/ir3/ir3_cp.c653
-rw-r--r--src/freedreno/ir3/ir3_depth.c245
-rw-r--r--src/freedreno/ir3/ir3_group.c274
-rw-r--r--src/freedreno/ir3/ir3_legalize.c496
-rw-r--r--src/freedreno/ir3/ir3_nir.c263
-rw-r--r--src/freedreno/ir3/ir3_nir.h45
-rw-r--r--src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c138
-rw-r--r--src/freedreno/ir3/ir3_nir_trig.py51
-rw-r--r--src/freedreno/ir3/ir3_print.c264
-rw-r--r--src/freedreno/ir3/ir3_ra.c1124
-rw-r--r--src/freedreno/ir3/ir3_sched.c818
-rw-r--r--src/freedreno/ir3/ir3_shader.c436
-rw-r--r--src/freedreno/ir3/ir3_shader.h587
-rw-r--r--src/freedreno/ir3/meson.build64
-rw-r--r--src/freedreno/meson.build1
24 files changed, 13731 insertions, 1 deletions
diff --git a/src/freedreno/Makefile.am b/src/freedreno/Makefile.am
index 9ddc3c0ad35..8f027e34f8a 100644
--- a/src/freedreno/Makefile.am
+++ b/src/freedreno/Makefile.am
@@ -45,7 +45,8 @@ TESTS =
BUILT_SOURCES =
CLEANFILES =
EXTRA_DIST = \
- drm/meson.build
+ drm/meson.build \
+ ir3/meson.build
MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
PYTHON_GEN = $(AM_V_GEN)$(PYTHON) $(PYTHON_FLAGS)
@@ -57,3 +58,19 @@ noinst_LTLIBRARIES += libfreedreno_drm.la
libfreedreno_drm_la_SOURCES = $(drm_SOURCES)
libfreedreno_drm_la_CFLAGS = $(VALGRIND_CFLAGS) $(LIBDRM_CFLAGS)
+noinst_LTLIBRARIES += libfreedreno_ir3.la
+
+libfreedreno_ir3_la_SOURCES = $(ir3_SOURCES) $(ir3_GENERATED_FILES)
+libfreedreno_ir3_la_CFLAGS = \
+ -I$(top_srcdir)/src/freedreno/ir3 \
+ -I$(top_builddir)/src/compiler/nir \
+ -I$(top_srcdir)/src/compiler/nir
+# per-library link inputs must use the canonical <lib>_la_ prefix to take effect
+libfreedreno_ir3_la_LIBADD = \
+ $(top_builddir)/src/compiler/nir/libnir.la \
+ $(top_builddir)/src/util/libmesautil.la
+
+MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
+ir3/ir3_nir_trig.c: ir3/ir3_nir_trig.py $(top_srcdir)/src/compiler/nir/nir_algebraic.py
+ $(MKDIR_GEN)
+ $(AM_V_GEN) $(PYTHON) $(PYTHON_FLAGS) $(srcdir)/ir3/ir3_nir_trig.py -p $(top_srcdir)/src/compiler/nir > $@ || ($(RM) $@; false)
+
diff --git a/src/freedreno/Makefile.sources b/src/freedreno/Makefile.sources
index 06a1a99b9e2..1df5e6250b5 100644
--- a/src/freedreno/Makefile.sources
+++ b/src/freedreno/Makefile.sources
@@ -15,3 +15,27 @@ drm_SOURCES := \
drm/msm_drm.h \
drm/msm_ringbuffer.c
+ir3_SOURCES := \
+ ir3/disasm-a3xx.c \
+ ir3/instr-a3xx.h \
+ ir3/ir3.c \
+ ir3/ir3_compiler.c \
+ ir3/ir3_compiler.h \
+ ir3/ir3_compiler_nir.c \
+ ir3/ir3_cp.c \
+ ir3/ir3_depth.c \
+ ir3/ir3_group.c \
+ ir3/ir3.h \
+ ir3/ir3_legalize.c \
+ ir3/ir3_nir.c \
+ ir3/ir3_nir.h \
+ ir3/ir3_nir_lower_tg4_to_tex.c \
+ ir3/ir3_print.c \
+ ir3/ir3_ra.c \
+ ir3/ir3_sched.c \
+ ir3/ir3_shader.c \
+ ir3/ir3_shader.h
+
+ir3_GENERATED_FILES := \
+ ir3/ir3_nir_trig.c
+
diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c
new file mode 100644
index 00000000000..4cf45ce9227
--- /dev/null
+++ b/src/freedreno/ir3/disasm-a3xx.c
@@ -0,0 +1,1038 @@
+/*
+ * Copyright (c) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+
+#include <util/u_debug.h>
+
+#include "instr-a3xx.h"
+
+/* bitmask of debug flags */
+enum debug_t {
+ PRINT_RAW = 0x1, /* dump raw hexdump */
+ PRINT_VERBOSE = 0x2, /* also print instruction index, raw dwords and unexpected non-zero padding fields */
+};
+
+static enum debug_t debug;
+
+#define printf debug_printf
+
+/* Line prefix for each nesting level, indexed by disasm_ctx.level: levels
+ * 0..9 give that many tabs, the remaining six slots are "x" placeholders.
+ * Only 16 entries exist and the index is not range-checked by its user.
+ */
+static const char *levels[] = {
+ "",
+ "\t",
+ "\t\t",
+ "\t\t\t",
+ "\t\t\t\t",
+ "\t\t\t\t\t",
+ "\t\t\t\t\t\t",
+ "\t\t\t\t\t\t\t",
+ "\t\t\t\t\t\t\t\t",
+ "\t\t\t\t\t\t\t\t\t",
+ "x",
+ "x",
+ "x",
+ "x",
+ "x",
+ "x",
+};
+
+static const char *component = "xyzw";
+
+/* Printable name of each TYPE_* value; indexed directly with the type
+ * fields taken from the instruction encodings (e.g. cat1->src_type).
+ * Note that several functions below declare a local named "type" that
+ * shadows this table inside their scope.
+ */
+static const char *type[] = {
+ [TYPE_F16] = "f16",
+ [TYPE_F32] = "f32",
+ [TYPE_U16] = "u16",
+ [TYPE_U32] = "u32",
+ [TYPE_S16] = "s16",
+ [TYPE_S32] = "s32",
+ [TYPE_U8] = "u8",
+ [TYPE_S8] = "s8",
+};
+
+/* State shared by all print_* helpers while disassembling one shader. */
+struct disasm_ctx {
+ /* stream that every piece of disassembly text is written to */
+ FILE *out;
+ /* nesting level supplied by the caller; selects the levels[] prefix */
+ int level;
+
+ /* current instruction repeat flag: */
+ unsigned repeat;
+};
+
+/*
+ * Print a single register operand to ctx->out.
+ *
+ * Emitted in order: an optional "(absneg)" / "(neg)" / "(abs)" modifier, an
+ * optional "(r)" flag, and then exactly one of
+ *   - the signed immediate reg.iim_val                   (im)
+ *   - an a0.x-relative reference with its +/- offset     (addr_rel)
+ *   - "a0.<comp>" or "p0.<comp>"       (REG_A0 / REG_P0 and not a const)
+ *   - "[h]<file><num>.<comp>"                            (anything else)
+ *
+ * full suppresses the "h" prefix; c selects the 'c' file letter over 'r'.
+ *
+ * The spelling (e.g. "(neg)" rather than "-") deliberately mirrors
+ * libllvm-a3xx so the output of both disassemblers can be diffed.
+ */
+static void print_reg(struct disasm_ctx *ctx, reg_t reg, bool full, bool r,
+		bool c, bool im, bool neg, bool abs, bool addr_rel)
+{
+	const char file = c ? 'c' : 'r';
+	const char *half = full ? "" : "h";
+
+	if (neg && abs)
+		fprintf(ctx->out, "(absneg)");
+	else if (abs)
+		fprintf(ctx->out, "(abs)");
+	else if (neg)
+		fprintf(ctx->out, "(neg)");
+
+	if (r)
+		fprintf(ctx->out, "(r)");
+
+	if (im) {
+		fprintf(ctx->out, "%d", reg.iim_val);
+		return;
+	}
+
+	if (addr_rel) {
+		/* %+d would be simpler, but the explicit " + " / " - " form is
+		 * what libllvm-a3xx prints
+		 */
+		if (reg.iim_val == 0)
+			fprintf(ctx->out, "%s%c<a0.x>", half, file);
+		else if (reg.iim_val > 0)
+			fprintf(ctx->out, "%s%c<a0.x + %d>", half, file, reg.iim_val);
+		else
+			fprintf(ctx->out, "%s%c<a0.x - %d>", half, file, -reg.iim_val);
+		return;
+	}
+
+	if (!c && (reg.num == REG_A0)) {
+		fprintf(ctx->out, "a0.%c", component[reg.comp]);
+		return;
+	}
+
+	if (!c && (reg.num == REG_P0)) {
+		fprintf(ctx->out, "p0.%c", component[reg.comp]);
+		return;
+	}
+
+	fprintf(ctx->out, "%s%c%d.%c", half, file, reg.num & 0x3f, component[reg.comp]);
+}
+
+
+/* Print a destination register: same as print_reg() with every
+ * source-only attribute ((r), const file, immediate, neg, abs) switched off.
+ */
+static void print_reg_dst(struct disasm_ctx *ctx, reg_t reg, bool full, bool addr_rel)
+{
+	const bool off = false;
+
+	print_reg(ctx, reg, full, off, off, off, off, off, addr_rel);
+}
+
+/* Print a source register; forwards every argument unchanged to
+ * print_reg(), see there for the meaning of the individual flags.
+ */
+static void print_reg_src(struct disasm_ctx *ctx, reg_t reg, bool full, bool r,
+ bool c, bool im, bool neg, bool abs, bool addr_rel)
+{
+ print_reg(ctx, reg, full, r, c, im, neg, abs, addr_rel);
+}
+
+/* TODO switch to using reginfo struct everywhere, since more readable
+ * than passing a bunch of bools to print_reg_src
+ */
+
+/* One operand plus the flags print_reg() needs to format it. */
+struct reginfo {
+ reg_t reg; /* raw register / immediate encoding */
+ bool full; /* false: register name gets an "h" prefix */
+ bool r; /* print the "(r)" flag */
+ bool c; /* use the 'c' register file letter instead of 'r' */
+ bool im; /* print reg.iim_val as an immediate, not a register */
+ bool neg; /* print "(neg)", or "(absneg)" together with abs */
+ bool abs; /* print "(abs)", or "(absneg)" together with neg */
+ bool addr_rel; /* print in the a0.x-relative "<a0.x + N>" form */
+};
+
+/* Print the operand described by info by unpacking it into a
+ * print_reg_src() call.
+ */
+static void print_src(struct disasm_ctx *ctx, struct reginfo *info)
+{
+ print_reg_src(ctx, info->reg, info->full, info->r, info->c, info->im,
+ info->neg, info->abs, info->addr_rel);
+}
+
+//static void print_dst(struct disasm_ctx *ctx, struct reginfo *info)
+//{
+// print_reg_dst(ctx, info->reg, info->full, info->addr_rel);
+//}
+
+/* Print the operands of a category-0 instruction: the (optionally
+ * inverted) p0 predicate component for kill/br, and the immediate for
+ * br/jump/call.  Other opcodes print no operands here.
+ */
+static void print_instr_cat0(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat0_t *c0 = &instr->cat0;
+	const char *invert = c0->inv ? "!" : "";
+
+	if (c0->opc == OPC_KILL) {
+		fprintf(ctx->out, " %sp0.%c", invert, component[c0->comp]);
+	} else if (c0->opc == OPC_BR) {
+		fprintf(ctx->out, " %sp0.%c, #%d", invert,
+				component[c0->comp], c0->a3xx.immed);
+	} else if ((c0->opc == OPC_JUMP) || (c0->opc == OPC_CALL)) {
+		fprintf(ctx->out, " #%d", c0->a3xx.immed);
+	}
+
+	/* in verbose mode, surface padding fields that are unexpectedly set */
+	if (!(debug & PRINT_VERBOSE))
+		return;
+
+	if (c0->dummy2|c0->dummy3|c0->dummy4)
+		fprintf(ctx->out, "\t{0: %x,%x,%x}", c0->dummy2, c0->dummy3, c0->dummy4);
+}
+
+/* Print a category-1 (move/convert) instruction.  The opcode table leaves
+ * the name empty for this category, so the mnemonic is produced here:
+ * "cov.<src><dst>" when the types differ, "mova" for an s16 move into a0,
+ * and "mov.<src><dst>" otherwise.
+ */
+static void print_instr_cat1(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat1_t *mov = &instr->cat1;
+
+	if (mov->ul)
+		fprintf(ctx->out, "(ul)");
+
+	if (mov->src_type != mov->dst_type) {
+		fprintf(ctx->out, "cov.%s%s", type[mov->src_type], type[mov->dst_type]);
+	} else if ((mov->src_type == TYPE_S16) && (((reg_t)mov->dst).num == REG_A0)) {
+		/* special case (mnemonic?): */
+		fprintf(ctx->out, "mova");
+	} else {
+		fprintf(ctx->out, "mov.%s%s", type[mov->src_type], type[mov->dst_type]);
+	}
+
+	fprintf(ctx->out, " ");
+
+	if (mov->even)
+		fprintf(ctx->out, "(even)");
+
+	if (mov->pos_inf)
+		fprintf(ctx->out, "(pos_infinity)");
+
+	print_reg_dst(ctx, (reg_t)(mov->dst), type_size(mov->dst_type) == 32,
+			mov->dst_rel);
+
+	fprintf(ctx->out, ", ");
+
+	/* the source cannot simply go through print_reg(): immediates are
+	 * typed and the relative form lives in its own fields
+	 */
+	if (mov->src_im) {
+		if (type_float(mov->src_type))
+			fprintf(ctx->out, "(%f)", mov->fim_val);
+		else if (type_uint(mov->src_type))
+			fprintf(ctx->out, "0x%08x", mov->uim_val);
+		else
+			fprintf(ctx->out, "%d", mov->iim_val);
+	} else if (mov->src_rel && !mov->src_c) {
+		/* explicit " + " / " - " rather than %+d, to stay diff'able
+		 * with libllvm-a3xx
+		 */
+		const char file = mov->src_rel_c ? 'c' : 'r';
+
+		if (mov->off == 0)
+			fprintf(ctx->out, "%c<a0.x>", file);
+		else if (mov->off > 0)
+			fprintf(ctx->out, "%c<a0.x + %d>", file, mov->off);
+		else
+			fprintf(ctx->out, "%c<a0.x - %d>", file, -mov->off);
+	} else {
+		print_reg_src(ctx, (reg_t)(mov->src), type_size(mov->src_type) == 32,
+				mov->src_r, mov->src_c, mov->src_im, false, false, false);
+	}
+
+	if ((debug & PRINT_VERBOSE) && (mov->must_be_0))
+		fprintf(ctx->out, "\t{1: %x}", mov->must_be_0);
+}
+
+/* Print the operands of a category-2 (ALU) instruction: an optional
+ * condition suffix for the cmps / cmpv opcodes, the destination, src1 and,
+ * unless the opcode is one of the single-source ones listed below, src2.
+ *
+ * Each source is decoded through one of three views of the same field
+ * (const, a0.x-relative, plain register); the const view is tested first.
+ */
+static void print_instr_cat2(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat2_t *cat2 = &instr->cat2;
+	static const char *cond[] = {
+			"lt",
+			"le",
+			"gt",
+			"ge",
+			"eq",
+			"ne",
+			"?6?",
+	};
+	const unsigned ncond = sizeof(cond) / sizeof(cond[0]);
+
+	switch (_OPC(2, cat2->opc)) {
+	case OPC_CMPS_F:
+	case OPC_CMPS_U:
+	case OPC_CMPS_S:
+	case OPC_CMPV_F:
+	case OPC_CMPV_U:
+	case OPC_CMPV_S:
+		/* cond[] names only 7 encodings; anything beyond that is printed
+		 * numerically in the same "?N?" style instead of reading past
+		 * the end of the table.
+		 * NOTE(review): the width of the cond field is declared in
+		 * instr-a3xx.h, outside this view - confirm which values can occur.
+		 */
+		if (cat2->cond < ncond)
+			fprintf(ctx->out, ".%s", cond[cat2->cond]);
+		else
+			fprintf(ctx->out, ".?%u?", (unsigned)cat2->cond);
+		break;
+	}
+
+	fprintf(ctx->out, " ");
+	if (cat2->ei)
+		fprintf(ctx->out, "(ei)");
+	/* dst_half flips the destination width relative to the sources */
+	print_reg_dst(ctx, (reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false);
+	fprintf(ctx->out, ", ");
+
+	if (cat2->c1.src1_c) {
+		print_reg_src(ctx, (reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r,
+				cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg,
+				cat2->src1_abs, false);
+	} else if (cat2->rel1.src1_rel) {
+		print_reg_src(ctx, (reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r,
+				cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg,
+				cat2->src1_abs, cat2->rel1.src1_rel);
+	} else {
+		print_reg_src(ctx, (reg_t)(cat2->src1), cat2->full, cat2->src1_r,
+				false, cat2->src1_im, cat2->src1_neg,
+				cat2->src1_abs, false);
+	}
+
+	switch (_OPC(2, cat2->opc)) {
+	case OPC_ABSNEG_F:
+	case OPC_ABSNEG_S:
+	case OPC_CLZ_B:
+	case OPC_CLZ_S:
+	case OPC_SIGN_F:
+	case OPC_FLOOR_F:
+	case OPC_CEIL_F:
+	case OPC_RNDNE_F:
+	case OPC_RNDAZ_F:
+	case OPC_TRUNC_F:
+	case OPC_NOT_B:
+	case OPC_BFREV_B:
+	case OPC_SETRM:
+	case OPC_CBITS_B:
+		/* these only have one src reg */
+		break;
+	default:
+		fprintf(ctx->out, ", ");
+		if (cat2->c2.src2_c) {
+			print_reg_src(ctx, (reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r,
+					cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg,
+					cat2->src2_abs, false);
+		} else if (cat2->rel2.src2_rel) {
+			print_reg_src(ctx, (reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r,
+					cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg,
+					cat2->src2_abs, cat2->rel2.src2_rel);
+		} else {
+			print_reg_src(ctx, (reg_t)(cat2->src2), cat2->full, cat2->src2_r,
+					false, cat2->src2_im, cat2->src2_neg,
+					cat2->src2_abs, false);
+		}
+		break;
+	}
+}
+
+/* Print the operands of a category-3 (three-source) instruction:
+ * "dst, src1, src2, src3".  src1 and src3 are each decoded through a
+ * const / a0.x-relative / plain view (const tested first); src2 has a
+ * single encoding.  None of the sources carries an immediate or abs flag.
+ */
+static void print_instr_cat3(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat3_t *c3 = &instr->cat3;
+	const bool full = instr_cat3_full(c3);
+
+	fprintf(ctx->out, " ");
+	/* dst_half flips the destination width relative to the sources */
+	print_reg_dst(ctx, (reg_t)(c3->dst), full ^ c3->dst_half, false);
+
+	/* src1 */
+	fprintf(ctx->out, ", ");
+	if (c3->c1.src1_c)
+		print_reg_src(ctx, (reg_t)(c3->c1.src1), full, c3->src1_r,
+				c3->c1.src1_c, false, c3->src1_neg, false, false);
+	else if (c3->rel1.src1_rel)
+		print_reg_src(ctx, (reg_t)(c3->rel1.src1), full, c3->src1_r,
+				c3->rel1.src1_c, false, c3->src1_neg, false,
+				c3->rel1.src1_rel);
+	else
+		print_reg_src(ctx, (reg_t)(c3->src1), full, c3->src1_r,
+				false, false, c3->src1_neg, false, false);
+
+	/* src2 */
+	fprintf(ctx->out, ", ");
+	print_reg_src(ctx, (reg_t)c3->src2, full, c3->src2_r,
+			c3->src2_c, false, c3->src2_neg, false, false);
+
+	/* src3 */
+	fprintf(ctx->out, ", ");
+	if (c3->c2.src3_c)
+		print_reg_src(ctx, (reg_t)(c3->c2.src3), full, c3->src3_r,
+				c3->c2.src3_c, false, c3->src3_neg, false, false);
+	else if (c3->rel2.src3_rel)
+		print_reg_src(ctx, (reg_t)(c3->rel2.src3), full, c3->src3_r,
+				c3->rel2.src3_c, false, c3->src3_neg, false,
+				c3->rel2.src3_rel);
+	else
+		print_reg_src(ctx, (reg_t)(c3->src3), full, c3->src3_r,
+				false, false, c3->src3_neg, false, false);
+}
+
+/* Print the operands of a category-4 (single-source) instruction:
+ * "dst, src".  The source is decoded through a const / a0.x-relative /
+ * plain view of the same field, with the const view tested first.
+ */
+static void print_instr_cat4(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat4_t *c4 = &instr->cat4;
+
+	fprintf(ctx->out, " ");
+	/* dst_half flips the destination width relative to the source */
+	print_reg_dst(ctx, (reg_t)(c4->dst), c4->full ^ c4->dst_half, false);
+	fprintf(ctx->out, ", ");
+
+	if (c4->c.src_c)
+		print_reg_src(ctx, (reg_t)(c4->c.src), c4->full, c4->src_r,
+				c4->c.src_c, c4->src_im, c4->src_neg, c4->src_abs,
+				false);
+	else if (c4->rel.src_rel)
+		print_reg_src(ctx, (reg_t)(c4->rel.src), c4->full, c4->src_r,
+				c4->rel.src_c, c4->src_im, c4->src_neg, c4->src_abs,
+				c4->rel.src_rel);
+	else
+		print_reg_src(ctx, (reg_t)(c4->src), c4->full, c4->src_r,
+				false, c4->src_im, c4->src_neg, c4->src_abs,
+				false);
+
+	/* in verbose mode, surface padding fields that are unexpectedly set */
+	if (!(debug & PRINT_VERBOSE))
+		return;
+
+	if (c4->dummy1|c4->dummy2)
+		fprintf(ctx->out, "\t{4: %x,%x}", c4->dummy1, c4->dummy2);
+}
+
+/* Print the operands of a category-5 (texture/sample) instruction: the
+ * flag suffixes, "(type)" except for dsxpp.1 / dsypp.1, the "(xyzw)" write
+ * mask, the destination, and then whichever of src1 / src2 / sampler /
+ * texture the opcode uses according to the info[] table.  With .s2en the
+ * sampler and texture indices are replaced by register sources.
+ */
+static void print_instr_cat5(struct disasm_ctx *ctx, instr_t *instr)
+{
+	/* Operand usage per opcode, looked up with the raw cat5->opc field.
+	 * Sized from NOPC_BITS (documented as the largest opc field of any
+	 * category) so that no encodable opc can index past the end; the
+	 * previous 0x1f-entry table had no slot for opc 31.  Unlisted
+	 * opcodes are all-false.
+	 */
+	static const struct {
+		bool src1, src2, samp, tex;
+	} info[1 << NOPC_BITS] = {
+			[opc_op(OPC_ISAM)]     = { true,  false, true,  true,  },
+			[opc_op(OPC_ISAML)]    = { true,  true,  true,  true,  },
+			[opc_op(OPC_ISAMM)]    = { true,  false, true,  true,  },
+			[opc_op(OPC_SAM)]      = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMB)]     = { true,  true,  true,  true,  },
+			[opc_op(OPC_SAML)]     = { true,  true,  true,  true,  },
+			[opc_op(OPC_SAMGQ)]    = { true,  false, true,  true,  },
+			[opc_op(OPC_GETLOD)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_CONV)]     = { true,  true,  true,  true,  },
+			[opc_op(OPC_CONVM)]    = { true,  true,  true,  true,  },
+			[opc_op(OPC_GETSIZE)]  = { true,  false, false, true,  },
+			[opc_op(OPC_GETBUF)]   = { false, false, false, true,  },
+			[opc_op(OPC_GETPOS)]   = { true,  false, false, true,  },
+			[opc_op(OPC_GETINFO)]  = { false, false, false, true,  },
+			[opc_op(OPC_DSX)]      = { true,  false, false, false, },
+			[opc_op(OPC_DSY)]      = { true,  false, false, false, },
+			[opc_op(OPC_GATHER4R)] = { true,  false, true,  true,  },
+			[opc_op(OPC_GATHER4G)] = { true,  false, true,  true,  },
+			[opc_op(OPC_GATHER4B)] = { true,  false, true,  true,  },
+			[opc_op(OPC_GATHER4A)] = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP0)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP1)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP2)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP3)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_DSXPP_1)]  = { true,  false, false, false, },
+			[opc_op(OPC_DSYPP_1)]  = { true,  false, false, false, },
+			[opc_op(OPC_RGETPOS)]  = { false, false, false, false, },
+			[opc_op(OPC_RGETINFO)] = { false, false, false, false, },
+	};
+	instr_cat5_t *cat5 = &instr->cat5;
+	int i;
+
+	if (cat5->is_3d)   fprintf(ctx->out, ".3d");
+	if (cat5->is_a)    fprintf(ctx->out, ".a");
+	if (cat5->is_o)    fprintf(ctx->out, ".o");
+	if (cat5->is_p)    fprintf(ctx->out, ".p");
+	if (cat5->is_s)    fprintf(ctx->out, ".s");
+	if (cat5->is_s2en) fprintf(ctx->out, ".s2en");
+
+	fprintf(ctx->out, " ");
+
+	switch (_OPC(5, cat5->opc)) {
+	case OPC_DSXPP_1:
+	case OPC_DSYPP_1:
+		break;
+	default:
+		fprintf(ctx->out, "(%s)", type[cat5->type]);
+		break;
+	}
+
+	/* write mask: one letter per enabled destination component */
+	fprintf(ctx->out, "(");
+	for (i = 0; i < 4; i++)
+		if (cat5->wrmask & (1 << i))
+			fprintf(ctx->out, "%c", "xyzw"[i]);
+	fprintf(ctx->out, ")");
+
+	print_reg_dst(ctx, (reg_t)(cat5->dst), type_size(cat5->type) == 32, false);
+
+	if (info[cat5->opc].src1) {
+		fprintf(ctx->out, ", ");
+		print_reg_src(ctx, (reg_t)(cat5->src1), cat5->full, false, false, false,
+				false, false, false);
+	}
+
+	if (cat5->is_s2en) {
+		fprintf(ctx->out, ", ");
+		print_reg_src(ctx, (reg_t)(cat5->s2en.src2), cat5->full, false, false, false,
+				false, false, false);
+		fprintf(ctx->out, ", ");
+		/* src3 is always printed as a half register */
+		print_reg_src(ctx, (reg_t)(cat5->s2en.src3), false, false, false, false,
+				false, false, false);
+	} else {
+		if (cat5->is_o || info[cat5->opc].src2) {
+			fprintf(ctx->out, ", ");
+			print_reg_src(ctx, (reg_t)(cat5->norm.src2), cat5->full,
+					false, false, false, false, false, false);
+		}
+		if (info[cat5->opc].samp)
+			fprintf(ctx->out, ", s#%d", cat5->norm.samp);
+		if (info[cat5->opc].tex)
+			fprintf(ctx->out, ", t#%d", cat5->norm.tex);
+	}
+
+	/* in verbose mode, surface padding fields that are unexpectedly set */
+	if (debug & PRINT_VERBOSE) {
+		if (cat5->is_s2en) {
+			if (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2)
+				fprintf(ctx->out, "\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2);
+		} else {
+			if (cat5->norm.dummy1|cat5->dummy2)
+				fprintf(ctx->out, "\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2);
+		}
+	}
+}
+
+/* Print the suffixes and operands of a category-6 instruction.
+ *
+ * The function works in stages: pick full/half widths for dst/src1/src2,
+ * print the opcode suffix, choose the address-space letters that wrap the
+ * destination (sd) and source (ss) in "x[...]", and finally print the
+ * operands.  stgb/stib, the atomics, resinfo and ldgb each have their own
+ * operand layout and return early; everything else falls through to the
+ * generic dst / src1 / src2 path at the end.
+ */
+static void print_instr_cat6(struct disasm_ctx *ctx, instr_t *instr)
+{
+ instr_cat6_t *cat6 = &instr->cat6;
+ char sd = 0, ss = 0; /* dst/src address space */
+ bool nodst = false;
+ struct reginfo dst, src1, src2;
+ int src1off = 0, dstoff = 0;
+
+ memset(&dst, 0, sizeof(dst));
+ memset(&src1, 0, sizeof(src1));
+ memset(&src2, 0, sizeof(src2));
+
+ /* stage 1: register widths, depending on which side carries the data */
+ switch (_OPC(6, cat6->opc)) {
+ case OPC_RESINFO:
+ case OPC_RESFMT:
+ dst.full = type_size(cat6->type) == 32;
+ src1.full = type_size(cat6->type) == 32;
+ src2.full = type_size(cat6->type) == 32;
+ break;
+ case OPC_L2G:
+ case OPC_G2L:
+ dst.full = true;
+ src1.full = true;
+ src2.full = true;
+ break;
+ case OPC_STG:
+ case OPC_STL:
+ case OPC_STP:
+ case OPC_STI:
+ case OPC_STLW:
+ case OPC_STIB:
+ dst.full = true;
+ src1.full = type_size(cat6->type) == 32;
+ src2.full = type_size(cat6->type) == 32;
+ break;
+ default:
+ dst.full = type_size(cat6->type) == 32;
+ src1.full = true;
+ src2.full = true;
+ break;
+ }
+
+ /* stage 2: opcode suffix (typed/untyped, dimension, type, ...) */
+ switch (_OPC(6, cat6->opc)) {
+ case OPC_PREFETCH:
+ break;
+ case OPC_RESINFO:
+ fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+ break;
+ case OPC_LDGB:
+ fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
+ fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+ fprintf(ctx->out, ".%s", type[cat6->type]);
+ fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
+ break;
+ case OPC_STGB:
+ case OPC_STIB:
+ fprintf(ctx->out, ".%s", cat6->stgb.typed ? "typed" : "untyped");
+ fprintf(ctx->out, ".%dd", cat6->stgb.d + 1);
+ fprintf(ctx->out, ".%s", type[cat6->type]);
+ fprintf(ctx->out, ".%d", cat6->stgb.type_size + 1);
+ break;
+ case OPC_ATOMIC_ADD:
+ case OPC_ATOMIC_SUB:
+ case OPC_ATOMIC_XCHG:
+ case OPC_ATOMIC_INC:
+ case OPC_ATOMIC_DEC:
+ case OPC_ATOMIC_CMPXCHG:
+ case OPC_ATOMIC_MIN:
+ case OPC_ATOMIC_MAX:
+ case OPC_ATOMIC_AND:
+ case OPC_ATOMIC_OR:
+ case OPC_ATOMIC_XOR:
+ /* ss chosen here is what selects the g[] vs l[] operand layout below */
+ ss = cat6->g ? 'g' : 'l';
+ fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
+ fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+ fprintf(ctx->out, ".%s", type[cat6->type]);
+ fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
+ fprintf(ctx->out, ".%c", ss);
+ break;
+ default:
+ dst.im = cat6->g && !cat6->dst_off;
+ fprintf(ctx->out, ".%s", type[cat6->type]);
+ break;
+ }
+ fprintf(ctx->out, " ");
+
+ /* stage 3: address-space letters for the bracketed dst (sd) / src (ss) */
+ switch (_OPC(6, cat6->opc)) {
+ case OPC_STG:
+ sd = 'g';
+ break;
+ case OPC_STP:
+ sd = 'p';
+ break;
+ case OPC_STL:
+ case OPC_STLW:
+ sd = 'l';
+ break;
+
+ case OPC_LDG:
+ case OPC_LDC:
+ ss = 'g';
+ break;
+ case OPC_LDP:
+ ss = 'p';
+ break;
+ case OPC_LDL:
+ case OPC_LDLW:
+ case OPC_LDLV:
+ ss = 'l';
+ break;
+
+ case OPC_L2G:
+ ss = 'l';
+ sd = 'g';
+ break;
+
+ case OPC_G2L:
+ ss = 'g';
+ sd = 'l';
+ break;
+
+ case OPC_PREFETCH:
+ ss = 'g';
+ nodst = true;
+ break;
+
+ case OPC_STI:
+ dst.full = false; // XXX or inverts??
+ break;
+ }
+
+ /* stgb/stib: "g[ssbo], src1, src2, src3", decoded from the stgb view */
+ if ((_OPC(6, cat6->opc) == OPC_STGB) || (_OPC(6, cat6->opc) == OPC_STIB)) {
+ struct reginfo src3;
+
+ memset(&src3, 0, sizeof(src3));
+
+ src1.reg = (reg_t)(cat6->stgb.src1);
+ src2.reg = (reg_t)(cat6->stgb.src2);
+ src2.im = cat6->stgb.src2_im;
+ src3.reg = (reg_t)(cat6->stgb.src3);
+ src3.im = cat6->stgb.src3_im;
+ src3.full = true;
+
+ fprintf(ctx->out, "g[%u], ", cat6->stgb.dst_ssbo);
+ print_src(ctx, &src1);
+ fprintf(ctx->out, ", ");
+ print_src(ctx, &src2);
+ fprintf(ctx->out, ", ");
+ print_src(ctx, &src3);
+
+ if (debug & PRINT_VERBOSE)
+ fprintf(ctx->out, " (pad0=%x, pad3=%x)", cat6->stgb.pad0, cat6->stgb.pad3);
+
+ return;
+ }
+
+ /* atomics, resinfo and ldgb are all decoded from the ldgb view */
+ if (is_atomic(_OPC(6, cat6->opc))) {
+
+ src1.reg = (reg_t)(cat6->ldgb.src1);
+ src1.im = cat6->ldgb.src1_im;
+ src2.reg = (reg_t)(cat6->ldgb.src2);
+ src2.im = cat6->ldgb.src2_im;
+ dst.reg = (reg_t)(cat6->ldgb.dst);
+
+ print_src(ctx, &dst);
+ fprintf(ctx->out, ", ");
+ if (ss == 'g') {
+ struct reginfo src3;
+ memset(&src3, 0, sizeof(src3));
+
+ src3.reg = (reg_t)(cat6->ldgb.src3);
+ src3.full = true;
+
+ /* For images, the ".typed" variant is used and src2 is
+ * the ivecN coordinates, ie ivec2 for 2d.
+ *
+ * For SSBOs, the ".untyped" variant is used and src2 is
+ * a simple dword offset.. src3 appears to be
+ * uvec2(offset * 4, 0). Not sure the point of that.
+ */
+
+ fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
+ print_src(ctx, &src1); /* value */
+ fprintf(ctx->out, ", ");
+ print_src(ctx, &src2); /* offset/coords */
+ fprintf(ctx->out, ", ");
+ print_src(ctx, &src3); /* 64b byte offset.. */
+
+ if (debug & PRINT_VERBOSE) {
+ fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0,
+ cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+ }
+ } else { /* ss == 'l' */
+ fprintf(ctx->out, "l[");
+ print_src(ctx, &src1); /* simple byte offset */
+ fprintf(ctx->out, "], ");
+ print_src(ctx, &src2); /* value */
+
+ if (debug & PRINT_VERBOSE) {
+ fprintf(ctx->out, " (src3=%x, pad0=%x, pad3=%x, mustbe0=%x)",
+ cat6->ldgb.src3, cat6->ldgb.pad0,
+ cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+ }
+ }
+
+ return;
+ } else if (_OPC(6, cat6->opc) == OPC_RESINFO) {
+ dst.reg = (reg_t)(cat6->ldgb.dst);
+
+ print_src(ctx, &dst);
+ fprintf(ctx->out, ", ");
+ fprintf(ctx->out, "g[%u]", cat6->ldgb.src_ssbo);
+
+ return;
+ } else if (_OPC(6, cat6->opc) == OPC_LDGB) {
+
+ src1.reg = (reg_t)(cat6->ldgb.src1);
+ src1.im = cat6->ldgb.src1_im;
+ src2.reg = (reg_t)(cat6->ldgb.src2);
+ src2.im = cat6->ldgb.src2_im;
+ dst.reg = (reg_t)(cat6->ldgb.dst);
+
+ print_src(ctx, &dst);
+ fprintf(ctx->out, ", ");
+ fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
+ print_src(ctx, &src1);
+ fprintf(ctx->out, ", ");
+ print_src(ctx, &src2);
+
+ if (debug & PRINT_VERBOSE)
+ fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+
+ return;
+ }
+ /* generic path: dst_off / src_off pick the encoding view that carries
+ * an additional offset for the destination / source
+ */
+ if (cat6->dst_off) {
+ dst.reg = (reg_t)(cat6->c.dst);
+ dstoff = cat6->c.off;
+ } else {
+ dst.reg = (reg_t)(cat6->d.dst);
+ }
+
+ if (cat6->src_off) {
+ src1.reg = (reg_t)(cat6->a.src1);
+ src1.im = cat6->a.src1_im;
+ src2.reg = (reg_t)(cat6->a.src2);
+ src2.im = cat6->a.src2_im;
+ src1off = cat6->a.off;
+ } else {
+ src1.reg = (reg_t)(cat6->b.src1);
+ src1.im = cat6->b.src1_im;
+ src2.reg = (reg_t)(cat6->b.src2);
+ src2.im = cat6->b.src2_im;
+ }
+
+ if (!nodst) {
+ if (sd)
+ fprintf(ctx->out, "%c[", sd);
+ /* note: dst might actually be a src (ie. address to store to) */
+ print_src(ctx, &dst);
+ if (dstoff)
+ fprintf(ctx->out, "%+d", dstoff);
+ if (sd)
+ fprintf(ctx->out, "]");
+ fprintf(ctx->out, ", ");
+ }
+
+ if (ss)
+ fprintf(ctx->out, "%c[", ss);
+
+ /* can have a larger than normal immed, so hack: */
+ if (src1.im) {
+ fprintf(ctx->out, "%u", src1.reg.dummy13);
+ } else {
+ print_src(ctx, &src1);
+ }
+
+ if (src1off)
+ fprintf(ctx->out, "%+d", src1off);
+ if (ss)
+ fprintf(ctx->out, "]");
+
+ /* NOTE(review): OPC_RESINFO has already returned above, so only
+ * OPC_RESFMT can actually reach the no-src2 case here.
+ */
+ switch (_OPC(6, cat6->opc)) {
+ case OPC_RESINFO:
+ case OPC_RESFMT:
+ break;
+ default:
+ fprintf(ctx->out, ", ");
+ print_src(ctx, &src2);
+ break;
+ }
+}
+
+/* Print the suffixes of a category-7 instruction: ".g" and ".l" for any
+ * opcode, plus ".r" and ".w" for fence only.
+ */
+static void print_instr_cat7(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat7_t *c7 = &instr->cat7;
+	const bool is_fence = _OPC(7, c7->opc) == OPC_FENCE;
+
+	if (c7->g)
+		fprintf(ctx->out, ".g");
+	if (c7->l)
+		fprintf(ctx->out, ".l");
+
+	if (is_fence && c7->r)
+		fprintf(ctx->out, ".r");
+	if (is_fence && c7->w)
+		fprintf(ctx->out, ".w");
+}
+
+/* size of largest OPC field of all the instruction categories: */
+#define NOPC_BITS 6
+
+/* Opcode table: mnemonic and operand printer for every known opcode.
+ *
+ * The OPC() designator stores each entry at index (opc), i.e. at the full
+ * OPC_* enum value; GETINFO() rebuilds that index from an encoded
+ * instruction as (opc_cat << NOPC_BITS) | instr_opc().  Slots that are not
+ * listed stay zero-initialised, so their name is NULL, which print_instr()
+ * reports as "unknown".  Category 1 has an empty name because
+ * print_instr_cat1() prints the mnemonic itself.
+ */
+static const struct opc_info {
+ uint16_t cat;
+ uint16_t opc;
+ const char *name;
+ void (*print)(struct disasm_ctx *ctx, instr_t *instr);
+} opcs[1 << (3+NOPC_BITS)] = {
+#define OPC(cat, opc, name) [(opc)] = { (cat), (opc), #name, print_instr_cat##cat }
+ /* category 0: */
+ OPC(0, OPC_NOP, nop),
+ OPC(0, OPC_BR, br),
+ OPC(0, OPC_JUMP, jump),
+ OPC(0, OPC_CALL, call),
+ OPC(0, OPC_RET, ret),
+ OPC(0, OPC_KILL, kill),
+ OPC(0, OPC_END, end),
+ OPC(0, OPC_EMIT, emit),
+ OPC(0, OPC_CUT, cut),
+ OPC(0, OPC_CHMASK, chmask),
+ OPC(0, OPC_CHSH, chsh),
+ OPC(0, OPC_FLOW_REV, flow_rev),
+
+ /* category 1: */
+ OPC(1, OPC_MOV, ),
+
+ /* category 2: */
+ OPC(2, OPC_ADD_F, add.f),
+ OPC(2, OPC_MIN_F, min.f),
+ OPC(2, OPC_MAX_F, max.f),
+ OPC(2, OPC_MUL_F, mul.f),
+ OPC(2, OPC_SIGN_F, sign.f),
+ OPC(2, OPC_CMPS_F, cmps.f),
+ OPC(2, OPC_ABSNEG_F, absneg.f),
+ OPC(2, OPC_CMPV_F, cmpv.f),
+ OPC(2, OPC_FLOOR_F, floor.f),
+ OPC(2, OPC_CEIL_F, ceil.f),
+ OPC(2, OPC_RNDNE_F, rndne.f),
+ OPC(2, OPC_RNDAZ_F, rndaz.f),
+ OPC(2, OPC_TRUNC_F, trunc.f),
+ OPC(2, OPC_ADD_U, add.u),
+ OPC(2, OPC_ADD_S, add.s),
+ OPC(2, OPC_SUB_U, sub.u),
+ OPC(2, OPC_SUB_S, sub.s),
+ OPC(2, OPC_CMPS_U, cmps.u),
+ OPC(2, OPC_CMPS_S, cmps.s),
+ OPC(2, OPC_MIN_U, min.u),
+ OPC(2, OPC_MIN_S, min.s),
+ OPC(2, OPC_MAX_U, max.u),
+ OPC(2, OPC_MAX_S, max.s),
+ OPC(2, OPC_ABSNEG_S, absneg.s),
+ OPC(2, OPC_AND_B, and.b),
+ OPC(2, OPC_OR_B, or.b),
+ OPC(2, OPC_NOT_B, not.b),
+ OPC(2, OPC_XOR_B, xor.b),
+ OPC(2, OPC_CMPV_U, cmpv.u),
+ OPC(2, OPC_CMPV_S, cmpv.s),
+ OPC(2, OPC_MUL_U, mul.u),
+ OPC(2, OPC_MUL_S, mul.s),
+ OPC(2, OPC_MULL_U, mull.u),
+ OPC(2, OPC_BFREV_B, bfrev.b),
+ OPC(2, OPC_CLZ_S, clz.s),
+ OPC(2, OPC_CLZ_B, clz.b),
+ OPC(2, OPC_SHL_B, shl.b),
+ OPC(2, OPC_SHR_B, shr.b),
+ OPC(2, OPC_ASHR_B, ashr.b),
+ OPC(2, OPC_BARY_F, bary.f),
+ OPC(2, OPC_MGEN_B, mgen.b),
+ OPC(2, OPC_GETBIT_B, getbit.b),
+ OPC(2, OPC_SETRM, setrm),
+ OPC(2, OPC_CBITS_B, cbits.b),
+ OPC(2, OPC_SHB, shb),
+ OPC(2, OPC_MSAD, msad),
+
+ /* category 3: */
+ OPC(3, OPC_MAD_U16, mad.u16),
+ OPC(3, OPC_MADSH_U16, madsh.u16),
+ OPC(3, OPC_MAD_S16, mad.s16),
+ OPC(3, OPC_MADSH_M16, madsh.m16),
+ OPC(3, OPC_MAD_U24, mad.u24),
+ OPC(3, OPC_MAD_S24, mad.s24),
+ OPC(3, OPC_MAD_F16, mad.f16),
+ OPC(3, OPC_MAD_F32, mad.f32),
+ OPC(3, OPC_SEL_B16, sel.b16),
+ OPC(3, OPC_SEL_B32, sel.b32),
+ OPC(3, OPC_SEL_S16, sel.s16),
+ OPC(3, OPC_SEL_S32, sel.s32),
+ OPC(3, OPC_SEL_F16, sel.f16),
+ OPC(3, OPC_SEL_F32, sel.f32),
+ OPC(3, OPC_SAD_S16, sad.s16),
+ OPC(3, OPC_SAD_S32, sad.s32),
+
+ /* category 4: */
+ OPC(4, OPC_RCP, rcp),
+ OPC(4, OPC_RSQ, rsq),
+ OPC(4, OPC_LOG2, log2),
+ OPC(4, OPC_EXP2, exp2),
+ OPC(4, OPC_SIN, sin),
+ OPC(4, OPC_COS, cos),
+ OPC(4, OPC_SQRT, sqrt),
+
+ /* category 5: */
+ OPC(5, OPC_ISAM, isam),
+ OPC(5, OPC_ISAML, isaml),
+ OPC(5, OPC_ISAMM, isamm),
+ OPC(5, OPC_SAM, sam),
+ OPC(5, OPC_SAMB, samb),
+ OPC(5, OPC_SAML, saml),
+ OPC(5, OPC_SAMGQ, samgq),
+ OPC(5, OPC_GETLOD, getlod),
+ OPC(5, OPC_CONV, conv),
+ OPC(5, OPC_CONVM, convm),
+ OPC(5, OPC_GETSIZE, getsize),
+ OPC(5, OPC_GETBUF, getbuf),
+ OPC(5, OPC_GETPOS, getpos),
+ OPC(5, OPC_GETINFO, getinfo),
+ OPC(5, OPC_DSX, dsx),
+ OPC(5, OPC_DSY, dsy),
+ OPC(5, OPC_GATHER4R, gather4r),
+ OPC(5, OPC_GATHER4G, gather4g),
+ OPC(5, OPC_GATHER4B, gather4b),
+ OPC(5, OPC_GATHER4A, gather4a),
+ OPC(5, OPC_SAMGP0, samgp0),
+ OPC(5, OPC_SAMGP1, samgp1),
+ OPC(5, OPC_SAMGP2, samgp2),
+ OPC(5, OPC_SAMGP3, samgp3),
+ OPC(5, OPC_DSXPP_1, dsxpp.1),
+ OPC(5, OPC_DSYPP_1, dsypp.1),
+ OPC(5, OPC_RGETPOS, rgetpos),
+ OPC(5, OPC_RGETINFO, rgetinfo),
+
+
+ /* category 6: */
+ OPC(6, OPC_LDG, ldg),
+ OPC(6, OPC_LDL, ldl),
+ OPC(6, OPC_LDP, ldp),
+ OPC(6, OPC_STG, stg),
+ OPC(6, OPC_STL, stl),
+ OPC(6, OPC_STP, stp),
+ OPC(6, OPC_STI, sti),
+ OPC(6, OPC_G2L, g2l),
+ OPC(6, OPC_L2G, l2g),
+ OPC(6, OPC_PREFETCH, prefetch),
+ OPC(6, OPC_LDLW, ldlw),
+ OPC(6, OPC_STLW, stlw),
+ OPC(6, OPC_RESFMT, resfmt),
+ OPC(6, OPC_RESINFO, resinfo),
+ OPC(6, OPC_ATOMIC_ADD, atomic.add),
+ OPC(6, OPC_ATOMIC_SUB, atomic.sub),
+ OPC(6, OPC_ATOMIC_XCHG, atomic.xchg),
+ OPC(6, OPC_ATOMIC_INC, atomic.inc),
+ OPC(6, OPC_ATOMIC_DEC, atomic.dec),
+ OPC(6, OPC_ATOMIC_CMPXCHG, atomic.cmpxchg),
+ OPC(6, OPC_ATOMIC_MIN, atomic.min),
+ OPC(6, OPC_ATOMIC_MAX, atomic.max),
+ OPC(6, OPC_ATOMIC_AND, atomic.and),
+ OPC(6, OPC_ATOMIC_OR, atomic.or),
+ OPC(6, OPC_ATOMIC_XOR, atomic.xor),
+ OPC(6, OPC_LDGB, ldgb),
+ OPC(6, OPC_STGB, stgb),
+ OPC(6, OPC_STIB, stib),
+ OPC(6, OPC_LDC, ldc),
+ OPC(6, OPC_LDLV, ldlv),
+
+ /* category 7: */
+ OPC(7, OPC_BAR, bar),
+ OPC(7, OPC_FENCE, fence),
+
+#undef OPC
+};
+
+#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)]))
+
+// XXX hack.. probably should move this table somewhere common:
+#include "ir3.h"
+/* Return the mnemonic for an IR instruction.  Opcodes whose category is
+ * reported as -1 by opc_cat() yield "??meta??"; all others are looked up
+ * directly in opcs[], which is indexed by the opcode value itself.
+ */
+const char *ir3_instr_name(struct ir3_instruction *instr)
+{
+	const bool is_meta = opc_cat(instr->opc) == -1;
+
+	return is_meta ? "??meta??" : opcs[instr->opc].name;
+}
+
+/* Disassemble one 64-bit instruction (two dwords) and write it as a single
+ * line to ctx->out.
+ *
+ * dwords points at the instruction, n is its index and is only printed in
+ * verbose mode.  Returns true when the instruction is the category-0 "end"
+ * opcode.
+ */
+static bool print_instr(struct disasm_ctx *ctx, uint32_t *dwords, int n)
+{
+	instr_t *instr = (instr_t *)dwords;
+	const uint32_t opc = instr_opc(instr);
+	const struct opc_info *info = GETINFO(instr);
+
+	if (debug & PRINT_VERBOSE)
+		fprintf(ctx->out, "%s%04d[%08xx_%08xx] ", levels[ctx->level], n, dwords[1], dwords[0]);
+
+	ctx->repeat = instr_repeat(instr);
+
+	/* NOTE: the order the flags are printed in is a bit ugly, but it
+	 * matches the llvm-a3xx disassembler so the outputs can be diffed.
+	 */
+	if (instr->sync)
+		fprintf(ctx->out, "(sy)");
+	if (instr->ss && ((instr->opc_cat <= 4) || (instr->opc_cat == 7)))
+		fprintf(ctx->out, "(ss)");
+	if (instr->jmp_tgt)
+		fprintf(ctx->out, "(jp)");
+	if (instr_sat(instr))
+		fprintf(ctx->out, "(sat)");
+	if (ctx->repeat)
+		fprintf(ctx->out, "(rpt%d)", ctx->repeat);
+	if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4)))
+		fprintf(ctx->out, "(ul)");
+
+	/* table slots without an entry have a NULL name */
+	if (!info->name) {
+		fprintf(ctx->out, "unknown(%d,%d)", instr->opc_cat, opc);
+	} else {
+		fprintf(ctx->out, "%s", info->name);
+		info->print(ctx, instr);
+	}
+
+	fprintf(ctx->out, "\n");
+
+	return (instr->opc_cat == 0) && (opc == OPC_END);
+}
+
+/* Disassemble a buffer of a3xx instructions to out.
+ *
+ * dwords     - instruction stream, two dwords per instruction
+ * sizedwords - length of the stream in dwords; asserted to be even
+ * level      - nesting level, stored in the context for line prefixes
+ * out        - stream that receives the text
+ *
+ * Every instruction in the buffer is printed; the "end reached" result of
+ * print_instr() is not used to stop early.  Always returns 0.
+ */
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out)
+{
+	struct disasm_ctx ctx = {
+		.out = out,
+		.level = level,
+	};
+	int pos = 0;
+
+	assert((sizedwords % 2) == 0);
+
+	while (pos < sizedwords) {
+		print_instr(&ctx, &dwords[pos], pos/2);
+		pos += 2;
+	}
+
+	return 0;
+}
diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h
new file mode 100644
index 00000000000..7f60ee5fd4c
--- /dev/null
+++ b/src/freedreno/ir3/instr-a3xx.h
@@ -0,0 +1,872 @@
+/*
+ * Copyright (c) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef INSTR_A3XX_H_
+#define INSTR_A3XX_H_
+
+#define PACKED __attribute__((__packed__))
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <assert.h>
+
+/* size of largest OPC field of all the instruction categories: */
+#define NOPC_BITS 6
+
+/* Combine an instruction category and a per-category opcode number into
+ * one value: the category sits above the low NOPC_BITS bits.  The category
+ * is scaled by multiplication instead of being shifted because the meta
+ * opcodes use category -1, and left-shifting a negative value is undefined
+ * behavior in C.  Both arguments are parenthesized so that arbitrary
+ * expressions can be passed.
+ */
+#define _OPC(cat, opc) (((cat) * (1 << NOPC_BITS)) | (opc))
+
+typedef enum {
+ /* Opcodes of every instruction category.  Each value is built with
+ * _OPC(cat, opc): the category goes above the low NOPC_BITS bits and
+ * the per-category opcode number goes in the low NOPC_BITS bits.
+ *
+ * category 0: */
+ OPC_NOP = _OPC(0, 0),
+ OPC_BR = _OPC(0, 1),
+ OPC_JUMP = _OPC(0, 2),
+ OPC_CALL = _OPC(0, 3),
+ OPC_RET = _OPC(0, 4),
+ OPC_KILL = _OPC(0, 5),
+ OPC_END = _OPC(0, 6),
+ OPC_EMIT = _OPC(0, 7),
+ OPC_CUT = _OPC(0, 8),
+ OPC_CHMASK = _OPC(0, 9),
+ OPC_CHSH = _OPC(0, 10),
+ OPC_FLOW_REV = _OPC(0, 11),
+
+ /* category 1: */
+ OPC_MOV = _OPC(1, 0),
+
+ /* category 2: */
+ OPC_ADD_F = _OPC(2, 0),
+ OPC_MIN_F = _OPC(2, 1),
+ OPC_MAX_F = _OPC(2, 2),
+ OPC_MUL_F = _OPC(2, 3),
+ OPC_SIGN_F = _OPC(2, 4),
+ OPC_CMPS_F = _OPC(2, 5),
+ OPC_ABSNEG_F = _OPC(2, 6),
+ OPC_CMPV_F = _OPC(2, 7),
+ /* 8 - invalid */
+ OPC_FLOOR_F = _OPC(2, 9),
+ OPC_CEIL_F = _OPC(2, 10),
+ OPC_RNDNE_F = _OPC(2, 11),
+ OPC_RNDAZ_F = _OPC(2, 12),
+ OPC_TRUNC_F = _OPC(2, 13),
+ /* 14-15 - invalid */
+ OPC_ADD_U = _OPC(2, 16),
+ OPC_ADD_S = _OPC(2, 17),
+ OPC_SUB_U = _OPC(2, 18),
+ OPC_SUB_S = _OPC(2, 19),
+ OPC_CMPS_U = _OPC(2, 20),
+ OPC_CMPS_S = _OPC(2, 21),
+ OPC_MIN_U = _OPC(2, 22),
+ OPC_MIN_S = _OPC(2, 23),
+ OPC_MAX_U = _OPC(2, 24),
+ OPC_MAX_S = _OPC(2, 25),
+ OPC_ABSNEG_S = _OPC(2, 26),
+ /* 27 - invalid */
+ OPC_AND_B = _OPC(2, 28),
+ OPC_OR_B = _OPC(2, 29),
+ OPC_NOT_B = _OPC(2, 30),
+ OPC_XOR_B = _OPC(2, 31),
+ /* 32 - invalid */
+ OPC_CMPV_U = _OPC(2, 33),
+ OPC_CMPV_S = _OPC(2, 34),
+ /* 35-47 - invalid */
+ OPC_MUL_U = _OPC(2, 48),
+ OPC_MUL_S = _OPC(2, 49),
+ OPC_MULL_U = _OPC(2, 50),
+ OPC_BFREV_B = _OPC(2, 51),
+ OPC_CLZ_S = _OPC(2, 52),
+ OPC_CLZ_B = _OPC(2, 53),
+ OPC_SHL_B = _OPC(2, 54),
+ OPC_SHR_B = _OPC(2, 55),
+ OPC_ASHR_B = _OPC(2, 56),
+ OPC_BARY_F = _OPC(2, 57),
+ OPC_MGEN_B = _OPC(2, 58),
+ OPC_GETBIT_B = _OPC(2, 59),
+ OPC_SETRM = _OPC(2, 60),
+ OPC_CBITS_B = _OPC(2, 61),
+ OPC_SHB = _OPC(2, 62),
+ OPC_MSAD = _OPC(2, 63),
+
+ /* category 3: */
+ OPC_MAD_U16 = _OPC(3, 0),
+ OPC_MADSH_U16 = _OPC(3, 1),
+ OPC_MAD_S16 = _OPC(3, 2),
+ OPC_MADSH_M16 = _OPC(3, 3), /* should this be .s16? */
+ OPC_MAD_U24 = _OPC(3, 4),
+ OPC_MAD_S24 = _OPC(3, 5),
+ OPC_MAD_F16 = _OPC(3, 6),
+ OPC_MAD_F32 = _OPC(3, 7),
+ OPC_SEL_B16 = _OPC(3, 8),
+ OPC_SEL_B32 = _OPC(3, 9),
+ OPC_SEL_S16 = _OPC(3, 10),
+ OPC_SEL_S32 = _OPC(3, 11),
+ OPC_SEL_F16 = _OPC(3, 12),
+ OPC_SEL_F32 = _OPC(3, 13),
+ OPC_SAD_S16 = _OPC(3, 14),
+ OPC_SAD_S32 = _OPC(3, 15),
+
+ /* category 4: */
+ OPC_RCP = _OPC(4, 0),
+ OPC_RSQ = _OPC(4, 1),
+ OPC_LOG2 = _OPC(4, 2),
+ OPC_EXP2 = _OPC(4, 3),
+ OPC_SIN = _OPC(4, 4),
+ OPC_COS = _OPC(4, 5),
+ OPC_SQRT = _OPC(4, 6),
+ // 7-63 - invalid
+
+ /* category 5: */
+ OPC_ISAM = _OPC(5, 0),
+ OPC_ISAML = _OPC(5, 1),
+ OPC_ISAMM = _OPC(5, 2),
+ OPC_SAM = _OPC(5, 3),
+ OPC_SAMB = _OPC(5, 4),
+ OPC_SAML = _OPC(5, 5),
+ OPC_SAMGQ = _OPC(5, 6),
+ OPC_GETLOD = _OPC(5, 7),
+ OPC_CONV = _OPC(5, 8),
+ OPC_CONVM = _OPC(5, 9),
+ OPC_GETSIZE = _OPC(5, 10),
+ OPC_GETBUF = _OPC(5, 11),
+ OPC_GETPOS = _OPC(5, 12),
+ OPC_GETINFO = _OPC(5, 13),
+ OPC_DSX = _OPC(5, 14),
+ OPC_DSY = _OPC(5, 15),
+ OPC_GATHER4R = _OPC(5, 16),
+ OPC_GATHER4G = _OPC(5, 17),
+ OPC_GATHER4B = _OPC(5, 18),
+ OPC_GATHER4A = _OPC(5, 19),
+ OPC_SAMGP0 = _OPC(5, 20),
+ OPC_SAMGP1 = _OPC(5, 21),
+ OPC_SAMGP2 = _OPC(5, 22),
+ OPC_SAMGP3 = _OPC(5, 23),
+ OPC_DSXPP_1 = _OPC(5, 24),
+ OPC_DSYPP_1 = _OPC(5, 25),
+ OPC_RGETPOS = _OPC(5, 26),
+ OPC_RGETINFO = _OPC(5, 27),
+
+ /* category 6: */
+ OPC_LDG = _OPC(6, 0), /* load-global */
+ OPC_LDL = _OPC(6, 1),
+ OPC_LDP = _OPC(6, 2),
+ OPC_STG = _OPC(6, 3), /* store-global */
+ OPC_STL = _OPC(6, 4),
+ OPC_STP = _OPC(6, 5),
+ OPC_STI = _OPC(6, 6),
+ OPC_G2L = _OPC(6, 7),
+ OPC_L2G = _OPC(6, 8),
+ OPC_PREFETCH = _OPC(6, 9),
+ OPC_LDLW = _OPC(6, 10),
+ OPC_STLW = _OPC(6, 11),
+ OPC_RESFMT = _OPC(6, 14),
+ OPC_RESINFO = _OPC(6, 15),
+ OPC_ATOMIC_ADD = _OPC(6, 16),
+ OPC_ATOMIC_SUB = _OPC(6, 17),
+ OPC_ATOMIC_XCHG = _OPC(6, 18),
+ OPC_ATOMIC_INC = _OPC(6, 19),
+ OPC_ATOMIC_DEC = _OPC(6, 20),
+ OPC_ATOMIC_CMPXCHG = _OPC(6, 21),
+ OPC_ATOMIC_MIN = _OPC(6, 22),
+ OPC_ATOMIC_MAX = _OPC(6, 23),
+ OPC_ATOMIC_AND = _OPC(6, 24),
+ OPC_ATOMIC_OR = _OPC(6, 25),
+ OPC_ATOMIC_XOR = _OPC(6, 26),
+ OPC_LDGB = _OPC(6, 27),
+ OPC_STGB = _OPC(6, 28),
+ OPC_STIB = _OPC(6, 29),
+ OPC_LDC = _OPC(6, 30),
+ OPC_LDLV = _OPC(6, 31),
+
+ /* category 7: */
+ OPC_BAR = _OPC(7, 0),
+ OPC_FENCE = _OPC(7, 1),
+
+ /* meta instructions (category -1): */
+ /* placeholder instr to mark shader inputs: */
+ OPC_META_INPUT = _OPC(-1, 0),
+ /* The "fan-in" and "fan-out" instructions are used for keeping
+ * track of instructions that write to multiple dst registers
+ * (fan-out) like texture sample instructions, or read multiple
+ * consecutive scalar registers (fan-in) (bary.f, texture samp)
+ */
+ OPC_META_FO = _OPC(-1, 2),
+ OPC_META_FI = _OPC(-1, 3),
+
+} opc_t;
+
+#define opc_cat(opc) ((int)((opc) >> NOPC_BITS)) /* category part of an _OPC() value; -1 for the meta opcodes relies on >> of a negative value being an arithmetic shift */
+#define opc_op(opc) ((unsigned)((opc) & ((1 << NOPC_BITS) - 1))) /* per-category opcode number: the low NOPC_BITS bits */
+
+typedef enum { /* operand data types; type_size() gives the bit width of each */
+ TYPE_F16 = 0,
+ TYPE_F32 = 1,
+ TYPE_U16 = 2,
+ TYPE_U32 = 3,
+ TYPE_S16 = 4,
+ TYPE_S32 = 5,
+ TYPE_U8 = 6,
+ TYPE_S8 = 7, // XXX I assume?
+} type_t;
+
+/* Width in bits of a value of the given type: 32, 16 or 8.  Any other
+ * value trips the assert and, when asserts are compiled out, yields 0.
+ */
+static inline uint32_t type_size(type_t type)
+{
+	if ((type == TYPE_F32) || (type == TYPE_U32) || (type == TYPE_S32))
+		return 32;
+
+	if ((type == TYPE_F16) || (type == TYPE_U16) || (type == TYPE_S16))
+		return 16;
+
+	if ((type == TYPE_U8) || (type == TYPE_S8))
+		return 8;
+
+	assert(0); /* invalid type */
+	return 0;
+}
+
+/* Returns 1 for the floating point types (F16, F32), otherwise 0. */
+static inline int type_float(type_t type)
+{
+	switch (type) {
+	case TYPE_F16:
+	case TYPE_F32:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* Returns 1 for the unsigned integer types (U8, U16, U32), otherwise 0. */
+static inline int type_uint(type_t type)
+{
+	switch (type) {
+	case TYPE_U8:
+	case TYPE_U16:
+	case TYPE_U32:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* Returns 1 for the signed integer types (S8, S16, S32), otherwise 0. */
+static inline int type_sint(type_t type)
+{
+	switch (type) {
+	case TYPE_S8:
+	case TYPE_S16:
+	case TYPE_S32:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+typedef union PACKED {
+ /* Bit-level view of a single register operand; all members overlay
+ * the same storage, and dummy32 exposes the whole value.
+ *
+ * normal gpr or const src register: */
+ struct PACKED {
+ uint32_t comp : 2;
+ uint32_t num : 10;
+ };
+ /* for immediate val: */
+ int32_t iim_val : 11;
+ /* to make compiler happy: */
+ uint32_t dummy32;
+ uint32_t dummy10 : 10;
+ int32_t idummy10 : 10;
+ uint32_t dummy11 : 11;
+ uint32_t dummy12 : 12;
+ uint32_t dummy13 : 13;
+ uint32_t dummy8 : 8;
+} reg_t;
+
+/* special registers: */
+#define REG_A0 61 /* address register */
+#define REG_P0 62 /* predicate register */
+
+/* Returns 1 when the operand's register number is one of the special
+ * registers (REG_A0 or REG_P0), otherwise 0.
+ */
+static inline int reg_special(reg_t reg)
+{
+	switch (reg.num) {
+	case REG_A0:
+	case REG_P0:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+typedef struct PACKED {
+ /* Encoding of a category 0 instruction as two 32-bit dwords.
+ *
+ * dword0: the immediate, whose field width differs per layout
+ * (16 bits for a3xx, 20 for a4xx, 32 for a5xx): */
+ union PACKED {
+ struct PACKED {
+ int16_t immed : 16;
+ uint32_t dummy1 : 16;
+ } a3xx;
+ struct PACKED {
+ int32_t immed : 20;
+ uint32_t dummy1 : 12;
+ } a4xx;
+ struct PACKED {
+ int32_t immed : 32;
+ } a5xx;
+ };
+
+ /* dword1: */
+ uint32_t dummy2 : 8;
+ uint32_t repeat : 3;
+ uint32_t dummy3 : 1;
+ uint32_t ss : 1;
+ uint32_t dummy4 : 7;
+ uint32_t inv : 1;
+ uint32_t comp : 2;
+ uint32_t opc : 4;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+} instr_cat0_t;
+
+typedef struct PACKED {
+ /* Encoding of a category 1 instruction as two 32-bit dwords.
+ *
+ * dword0: the source operand, in one of three overlapping forms
+ * (normal register, address relative, or 32-bit immediate): */
+ union PACKED {
+ /* for normal src register: */
+ struct PACKED {
+ uint32_t src : 11;
+ /* at least low bit of pad must be zero or it will
+ * look like a address relative src
+ */
+ uint32_t pad : 21;
+ };
+ /* for address relative: */
+ struct PACKED {
+ int32_t off : 10;
+ uint32_t src_rel_c : 1;
+ uint32_t src_rel : 1;
+ uint32_t unknown : 20;
+ };
+ /* for immediate: */
+ int32_t iim_val;
+ uint32_t uim_val;
+ float fim_val;
+ };
+
+ /* dword1: */
+ uint32_t dst : 8;
+ uint32_t repeat : 3;
+ uint32_t src_r : 1;
+ uint32_t ss : 1;
+ uint32_t ul : 1;
+ uint32_t dst_type : 3;
+ uint32_t dst_rel : 1;
+ uint32_t src_type : 3;
+ uint32_t src_c : 1;
+ uint32_t src_im : 1;
+ uint32_t even : 1;
+ uint32_t pos_inf : 1;
+ uint32_t must_be_0 : 2;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+} instr_cat1_t;
+
+typedef struct PACKED {
+ /* Encoding of a category 2 instruction as two 32-bit dwords.
+ *
+ * dword0: two 16-bit source slots (src1, then src2), each with three
+ * overlapping views: plain, address relative (rel), and const (c): */
+ union PACKED {
+ struct PACKED {
+ uint32_t src1 : 11;
+ uint32_t must_be_zero1: 2;
+ uint32_t src1_im : 1; /* immediate */
+ uint32_t src1_neg : 1; /* negate */
+ uint32_t src1_abs : 1; /* absolute value */
+ };
+ struct PACKED {
+ uint32_t src1 : 10;
+ uint32_t src1_c : 1; /* relative-const */
+ uint32_t src1_rel : 1; /* relative address */
+ uint32_t must_be_zero : 1;
+ uint32_t dummy : 3;
+ } rel1;
+ struct PACKED {
+ uint32_t src1 : 12;
+ uint32_t src1_c : 1; /* const */
+ uint32_t dummy : 3;
+ } c1;
+ };
+
+ union PACKED {
+ struct PACKED {
+ uint32_t src2 : 11;
+ uint32_t must_be_zero2: 2;
+ uint32_t src2_im : 1; /* immediate */
+ uint32_t src2_neg : 1; /* negate */
+ uint32_t src2_abs : 1; /* absolute value */
+ };
+ struct PACKED {
+ uint32_t src2 : 10;
+ uint32_t src2_c : 1; /* relative-const */
+ uint32_t src2_rel : 1; /* relative address */
+ uint32_t must_be_zero : 1;
+ uint32_t dummy : 3;
+ } rel2;
+ struct PACKED {
+ uint32_t src2 : 12;
+ uint32_t src2_c : 1; /* const */
+ uint32_t dummy : 3;
+ } c2;
+ };
+
+ /* dword1: */
+ uint32_t dst : 8;
+ uint32_t repeat : 2;
+ uint32_t sat : 1;
+ uint32_t src1_r : 1;
+ uint32_t ss : 1;
+ uint32_t ul : 1; /* dunno */
+ uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */
+ uint32_t ei : 1;
+ uint32_t cond : 3;
+ uint32_t src2_r : 1;
+ uint32_t full : 1; /* not half */
+ uint32_t opc : 6;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+} instr_cat2_t;
+
+typedef struct PACKED {
+ /* Encoding of a category 3 (three source) instruction as two 32-bit
+ * dwords.  src1 and src3 each get a 16-bit slot in dword0; src2 is an
+ * 8-bit field in dword1, and its flag bits (src2_c, src2_r, src2_neg)
+ * are interleaved into the two dword0 slots.
+ *
+ * dword0: */
+ union PACKED {
+ struct PACKED {
+ uint32_t src1 : 11;
+ uint32_t must_be_zero1: 2;
+ uint32_t src2_c : 1;
+ uint32_t src1_neg : 1;
+ uint32_t src2_r : 1;
+ };
+ struct PACKED {
+ uint32_t src1 : 10;
+ uint32_t src1_c : 1;
+ uint32_t src1_rel : 1;
+ uint32_t must_be_zero : 1;
+ uint32_t dummy : 3;
+ } rel1;
+ struct PACKED {
+ uint32_t src1 : 12;
+ uint32_t src1_c : 1;
+ uint32_t dummy : 3;
+ } c1;
+ };
+
+ union PACKED {
+ struct PACKED {
+ uint32_t src3 : 11;
+ uint32_t must_be_zero2: 2;
+ uint32_t src3_r : 1;
+ uint32_t src2_neg : 1;
+ uint32_t src3_neg : 1;
+ };
+ struct PACKED {
+ uint32_t src3 : 10;
+ uint32_t src3_c : 1;
+ uint32_t src3_rel : 1;
+ uint32_t must_be_zero : 1;
+ uint32_t dummy : 3;
+ } rel2;
+ struct PACKED {
+ uint32_t src3 : 12;
+ uint32_t src3_c : 1;
+ uint32_t dummy : 3;
+ } c2;
+ };
+
+ /* dword1: */
+ uint32_t dst : 8;
+ uint32_t repeat : 2;
+ uint32_t sat : 1;
+ uint32_t src1_r : 1;
+ uint32_t ss : 1;
+ uint32_t ul : 1;
+ uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */
+ uint32_t src2 : 8;
+ uint32_t opc : 4;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+} instr_cat3_t;
+
+/* Reports whether a category 3 instruction is a "full" one.  The answer
+ * is derived purely from the opcode: it is false for the opcodes listed
+ * below (the 16-bit variants plus sad.s32) and true for all others.
+ */
+static inline bool instr_cat3_full(instr_cat3_t *cat3)
+{
+	bool full = true;
+
+	switch (_OPC(3, cat3->opc)) {
+	case OPC_MAD_U16:
+	case OPC_MAD_S16:
+	case OPC_MAD_F16:
+	case OPC_SEL_B16:
+	case OPC_SEL_S16:
+	case OPC_SEL_F16:
+	case OPC_SAD_S16:
+	case OPC_SAD_S32: // really??
+		full = false;
+		break;
+	default:
+		break;
+	}
+
+	return full;
+}
+
+typedef struct PACKED {
+ /* Encoding of a category 4 (single source) instruction as two 32-bit
+ * dwords.
+ *
+ * dword0: one 16-bit source slot with plain / address relative (rel) /
+ * const (c) views, followed by 16 bits that appear to be ignored: */
+ union PACKED {
+ struct PACKED {
+ uint32_t src : 11;
+ uint32_t must_be_zero1: 2;
+ uint32_t src_im : 1; /* immediate */
+ uint32_t src_neg : 1; /* negate */
+ uint32_t src_abs : 1; /* absolute value */
+ };
+ struct PACKED {
+ uint32_t src : 10;
+ uint32_t src_c : 1; /* relative-const */
+ uint32_t src_rel : 1; /* relative address */
+ uint32_t must_be_zero : 1;
+ uint32_t dummy : 3;
+ } rel;
+ struct PACKED {
+ uint32_t src : 12;
+ uint32_t src_c : 1; /* const */
+ uint32_t dummy : 3;
+ } c;
+ };
+ uint32_t dummy1 : 16; /* seem to be ignored */
+
+ /* dword1: */
+ uint32_t dst : 8;
+ uint32_t repeat : 2;
+ uint32_t sat : 1;
+ uint32_t src_r : 1;
+ uint32_t ss : 1;
+ uint32_t ul : 1;
+ uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */
+ uint32_t dummy2 : 5; /* seem to be ignored */
+ uint32_t full : 1; /* not half */
+ uint32_t opc : 6;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+} instr_cat4_t;
+
+typedef struct PACKED {
+ /* Encoding of a category 5 instruction as two 32-bit dwords.
+ *
+ * dword0: three overlapping 32-bit layouts -- 'norm', 's2en', and an
+ * anonymous one holding the leading fields the other two share: */
+ union PACKED {
+ /* normal case: */
+ struct PACKED {
+ uint32_t full : 1; /* not half */
+ uint32_t src1 : 8;
+ uint32_t src2 : 8;
+ uint32_t dummy1 : 4; /* seem to be ignored */
+ uint32_t samp : 4;
+ uint32_t tex : 7;
+ } norm;
+ /* s2en case: */
+ struct PACKED {
+ uint32_t full : 1; /* not half */
+ uint32_t src1 : 8;
+ uint32_t src2 : 11;
+ uint32_t dummy1 : 1;
+ uint32_t src3 : 8;
+ uint32_t dummy2 : 3;
+ } s2en;
+ /* same in either case: */
+ // XXX I think, confirm this
+ struct PACKED {
+ uint32_t full : 1; /* not half */
+ uint32_t src1 : 8;
+ uint32_t pad : 23;
+ };
+ };
+
+ /* dword1: */
+ uint32_t dst : 8;
+ uint32_t wrmask : 4; /* write-mask */
+ uint32_t type : 3;
+ uint32_t dummy2 : 1; /* seems to be ignored */
+ uint32_t is_3d : 1;
+
+ uint32_t is_a : 1;
+ uint32_t is_s : 1;
+ uint32_t is_s2en : 1;
+ uint32_t is_o : 1;
+ uint32_t is_p : 1;
+
+ uint32_t opc : 5;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+} instr_cat5_t;
+
+/* dword0 encoding for src_off: [src1 + off], src2:
+ *
+ * Bit 0 (mustbe1) occupies the same position as src_off in instr_cat6_t
+ * and as mustbe0 in instr_cat6b_t.  dword1 is left opaque here.
+ */
+typedef struct PACKED {
+ /* dword0: */
+ uint32_t mustbe1 : 1;
+ int32_t off : 13;
+ uint32_t src1 : 8;
+ uint32_t src1_im : 1;
+ uint32_t src2_im : 1;
+ uint32_t src2 : 8;
+
+ /* dword1: */
+ uint32_t dword1;
+} instr_cat6a_t;
+
+/* dword0 encoding for !src_off: [src1], src2
+ *
+ * Bit 0 (mustbe0) occupies the same position as src_off in instr_cat6_t
+ * and as mustbe1 in instr_cat6a_t.  dword1 is left opaque here.
+ */
+typedef struct PACKED {
+ /* dword0: */
+ uint32_t mustbe0 : 1;
+ uint32_t src1 : 13;
+ uint32_t ignore0 : 8;
+ uint32_t src1_im : 1;
+ uint32_t src2_im : 1;
+ uint32_t src2 : 8;
+
+ /* dword1: */
+ uint32_t dword1;
+} instr_cat6b_t;
+
+/* dword1 encoding for dst_off:
+ *
+ * Bit 8 (mustbe1) occupies the same position as dst_off in instr_cat6_t.
+ * dword0 is left opaque here.
+ */
+typedef struct PACKED {
+ /* dword0: */
+ uint32_t dword0;
+
+ /* note: there is some weird stuff going on where sometimes
+ * cat6->a.off is involved.. but that seems like a bug in
+ * the blob, since it is used even if !cat6->src_off
+ * It would make sense for there to be some more bits to
+ * bring us to 11 bits worth of offset, but not sure..
+ */
+ int32_t off : 8;
+ uint32_t mustbe1 : 1;
+ uint32_t dst : 8;
+ uint32_t pad1 : 15;
+} instr_cat6c_t;
+
+/* dword1 encoding for !dst_off:
+ *
+ * Bit 8 (mustbe0) occupies the same position as dst_off in instr_cat6_t.
+ * dword0 is left opaque here.
+ */
+typedef struct PACKED {
+ /* dword0: */
+ uint32_t dword0;
+
+ uint32_t dst : 8;
+ uint32_t mustbe0 : 1;
+ uint32_t idx : 8;
+ uint32_t pad0 : 15;
+} instr_cat6d_t;
+
+/* ldgb and atomics..
+ *
+ * ldgb: pad0=0, pad3=1
+ * atomic .g: pad0=1, pad3=1
+ * .l: pad0=1, pad3=0
+ *
+ * The trailing pad fields overlay bits that instr_cat6_t names (type,
+ * opc, jmp_tgt, sync, opc_cat), as noted on each field.
+ */
+typedef struct PACKED {
+ /* dword0: */
+ uint32_t pad0 : 1;
+ uint32_t src3 : 8;
+ uint32_t d : 2;
+ uint32_t typed : 1;
+ uint32_t type_size : 2;
+ uint32_t src1 : 8;
+ uint32_t src1_im : 1;
+ uint32_t src2_im : 1;
+ uint32_t src2 : 8;
+
+ /* dword1: */
+ uint32_t dst : 8;
+ uint32_t mustbe0 : 1;
+ uint32_t src_ssbo : 8;
+ uint32_t pad2 : 3; // type
+ uint32_t g : 1;
+ uint32_t pad3 : 1;
+ uint32_t pad4 : 10; // opc/jmp_tgt/sync/opc_cat
+} instr_cat6ldgb_t;
+
+/* stgb, pad0=0, pad3=2
+ *
+ * NOTE(review): in this struct pad0 is the 9-bit field in the middle of
+ * dword0, while bit 0 is named mustbe1; confirm which one "pad0=0" refers to.
+ */
+typedef struct PACKED {
+ /* dword0: */
+ uint32_t mustbe1 : 1; // ???
+ uint32_t src1 : 8;
+ uint32_t d : 2;
+ uint32_t typed : 1;
+ uint32_t type_size : 2;
+ uint32_t pad0 : 9;
+ uint32_t src2_im : 1;
+ uint32_t src2 : 8;
+
+ /* dword1: */
+ uint32_t src3 : 8;
+ uint32_t src3_im : 1;
+ uint32_t dst_ssbo : 8;
+ uint32_t pad2 : 3; // type
+ uint32_t pad3 : 2;
+ uint32_t pad4 : 10; // opc/jmp_tgt/sync/opc_cat
+} instr_cat6stgb_t;
+
+typedef union PACKED { /* all category 6 layouts overlaid; the anonymous struct names the src_off/dst_off selector bits and the type/opc/flag fields */
+ instr_cat6a_t a;
+ instr_cat6b_t b;
+ instr_cat6c_t c;
+ instr_cat6d_t d;
+ instr_cat6ldgb_t ldgb;
+ instr_cat6stgb_t stgb;
+ struct PACKED {
+ /* dword0: */
+ uint32_t src_off : 1;
+ uint32_t pad1 : 31;
+
+ /* dword1: */
+ uint32_t pad2 : 8;
+ uint32_t dst_off : 1;
+ uint32_t pad3 : 8;
+ uint32_t type : 3;
+ uint32_t g : 1; /* or in some cases it means dst immed */
+ uint32_t pad4 : 1;
+ uint32_t opc : 5;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+ };
+} instr_cat6_t;
+
+typedef struct PACKED {
+ /* Encoding of a category 7 instruction (OPC_BAR, OPC_FENCE) as two
+ * 32-bit dwords.
+ *
+ * dword0: entirely padding in this layout: */
+ uint32_t pad1 : 32;
+
+ /* dword1: */
+ uint32_t pad2 : 12;
+ uint32_t ss : 1; /* maybe in the encoding, but blob only uses (sy) */
+ uint32_t pad3 : 6;
+ uint32_t w : 1; /* write */
+ uint32_t r : 1; /* read */
+ uint32_t l : 1; /* local */
+ uint32_t g : 1; /* global */
+ uint32_t opc : 4; /* presumed, but only a couple known OPCs */
+ uint32_t jmp_tgt : 1; /* (jp) */
+ uint32_t sync : 1; /* (sy) */
+ uint32_t opc_cat : 3;
+} instr_cat7_t;
+
+typedef union PACKED { /* any instruction: the per-category encodings overlaid, plus an anonymous view of the flag bits and opc_cat (top three bits of dword1) */
+ instr_cat0_t cat0;
+ instr_cat1_t cat1;
+ instr_cat2_t cat2;
+ instr_cat3_t cat3;
+ instr_cat4_t cat4;
+ instr_cat5_t cat5;
+ instr_cat6_t cat6;
+ instr_cat7_t cat7;
+ struct PACKED {
+ /* dword0: */
+ uint32_t pad1 : 32;
+
+ /* dword1: */
+ uint32_t pad2 : 12;
+ uint32_t ss : 1; /* cat1-cat4 (cat0??) and cat7 (?) */
+ uint32_t ul : 1; /* cat2-cat4 (and cat1 in blob.. which may be bug??) */
+ uint32_t pad3 : 13;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+
+ };
+} instr_t;
+
+/* Fetch the repeat field of an instruction.  Only categories 0-4 carry
+ * one; every other category yields 0.
+ */
+static inline uint32_t instr_repeat(instr_t *instr)
+{
+	uint32_t rpt = 0;
+
+	switch (instr->opc_cat) {
+	case 0:
+		rpt = instr->cat0.repeat;
+		break;
+	case 1:
+		rpt = instr->cat1.repeat;
+		break;
+	case 2:
+		rpt = instr->cat2.repeat;
+		break;
+	case 3:
+		rpt = instr->cat3.repeat;
+		break;
+	case 4:
+		rpt = instr->cat4.repeat;
+		break;
+	default:
+		break;
+	}
+
+	return rpt;
+}
+
+/* Fetch the sat flag of an instruction.  Only categories 2-4 carry one;
+ * every other category yields false.
+ */
+static inline bool instr_sat(instr_t *instr)
+{
+	bool sat = false;
+
+	switch (instr->opc_cat) {
+	case 2:
+		sat = instr->cat2.sat;
+		break;
+	case 3:
+		sat = instr->cat3.sat;
+		break;
+	case 4:
+		sat = instr->cat4.sat;
+		break;
+	default:
+		break;
+	}
+
+	return sat;
+}
+
+/* Fetch the per-category opcode number of an instruction from the opc
+ * field of its category's layout.  Category 1 always yields 0.
+ */
+static inline uint32_t instr_opc(instr_t *instr)
+{
+	uint32_t opc = 0;
+
+	switch (instr->opc_cat) {
+	case 0:
+		opc = instr->cat0.opc;
+		break;
+	case 2:
+		opc = instr->cat2.opc;
+		break;
+	case 3:
+		opc = instr->cat3.opc;
+		break;
+	case 4:
+		opc = instr->cat4.opc;
+		break;
+	case 5:
+		opc = instr->cat5.opc;
+		break;
+	case 6:
+		opc = instr->cat6.opc;
+		break;
+	case 7:
+		opc = instr->cat7.opc;
+		break;
+	case 1: /* category 1 has no opc field in its layout */
+	default:
+		break;
+	}
+
+	return opc;
+}
+
+/* True for the category 3 mad.* opcodes (not the madsh.* ones). */
+static inline bool is_mad(opc_t opc)
+{
+	return (opc == OPC_MAD_U16) || (opc == OPC_MAD_S16) ||
+		(opc == OPC_MAD_U24) || (opc == OPC_MAD_S24) ||
+		(opc == OPC_MAD_F16) || (opc == OPC_MAD_F32);
+}
+
+/* True for the two madsh.* opcodes. */
+static inline bool is_madsh(opc_t opc)
+{
+	return (opc == OPC_MADSH_U16) || (opc == OPC_MADSH_M16);
+}
+
+/* True for every OPC_ATOMIC_* opcode. */
+static inline bool is_atomic(opc_t opc)
+{
+	bool atomic = false;
+
+	switch (opc) {
+	case OPC_ATOMIC_ADD:
+	case OPC_ATOMIC_SUB:
+	case OPC_ATOMIC_INC:
+	case OPC_ATOMIC_DEC:
+	case OPC_ATOMIC_XCHG:
+	case OPC_ATOMIC_CMPXCHG:
+	case OPC_ATOMIC_MIN:
+	case OPC_ATOMIC_MAX:
+	case OPC_ATOMIC_AND:
+	case OPC_ATOMIC_OR:
+	case OPC_ATOMIC_XOR:
+		atomic = true;
+		break;
+	default:
+		break;
+	}
+
+	return atomic;
+}
+
+/* True for resfmt, resinfo, ldgb, stgb and stib. */
+static inline bool is_ssbo(opc_t opc)
+{
+	return (opc == OPC_RESFMT) || (opc == OPC_RESINFO) ||
+		(opc == OPC_LDGB) || (opc == OPC_STGB) ||
+		(opc == OPC_STIB);
+}
+
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out);
+
+#endif /* INSTR_A3XX_H_ */
diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
new file mode 100644
index 00000000000..3d1c4449b12
--- /dev/null
+++ b/src/freedreno/ir3/ir3.c
@@ -0,0 +1,941 @@
+/*
+ * Copyright (c) 2012 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ir3.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <errno.h>
+
+#include "util/bitscan.h"
+#include "util/ralloc.h"
+#include "util/u_math.h"
+
+#include "instr-a3xx.h"
+
+/* Allocate 'sz' zeroed bytes with 'shader' as the ralloc parent, so that
+ * everything can be freed easily in one shot together with the shader.
+ * The intent is a simple allocator carving allocations out of an up-front
+ * allocated heap; for now it just defers to ralloc (see the TODO).
+ */
+void * ir3_alloc(struct ir3 *shader, int sz)
+{
+	void *mem = rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
+
+	return mem;
+}
+
+/* Create an empty shader IR owned by 'compiler' (its ralloc parent), with
+ * zeroed input and output arrays of 'nin' and 'nout' entries and empty
+ * block and array lists.  Release it with ir3_destroy().
+ */
+struct ir3 * ir3_create(struct ir3_compiler *compiler,
+		unsigned nin, unsigned nout)
+{
+	struct ir3 *ir = rzalloc(compiler, struct ir3);
+
+	ir->compiler = compiler;
+
+	list_inithead(&ir->block_list);
+	list_inithead(&ir->array_list);
+
+	ir->ninputs = nin;
+	ir->noutputs = nout;
+	ir->inputs = ir3_alloc(ir, sizeof(ir->inputs[0]) * nin);
+	ir->outputs = ir3_alloc(ir, sizeof(ir->outputs[0]) * nout);
+
+	return ir;
+}
+
+void ir3_destroy(struct ir3 *shader)
+{ /* frees the shader through ralloc; memory from ir3_alloc() was allocated with the shader as ralloc parent */
+ ralloc_free(shader);
+}
+
+#define iassert(cond) do { /* on failure: debug_assert(), then make the enclosing function return -1 */ \
+ if (!(cond)) { \
+ debug_assert(cond); \
+ return -1; \
+ } } while (0)
+
+#define iassert_type(reg, full) do { /* iassert that reg's IR3_REG_HALF flag is clear when 'full' is true and set otherwise */ \
+ if ((full)) { \
+ iassert(!((reg)->flags & IR3_REG_HALF)); \
+ } else { \
+ iassert((reg)->flags & IR3_REG_HALF); \
+ } } while (0); /* NOTE(review): the expansion already ends in ';', so a use followed by ';' leaves an extra empty statement (breaks before an 'else') */
+
+static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
+ uint32_t repeat, uint32_t valid_flags)
+{ /* Packs one register operand into its raw bits (returned through
+ * val.dummy32) and, for non-immediates, folds the highest register
+ * index it touches into info's max_const / max_reg / max_half_reg.
+ * Flags outside 'valid_flags' are only reported via debug_printf.
+ */
+ reg_t val = { .dummy32 = 0 };
+
+ if (reg->flags & ~valid_flags) {
+ debug_printf("INVALID FLAGS: %x vs %x\n",
+ reg->flags, valid_flags);
+ }
+
+ if (!(reg->flags & IR3_REG_R))
+ repeat = 0;
+
+ if (reg->flags & IR3_REG_IMMED) {
+ val.iim_val = reg->iim_val;
+ } else {
+ unsigned components;
+ int16_t max;
+
+ if (reg->flags & IR3_REG_RELATIV) {
+ components = reg->size;
+ val.idummy10 = reg->array.offset;
+ max = (reg->array.offset + repeat + components - 1) >> 2;
+ } else {
+ components = util_last_bit(reg->wrmask);
+ val.comp = reg->num & 0x3;
+ val.num = reg->num >> 2;
+ max = (reg->num + repeat + components - 1) >> 2;
+ }
+
+ if (reg->flags & IR3_REG_CONST) {
+ info->max_const = MAX2(info->max_const, max);
+ } else if (val.num == 63) {
+ /* ignore writes to dummy register r63.x */
+ } else if (max < 48) {
+ if (reg->flags & IR3_REG_HALF) {
+ if (info->gpu_id >= 600) {
+ /* starting w/ a6xx, half regs conflict with full regs: */
+ info->max_reg = MAX2(info->max_reg, (max+1)/2);
+ } else {
+ info->max_half_reg = MAX2(info->max_half_reg, max);
+ }
+ } else {
+ info->max_reg = MAX2(info->max_reg, max);
+ }
+ }
+ }
+
+ return val.dummy32;
+}
+
+/* Encode a category 0 instruction from 'instr' into the two dwords at
+ * 'ptr'.  The immediate is stored through the layout selected by
+ * info->gpu_id (a3xx below 400, a4xx below 500, a5xx otherwise).
+ * Always returns 0.
+ */
+static int emit_cat0(struct ir3_instruction *instr, void *ptr,
+		struct ir3_info *info)
+{
+	instr_cat0_t *enc = ptr;
+
+	/* dword0: the width of the immediate field depends on the layout */
+	if (info->gpu_id < 400) {
+		enc->a3xx.immed = instr->cat0.immed;
+	} else if (info->gpu_id < 500) {
+		enc->a4xx.immed = instr->cat0.immed;
+	} else {
+		enc->a5xx.immed = instr->cat0.immed;
+	}
+
+	/* dword1: */
+	enc->opc_cat = 0;
+	enc->opc = instr->opc;
+	enc->comp = instr->cat0.comp;
+	enc->inv = instr->cat0.inv;
+	enc->repeat = instr->repeat;
+	enc->ss = (instr->flags & IR3_INSTR_SS) != 0;
+	enc->jmp_tgt = (instr->flags & IR3_INSTR_JP) != 0;
+	enc->sync = (instr->flags & IR3_INSTR_SY) != 0;
+
+	return 0;
+}
+
+static int emit_cat1(struct ir3_instruction *instr, void *ptr,
+ struct ir3_info *info)
+{ /* Encodes a category 1 instruction (regs[0] = dst, regs[1] = src) into
+ * 'ptr'.  Returns 0, or -1 when an iassert() check fails.  The source
+ * goes into dword0 as an immediate, an address relative offset, or a
+ * plain register, depending on the src flags.
+ */
+ struct ir3_register *dst = instr->regs[0];
+ struct ir3_register *src = instr->regs[1];
+ instr_cat1_t *cat1 = ptr;
+
+ iassert(instr->regs_count == 2);
+ iassert_type(dst, type_size(instr->cat1.dst_type) == 32);
+ if (!(src->flags & IR3_REG_IMMED))
+ iassert_type(src, type_size(instr->cat1.src_type) == 32);
+
+ if (src->flags & IR3_REG_IMMED) {
+ cat1->iim_val = src->iim_val;
+ cat1->src_im = 1;
+ } else if (src->flags & IR3_REG_RELATIV) {
+ cat1->off = reg(src, info, instr->repeat,
+ IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF | IR3_REG_RELATIV);
+ cat1->src_rel = 1;
+ cat1->src_rel_c = !!(src->flags & IR3_REG_CONST);
+ } else {
+ cat1->src = reg(src, info, instr->repeat,
+ IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF);
+ cat1->src_c = !!(src->flags & IR3_REG_CONST);
+ }
+
+ cat1->dst = reg(dst, info, instr->repeat,
+ IR3_REG_RELATIV | IR3_REG_EVEN |
+ IR3_REG_R | IR3_REG_POS_INF | IR3_REG_HALF);
+ cat1->repeat = instr->repeat;
+ cat1->src_r = !!(src->flags & IR3_REG_R);
+ cat1->ss = !!(instr->flags & IR3_INSTR_SS);
+ cat1->ul = !!(instr->flags & IR3_INSTR_UL);
+ cat1->dst_type = instr->cat1.dst_type;
+ cat1->dst_rel = !!(dst->flags & IR3_REG_RELATIV);
+ cat1->src_type = instr->cat1.src_type;
+ cat1->even = !!(dst->flags & IR3_REG_EVEN);
+ cat1->pos_inf = !!(dst->flags & IR3_REG_POS_INF);
+ cat1->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+ cat1->sync = !!(instr->flags & IR3_INSTR_SY);
+ cat1->opc_cat = 1;
+
+ return 0;
+}
+
+/* Encode a category 2 instruction (one or two sources) into the two
+ * dwords at 'ptr'.
+ *
+ * regs[0] is the destination and regs[1] the first source; regs[2], the
+ * second source, is only used when regs_count is 3.  Register usage is
+ * accumulated into 'info' through reg().  Returns 0 on success, or -1
+ * when one of the iassert() sanity checks fails.
+ */
+static int emit_cat2(struct ir3_instruction *instr, void *ptr,
+		struct ir3_info *info)
+{
+	struct ir3_register *dst = instr->regs[0];
+	struct ir3_register *src1 = instr->regs[1];
+	/* Only read regs[2] when the instruction really has a second source,
+	 * instead of relying on the slot past regs_count holding NULL.
+	 */
+	struct ir3_register *src2 =
+		(instr->regs_count > 2) ? instr->regs[2] : NULL;
+	instr_cat2_t *cat2 = ptr;
+	unsigned absneg = ir3_cat2_absneg(instr->opc);
+
+	iassert((instr->regs_count == 2) || (instr->regs_count == 3));
+
+	/* src1: pick the dword0 view that matches the operand kind */
+	if (src1->flags & IR3_REG_RELATIV) {
+		iassert(src1->array.offset < (1 << 10));
+		cat2->rel1.src1 = reg(src1, info, instr->repeat,
+				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+				IR3_REG_HALF | absneg);
+		cat2->rel1.src1_c = !!(src1->flags & IR3_REG_CONST);
+		cat2->rel1.src1_rel = 1;
+	} else if (src1->flags & IR3_REG_CONST) {
+		iassert(src1->num < (1 << 12));
+		cat2->c1.src1 = reg(src1, info, instr->repeat,
+				IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
+		cat2->c1.src1_c = 1;
+	} else {
+		iassert(src1->num < (1 << 11));
+		cat2->src1 = reg(src1, info, instr->repeat,
+				IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF |
+				absneg);
+	}
+	cat2->src1_im = !!(src1->flags & IR3_REG_IMMED);
+	cat2->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+	cat2->src1_abs = !!(src1->flags & (IR3_REG_FABS | IR3_REG_SABS));
+	cat2->src1_r = !!(src1->flags & IR3_REG_R);
+
+	if (src2) {
+		/* unless src2 is an immediate, both sources must agree on
+		 * IR3_REG_HALF
+		 */
+		iassert((src2->flags & IR3_REG_IMMED) ||
+				!((src1->flags ^ src2->flags) & IR3_REG_HALF));
+
+		if (src2->flags & IR3_REG_RELATIV) {
+			iassert(src2->array.offset < (1 << 10));
+			cat2->rel2.src2 = reg(src2, info, instr->repeat,
+					IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+					IR3_REG_HALF | absneg);
+			cat2->rel2.src2_c = !!(src2->flags & IR3_REG_CONST);
+			cat2->rel2.src2_rel = 1;
+		} else if (src2->flags & IR3_REG_CONST) {
+			iassert(src2->num < (1 << 12));
+			cat2->c2.src2 = reg(src2, info, instr->repeat,
+					IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
+			cat2->c2.src2_c = 1;
+		} else {
+			iassert(src2->num < (1 << 11));
+			cat2->src2 = reg(src2, info, instr->repeat,
+					IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF |
+					absneg);
+		}
+
+		cat2->src2_im = !!(src2->flags & IR3_REG_IMMED);
+		cat2->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+		cat2->src2_abs = !!(src2->flags & (IR3_REG_FABS | IR3_REG_SABS));
+		cat2->src2_r = !!(src2->flags & IR3_REG_R);
+	}
+
+	cat2->dst = reg(dst, info, instr->repeat,
+			IR3_REG_R | IR3_REG_EI | IR3_REG_HALF);
+	cat2->repeat = instr->repeat;
+	cat2->sat = !!(instr->flags & IR3_INSTR_SAT);
+	cat2->ss = !!(instr->flags & IR3_INSTR_SS);
+	cat2->ul = !!(instr->flags & IR3_INSTR_UL);
+	/* set when dst and src1 differ in IR3_REG_HALF */
+	cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF);
+	cat2->ei = !!(dst->flags & IR3_REG_EI);
+	cat2->cond = instr->cat2.condition;
+	cat2->full = ! (src1->flags & IR3_REG_HALF);
+	cat2->opc = instr->opc;
+	cat2->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+	cat2->sync = !!(instr->flags & IR3_INSTR_SY);
+	cat2->opc_cat = 2;
+
+	return 0;
+}
+
+static int emit_cat3(struct ir3_instruction *instr, void *ptr,
+ struct ir3_info *info)
+{ /* Encodes a category 3 instruction (regs[0] = dst, regs[1..3] = the
+ * three sources) into 'ptr'.  Returns 0, or -1 when an iassert() check
+ * fails.  All three sources must agree on IR3_REG_HALF with what the
+ * opcode implies (src_flags below).
+ */
+ struct ir3_register *dst = instr->regs[0];
+ struct ir3_register *src1 = instr->regs[1];
+ struct ir3_register *src2 = instr->regs[2];
+ struct ir3_register *src3 = instr->regs[3];
+ unsigned absneg = ir3_cat3_absneg(instr->opc);
+ instr_cat3_t *cat3 = ptr;
+ uint32_t src_flags = 0;
+
+ switch (instr->opc) {
+ case OPC_MAD_F16:
+ case OPC_MAD_U16:
+ case OPC_MAD_S16:
+ case OPC_SEL_B16:
+ case OPC_SEL_S16:
+ case OPC_SEL_F16:
+ case OPC_SAD_S16:
+ case OPC_SAD_S32: // really??
+ src_flags |= IR3_REG_HALF;
+ break;
+ default:
+ break;
+ }
+
+ iassert(instr->regs_count == 4);
+ iassert(!((src1->flags ^ src_flags) & IR3_REG_HALF));
+ iassert(!((src2->flags ^ src_flags) & IR3_REG_HALF));
+ iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF));
+
+ if (src1->flags & IR3_REG_RELATIV) {
+ iassert(src1->array.offset < (1 << 10));
+ cat3->rel1.src1 = reg(src1, info, instr->repeat,
+ IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+ IR3_REG_HALF | absneg);
+ cat3->rel1.src1_c = !!(src1->flags & IR3_REG_CONST);
+ cat3->rel1.src1_rel = 1;
+ } else if (src1->flags & IR3_REG_CONST) {
+ iassert(src1->num < (1 << 12));
+ cat3->c1.src1 = reg(src1, info, instr->repeat,
+ IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
+ cat3->c1.src1_c = 1;
+ } else {
+ iassert(src1->num < (1 << 11));
+ cat3->src1 = reg(src1, info, instr->repeat,
+ IR3_REG_R | IR3_REG_HALF | absneg);
+ }
+
+ cat3->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+ cat3->src1_r = !!(src1->flags & IR3_REG_R);
+
+ cat3->src2 = reg(src2, info, instr->repeat,
+ IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg);
+ cat3->src2_c = !!(src2->flags & IR3_REG_CONST);
+ cat3->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+ cat3->src2_r = !!(src2->flags & IR3_REG_R);
+
+
+ if (src3->flags & IR3_REG_RELATIV) {
+ iassert(src3->array.offset < (1 << 10));
+ cat3->rel2.src3 = reg(src3, info, instr->repeat,
+ IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+ IR3_REG_HALF | absneg);
+ cat3->rel2.src3_c = !!(src3->flags & IR3_REG_CONST);
+ cat3->rel2.src3_rel = 1;
+ } else if (src3->flags & IR3_REG_CONST) {
+ iassert(src3->num < (1 << 12));
+ cat3->c2.src3 = reg(src3, info, instr->repeat,
+ IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
+ cat3->c2.src3_c = 1;
+ } else {
+ iassert(src3->num < (1 << 11));
+ cat3->src3 = reg(src3, info, instr->repeat,
+ IR3_REG_R | IR3_REG_HALF | absneg);
+ }
+
+ cat3->src3_neg = !!(src3->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+ cat3->src3_r = !!(src3->flags & IR3_REG_R);
+
+ cat3->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+ cat3->repeat = instr->repeat;
+ cat3->sat = !!(instr->flags & IR3_INSTR_SAT);
+ cat3->ss = !!(instr->flags & IR3_INSTR_SS);
+ cat3->ul = !!(instr->flags & IR3_INSTR_UL);
+ cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF);
+ cat3->opc = instr->opc;
+ cat3->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+ cat3->sync = !!(instr->flags & IR3_INSTR_SY);
+ cat3->opc_cat = 3;
+
+ return 0;
+}
+/* Encode a category 4 instruction into the 64-bit slot at 'ptr'.
+ *
+ * regs[0] is the dst and regs[1] the only src.  The src is written
+ * through one of three overlaid views of the encoding (relative, const
+ * or plain), selected from the src register's flags.  Returns 0 on
+ * success.  NOTE(review): iassert() is defined outside this chunk;
+ * presumably it bails out with a non-zero return when the check fails,
+ * since ir3_assemble() treats non-zero as an error - confirm.
+ */
+static int emit_cat4(struct ir3_instruction *instr, void *ptr,
+ struct ir3_info *info)
+{
+ struct ir3_register *dst = instr->regs[0];
+ struct ir3_register *src = instr->regs[1];
+ instr_cat4_t *cat4 = ptr;
+
+ iassert(instr->regs_count == 2); /* dst plus exactly one src */
+
+ if (src->flags & IR3_REG_RELATIV) {
+ iassert(src->array.offset < (1 << 10));
+ cat4->rel.src = reg(src, info, instr->repeat,
+ IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG |
+ IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF);
+ cat4->rel.src_c = !!(src->flags & IR3_REG_CONST);
+ cat4->rel.src_rel = 1;
+ } else if (src->flags & IR3_REG_CONST) {
+ iassert(src->num < (1 << 12));
+ cat4->c.src = reg(src, info, instr->repeat,
+ IR3_REG_CONST | IR3_REG_FNEG | IR3_REG_FABS |
+ IR3_REG_R | IR3_REG_HALF);
+ cat4->c.src_c = 1;
+ } else {
+ iassert(src->num < (1 << 11));
+ cat4->src = reg(src, info, instr->repeat,
+ IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
+ IR3_REG_R | IR3_REG_HALF);
+ }
+
+ cat4->src_im = !!(src->flags & IR3_REG_IMMED);
+ cat4->src_neg = !!(src->flags & IR3_REG_FNEG);
+ cat4->src_abs = !!(src->flags & IR3_REG_FABS);
+ cat4->src_r = !!(src->flags & IR3_REG_R);
+
+ cat4->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+ cat4->repeat = instr->repeat;
+ cat4->sat = !!(instr->flags & IR3_INSTR_SAT);
+ cat4->ss = !!(instr->flags & IR3_INSTR_SS);
+ cat4->ul = !!(instr->flags & IR3_INSTR_UL);
+ cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF); /* set when src and dst differ in IR3_REG_HALF */
+ cat4->full = ! (src->flags & IR3_REG_HALF);
+ cat4->opc = instr->opc;
+ cat4->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+ cat4->sync = !!(instr->flags & IR3_INSTR_SY);
+ cat4->opc_cat = 4;
+
+ return 0;
+}
+
+/* Encode a category 5 instruction into the 64-bit slot at 'ptr'.
+ *
+ * regs[0] is the dst; regs[1..3] are optional srcs, where a later src
+ * may only be present if the earlier one is.  With IR3_INSTR_S2EN set,
+ * src2/src3 go into the s2en view of the encoding and cat5.samp/tex
+ * must both be zero; otherwise src3 must be absent and samp/tex are
+ * encoded as immediates in the norm view.  Returns 0 on success.
+ * NOTE(review): iassert()/iassert_type() are defined outside this
+ * chunk; presumably they bail out with a non-zero return when the
+ * check fails - confirm.
+ */
+static int emit_cat5(struct ir3_instruction *instr, void *ptr,
+		struct ir3_info *info)
+{
+	struct ir3_register *dst = instr->regs[0];
+	struct ir3_register *src1 = instr->regs[1];
+	struct ir3_register *src2 = instr->regs[2];
+	struct ir3_register *src3 = instr->regs[3];
+	instr_cat5_t *cat5 = ptr;
+
+	/* explicitly terminated, like every other iassert_type() use,
+	 * rather than relying on whatever the macro expands to:
+	 */
+	iassert_type(dst, type_size(instr->cat5.type) == 32);
+
+	assume(src1 || !src2);
+	assume(src2 || !src3);
+
+	if (src1) {
+		cat5->full = ! (src1->flags & IR3_REG_HALF);
+		cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF);
+	}
+
+	if (instr->flags & IR3_INSTR_S2EN) {
+		if (src2) {
+			/* src1 and src2 must agree on half vs full: */
+			iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
+			cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
+		}
+		if (src3) {
+			iassert(src3->flags & IR3_REG_HALF);
+			cat5->s2en.src3 = reg(src3, info, instr->repeat, IR3_REG_HALF);
+		}
+		iassert(!(instr->cat5.samp | instr->cat5.tex));
+	} else {
+		iassert(!src3);
+		if (src2) {
+			/* src1 and src2 must agree on half vs full: */
+			iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
+			cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
+		}
+		cat5->norm.samp = instr->cat5.samp;
+		cat5->norm.tex = instr->cat5.tex;
+	}
+
+	cat5->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+	cat5->wrmask = dst->wrmask;
+	cat5->type = instr->cat5.type;
+	cat5->is_3d = !!(instr->flags & IR3_INSTR_3D);
+	cat5->is_a = !!(instr->flags & IR3_INSTR_A);
+	cat5->is_s = !!(instr->flags & IR3_INSTR_S);
+	cat5->is_s2en = !!(instr->flags & IR3_INSTR_S2EN);
+	cat5->is_o = !!(instr->flags & IR3_INSTR_O);
+	cat5->is_p = !!(instr->flags & IR3_INSTR_P);
+	cat5->opc = instr->opc;
+	cat5->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+	cat5->sync = !!(instr->flags & IR3_INSTR_SY);
+	cat5->opc_cat = 5;
+
+	return 0;
+}
+/* Encode a category 6 instruction into the 64-bit slot at 'ptr'.
+ *
+ * Several encodings overlay the same slot; the one used is picked from
+ * the opcode (atomics, ldgb, resinfo, stgb/stib) or, for the remaining
+ * opcodes, from whether a src and/or dst offset has to be encoded.
+ * Returns 0 on success.  NOTE(review): iassert()/iassert_type() are
+ * defined outside this chunk; presumably they bail out with a non-zero
+ * return when the check fails - confirm.
+ */
+static int emit_cat6(struct ir3_instruction *instr, void *ptr,
+ struct ir3_info *info)
+{
+ struct ir3_register *dst, *src1, *src2;
+ instr_cat6_t *cat6 = ptr;
+ bool type_full = type_size(instr->cat6.type) == 32;
+
+ cat6->type = instr->cat6.type;
+ cat6->opc = instr->opc;
+ cat6->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+ cat6->sync = !!(instr->flags & IR3_INSTR_SY);
+ cat6->g = !!(instr->flags & IR3_INSTR_G);
+ cat6->opc_cat = 6;
+
+ switch (instr->opc) {
+ case OPC_RESINFO:
+ case OPC_RESFMT:
+ iassert_type(instr->regs[0], type_full); /* dst */
+ iassert_type(instr->regs[1], type_full); /* src1 */
+ break;
+ case OPC_L2G:
+ case OPC_G2L:
+ iassert_type(instr->regs[0], true); /* dst */
+ iassert_type(instr->regs[1], true); /* src1 */
+ break;
+ case OPC_STG:
+ case OPC_STL:
+ case OPC_STP:
+ case OPC_STI:
+ case OPC_STLW:
+ case OPC_STIB:
+ /* no dst, so regs[0] is dummy */
+ iassert_type(instr->regs[1], true); /* dst */
+ iassert_type(instr->regs[2], type_full); /* src1 */
+ iassert_type(instr->regs[3], true); /* src2 */
+ break;
+ default:
+ iassert_type(instr->regs[0], type_full); /* dst */
+ iassert_type(instr->regs[1], true); /* src1 */
+ if (instr->regs_count > 2)
+ iassert_type(instr->regs[2], true); /* src2 */
+ break;
+ }
+
+ /* the "dst" for a store instruction is (from the perspective
+ * of data flow in the shader, ie. register use/def, etc) in
+ * fact a register that is read by the instruction, rather
+ * than written:
+ */
+ if (is_store(instr)) {
+ iassert(instr->regs_count >= 3);
+
+ dst = instr->regs[1];
+ src1 = instr->regs[2];
+ src2 = (instr->regs_count >= 4) ? instr->regs[3] : NULL;
+ } else {
+ iassert(instr->regs_count >= 2);
+
+ dst = instr->regs[0];
+ src1 = instr->regs[1];
+ src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL;
+ }
+
+ /* TODO we need a more comprehensive list about which instructions
+ * can be encoded which way. Or possibly use IR3_INSTR_0 flag to
+ * indicate to use the src_off encoding even if offset is zero
+ * (but then what to do about dst_off?)
+ */
+ if (is_atomic(instr->opc)) {
+ instr_cat6ldgb_t *ldgb = ptr;
+
+ /* maybe these two bits both determine the instruction encoding? */
+ cat6->src_off = false;
+
+ ldgb->d = instr->cat6.d - 1;
+ ldgb->typed = instr->cat6.typed;
+ ldgb->type_size = instr->cat6.iim_val - 1;
+
+ ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+
+ if (ldgb->g) { /* NOTE(review): read back through the ldgb view; presumably aliases cat6->g set above - confirm */
+ struct ir3_register *src3 = instr->regs[3];
+ struct ir3_register *src4 = instr->regs[4];
+
+ /* first src is src_ssbo: */
+ iassert(src1->flags & IR3_REG_IMMED);
+ ldgb->src_ssbo = src1->uim_val;
+
+ ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+ ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED);
+ ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+ ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED);
+
+ ldgb->src3 = reg(src4, info, instr->repeat, 0);
+ ldgb->pad0 = 0x1;
+ ldgb->pad3 = 0x1;
+ } else {
+ ldgb->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
+ ldgb->src1_im = !!(src1->flags & IR3_REG_IMMED);
+ ldgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+ ldgb->src2_im = !!(src2->flags & IR3_REG_IMMED);
+ ldgb->pad0 = 0x1;
+ ldgb->pad3 = 0x0;
+ }
+
+ return 0;
+ } else if (instr->opc == OPC_LDGB) {
+ struct ir3_register *src3 = instr->regs[3];
+ instr_cat6ldgb_t *ldgb = ptr;
+
+ /* maybe these two bits both determine the instruction encoding? */
+ cat6->src_off = false;
+
+ ldgb->d = instr->cat6.d - 1;
+ ldgb->typed = instr->cat6.typed;
+ ldgb->type_size = instr->cat6.iim_val - 1;
+
+ ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+
+ /* first src is src_ssbo: */
+ iassert(src1->flags & IR3_REG_IMMED);
+ ldgb->src_ssbo = src1->uim_val;
+
+ /* then next two are src1/src2: */
+ ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+ ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED);
+ ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+ ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED);
+
+ ldgb->pad0 = 0x0;
+ ldgb->pad3 = 0x1;
+
+ return 0;
+ } else if (instr->opc == OPC_RESINFO) {
+ instr_cat6ldgb_t *ldgb = ptr;
+
+ ldgb->d = instr->cat6.d - 1;
+
+ ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+
+ /* first src is src_ssbo: */
+ iassert(src1->flags & IR3_REG_IMMED);
+ ldgb->src_ssbo = src1->uim_val;
+
+ return 0;
+ } else if ((instr->opc == OPC_STGB) || (instr->opc == OPC_STIB)) {
+ struct ir3_register *src3 = instr->regs[4];
+ instr_cat6stgb_t *stgb = ptr;
+
+ /* maybe these two bits both determine the instruction encoding? */
+ cat6->src_off = true;
+ stgb->pad3 = 0x2;
+
+ stgb->d = instr->cat6.d - 1;
+ stgb->typed = instr->cat6.typed;
+ stgb->type_size = instr->cat6.iim_val - 1;
+
+ /* first src is dst_ssbo: */
+ iassert(dst->flags & IR3_REG_IMMED);
+ stgb->dst_ssbo = dst->uim_val;
+
+ /* then src1/src2/src3: */
+ stgb->src1 = reg(src1, info, instr->repeat, 0);
+ stgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+ stgb->src2_im = !!(src2->flags & IR3_REG_IMMED);
+ stgb->src3 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+ stgb->src3_im = !!(src3->flags & IR3_REG_IMMED);
+
+ return 0;
+ } else if (instr->cat6.src_offset || (instr->opc == OPC_LDG) ||
+ (instr->opc == OPC_LDL)) {
+ instr_cat6a_t *cat6a = ptr;
+
+ cat6->src_off = true;
+
+ cat6a->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
+ cat6a->src1_im = !!(src1->flags & IR3_REG_IMMED);
+ if (src2) {
+ cat6a->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+ cat6a->src2_im = !!(src2->flags & IR3_REG_IMMED);
+ }
+ cat6a->off = instr->cat6.src_offset;
+ } else {
+ instr_cat6b_t *cat6b = ptr;
+
+ cat6->src_off = false;
+
+ cat6b->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED | IR3_REG_HALF);
+ cat6b->src1_im = !!(src1->flags & IR3_REG_IMMED);
+ if (src2) {
+ cat6b->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+ cat6b->src2_im = !!(src2->flags & IR3_REG_IMMED);
+ }
+ }
+
+ if (instr->cat6.dst_offset || (instr->opc == OPC_STG) ||
+ (instr->opc == OPC_STL)) {
+ instr_cat6c_t *cat6c = ptr;
+ cat6->dst_off = true;
+ cat6c->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+ cat6c->off = instr->cat6.dst_offset;
+ } else {
+ instr_cat6d_t *cat6d = ptr;
+ cat6->dst_off = false;
+ cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+ }
+
+ return 0;
+}
+
+/* Encode a category 7 instruction into the 64-bit slot at 'ptr'.
+ *
+ * Only flag bits and the opcode are written; no registers are encoded
+ * and 'info' is not touched.  Always returns 0.
+ */
+static int emit_cat7(struct ir3_instruction *instr, void *ptr,
+		struct ir3_info *info)
+{
+	instr_cat7_t *enc = ptr;
+
+	/* (ss) sync flag: */
+	enc->ss = (instr->flags & IR3_INSTR_SS) ? 1 : 0;
+
+	/* write/read/local/global bits, copied straight from the IR: */
+	enc->w = instr->cat7.w;
+	enc->r = instr->cat7.r;
+	enc->l = instr->cat7.l;
+	enc->g = instr->cat7.g;
+
+	enc->opc = instr->opc;
+
+	/* (jp) and (sy) flags: */
+	enc->jmp_tgt = (instr->flags & IR3_INSTR_JP) ? 1 : 0;
+	enc->sync = (instr->flags & IR3_INSTR_SY) ? 1 : 0;
+
+	enc->opc_cat = 7;
+
+	return 0;
+}
+
+/* Per-category encoders; ir3_assemble() indexes this table with
+ * opc_cat(instr->opc), so entry N handles category N.
+ */
+static int (*emit[])(struct ir3_instruction *instr, void *ptr,
+		struct ir3_info *info) = {
+	emit_cat0,
+	emit_cat1,
+	emit_cat2,
+	emit_cat3,
+	emit_cat4,
+	emit_cat5,
+	emit_cat6,
+	emit_cat7,
+};
+
+/* Assemble the shader into its binary encoding.
+ *
+ * Resets and fills in 'info' (size, instruction count, sync-bit counts;
+ * the max_* fields start at -1 and are passed to the per-category
+ * encoders).  Each instruction occupies two dwords and the buffer is
+ * padded to a whole instruction group.
+ *
+ * Returns a calloc()'d buffer of info->sizedwords dwords which the
+ * caller must free(), or NULL if allocation or encoding of any
+ * instruction fails.
+ */
+void * ir3_assemble(struct ir3 *shader, struct ir3_info *info,
+		uint32_t gpu_id)
+{
+	uint32_t *ptr, *dwords;
+
+	info->gpu_id = gpu_id;
+	info->max_reg = -1;
+	info->max_half_reg = -1;
+	info->max_const = -1;
+	info->instrs_count = 0;
+	info->sizedwords = 0;
+	info->ss = info->sy = 0;
+
+	list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+			info->sizedwords += 2;
+		}
+	}
+
+	/* need an integer number of instruction "groups" (sets of 16
+	 * instructions on a4xx or sets of 4 instructions on a3xx),
+	 * so pad out w/ NOPs if needed: (NOTE each instruction is 64bits)
+	 */
+	if (gpu_id >= 400) {
+		info->sizedwords = align(info->sizedwords, 16 * 2);
+	} else {
+		info->sizedwords = align(info->sizedwords, 4 * 2);
+	}
+
+	ptr = dwords = calloc(4, info->sizedwords);
+	if (!ptr) {
+		/* allocation failed (or nothing to allocate); the encoders
+		 * below must never be handed a NULL slot:
+		 */
+		return NULL;
+	}
+
+	list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+			int ret = emit[opc_cat(instr->opc)](instr, dwords, info);
+			if (ret)
+				goto fail;
+			/* instrs_count is expanded to account for repeats: */
+			info->instrs_count += 1 + instr->repeat;
+			dwords += 2;
+
+			if (instr->flags & IR3_INSTR_SS)
+				info->ss++;
+
+			if (instr->flags & IR3_INSTR_SY)
+				info->sy++;
+		}
+	}
+
+	return ptr;
+
+fail:
+	free(ptr);
+	return NULL;
+}
+
+/* Allocate a register through ir3_alloc() and initialize its number
+ * and flags; wrmask starts out as 1.
+ */
+static struct ir3_register * reg_create(struct ir3 *shader,
+		int num, int flags)
+{
+	struct ir3_register *r = ir3_alloc(shader, sizeof(*r));
+
+	r->num = num;
+	r->flags = flags;
+	r->wrmask = 1;
+
+	return r;
+}
+
+/* Append 'instr' to the end of the block's instruction list.  In DEBUG
+ * builds the instruction also gets the next serial number.  Instructions
+ * for which is_input() holds are additionally recorded in the shader's
+ * baryfs array.
+ */
+static void insert_instr(struct ir3_block *block,
+		struct ir3_instruction *instr)
+{
+	struct ir3 *ir = block->shader;
+
+#ifdef DEBUG
+	ir->instr_count++;
+	instr->serialno = ir->instr_count;
+#endif
+	list_addtail(&instr->node, &block->instr_list);
+
+	if (!is_input(instr))
+		return;
+
+	array_insert(ir, ir->baryfs, instr);
+}
+
+/* Allocate a new, empty block belonging to 'shader'.  The block's list
+ * node and instruction list are initialized, but the block is not
+ * linked into any list here.
+ */
+struct ir3_block * ir3_block_create(struct ir3 *shader)
+{
+	struct ir3_block *blk = ir3_alloc(shader, sizeof(struct ir3_block));
+
+	blk->shader = shader;
+#ifdef DEBUG
+	blk->serialno = ++shader->block_count;
+#endif
+	list_inithead(&blk->instr_list);
+	list_inithead(&blk->node);
+
+	return blk;
+}
+
+/* Allocate an instruction together with room for 'nreg' register
+ * pointers in a single allocation; the regs table sits directly
+ * behind the instruction struct.
+ */
+static struct ir3_instruction *instr_create(struct ir3_block *block, int nreg)
+{
+	struct ir3_instruction *instr;
+	unsigned sz = sizeof(*instr) + (nreg * sizeof(instr->regs[0]));
+
+	instr = ir3_alloc(block->shader, sz);
+
+	/* the register pointer table starts right after the struct: */
+	instr->regs = (struct ir3_register **)(instr + 1);
+
+#ifdef DEBUG
+	instr->regs_max = nreg;
+#endif
+
+	return instr;
+}
+
+/* Create an instruction with opcode 'opc' and room for 'nreg'
+ * registers, and append it to 'block'.
+ */
+struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
+		opc_t opc, int nreg)
+{
+	struct ir3_instruction *new_instr = instr_create(block, nreg);
+
+	/* opcode and block are filled in before insert_instr() looks
+	 * at the instruction:
+	 */
+	new_instr->opc = opc;
+	new_instr->block = block;
+
+	insert_instr(block, new_instr);
+
+	return new_instr;
+}
+
+/* Create an instruction with room for the default number of registers
+ * and append it to 'block'.
+ */
+struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc)
+{
+	/* NOTE: we could be slightly more clever, at least for non-meta,
+	 * and choose # of regs based on category.
+	 */
+	const int default_nreg = 4;
+
+	return ir3_instr_create2(block, opc, default_nreg);
+}
+
+struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
+{
+ struct ir3_instruction *new_instr = instr_create(instr->block,
+ instr->regs_count);
+ struct ir3_register **regs;
+ unsigned i;
+
+ regs = new_instr->regs;
+ *new_instr = *instr;
+ new_instr->regs = regs;
+
+ insert_instr(instr->block, new_instr);
+
+ /* clone registers: */
+ new_instr->regs_count = 0;
+ for (i = 0; i < instr->regs_count; i++) {
+ struct ir3_register *reg = instr->regs[i];
+ struct ir3_register *new_reg =
+ ir3_reg_create(new_instr, reg->num, reg->flags);
+ *new_reg = *reg;
+ }
+
+ return new_instr;
+}
+
+/* Record 'dep' as a false dependency of 'instr', so that 'dep' gets
+ * scheduled first.  The dependency is appended to instr->deps.
+ */
+void
+ir3_instr_add_dep(struct ir3_instruction *instr,
+		struct ir3_instruction *dep)
+{
+	array_insert(instr, instr->deps, dep);
+}
+
+/* Create a register with the given number and flags and append it to
+ * the instruction's register table.  DEBUG builds assert that the
+ * table still has room.
+ */
+struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
+		int num, int flags)
+{
+	struct ir3_register *new_reg =
+			reg_create(instr->block->shader, num, flags);
+
+#ifdef DEBUG
+	debug_assert(instr->regs_count < instr->regs_max);
+#endif
+	instr->regs[instr->regs_count] = new_reg;
+	instr->regs_count++;
+
+	return new_reg;
+}
+
+/* Return a newly allocated copy of 'reg'.  The copy is not attached
+ * to any instruction.
+ */
+struct ir3_register * ir3_reg_clone(struct ir3 *shader,
+		struct ir3_register *reg)
+{
+	struct ir3_register *copy;
+
+	copy = reg_create(shader, 0, 0);
+	*copy = *reg;
+
+	return copy;
+}
+
+/* Set the address-register instruction that 'instr' depends on.  When
+ * the address actually changes, 'instr' is also appended to the
+ * shader's list of indirect instructions.
+ */
+void
+ir3_instr_set_address(struct ir3_instruction *instr,
+		struct ir3_instruction *addr)
+{
+	struct ir3 *shader;
+
+	/* nothing to do if the address is already set to 'addr': */
+	if (instr->address == addr)
+		return;
+
+	shader = instr->block->shader;
+	instr->address = addr;
+	array_insert(shader, shader->indirects, instr);
+}
+
+/* Clear IR3_INSTR_MARK on every instruction in the block. */
+void
+ir3_block_clear_mark(struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, cur, &block->instr_list, node) {
+		cur->flags &= ~IR3_INSTR_MARK;
+	}
+}
+
+/* Clear IR3_INSTR_MARK on every instruction of every block. */
+void
+ir3_clear_mark(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, cur, &ir->block_list, node)
+		ir3_block_clear_mark(cur);
+}
+
+/* Number all instructions sequentially (instr->ip) and record each
+ * block's first/last ip in block->start_ip/end_ip.  Returns the total
+ * number of instructions.
+ *
+ * note: this will destroy instr->depth (ip shares storage with it),
+ * don't do it until after sched!
+ */
+unsigned
+ir3_count_instructions(struct ir3 *ir)
+{
+	unsigned cnt = 0;
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		const unsigned first = cnt;
+
+		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+			instr->ip = cnt++;
+		}
+
+		if (cnt == first) {
+			/* Empty block: there is no first/last instruction to
+			 * take an ip from (looking one up on an empty list
+			 * would read unrelated memory), so use the position
+			 * the next instruction will get.
+			 */
+			block->start_ip = block->end_ip = first;
+		} else {
+			block->start_ip = first;
+			block->end_ip = cnt - 1;
+		}
+	}
+	return cnt;
+}
+
+struct ir3_array *
+ir3_lookup_array(struct ir3 *ir, unsigned id)
+{
+ list_for_each_entry (struct ir3_array, arr, &ir->array_list, node)
+ if (arr->id == id)
+ return arr;
+ return NULL;
+}
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
new file mode 100644
index 00000000000..ea3218828df
--- /dev/null
+++ b/src/freedreno/ir3/ir3.h
@@ -0,0 +1,1394 @@
+/*
+ * Copyright (c) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IR3_H_
+#define IR3_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "compiler/shader_enums.h"
+
+#include "util/u_debug.h"
+#include "util/list.h"
+
+#include "instr-a3xx.h"
+
+/* low level intermediate representation of an adreno shader program */
+
+struct ir3_compiler;
+struct ir3;
+struct ir3_instruction;
+struct ir3_block;
+/* Summary of an assembled shader, (re)initialized and filled in by ir3_assemble(). */
+struct ir3_info {
+ uint32_t gpu_id; /* gpu_id the shader was assembled for */
+ uint16_t sizedwords; /* size in dwords, padded to a whole instruction group */
+ uint16_t instrs_count; /* expanded to account for rpt's */
+ /* NOTE: max_reg, etc, does not include registers not touched
+ * by the shader (ie. vertex fetched via VFD_DECODE but not
+ * touched by shader)
+ */
+ int8_t max_reg; /* highest GPR # used by shader */
+ int8_t max_half_reg;
+ int16_t max_const;
+
+ /* number of sync bits: */
+ uint16_t ss, sy;
+};
+/* A single register operand (dst or src) of an ir3_instruction. */
+struct ir3_register {
+ enum {
+ IR3_REG_CONST = 0x001,
+ IR3_REG_IMMED = 0x002,
+ IR3_REG_HALF = 0x004,
+ /* high registers are used for some things in compute shaders,
+ * for example. Seems to be for things that are global to all
+ * threads in a wave, so possibly these are global/shared by
+ * all the threads in the wave?
+ */
+ IR3_REG_HIGH = 0x008,
+ IR3_REG_RELATIV= 0x010,
+ IR3_REG_R = 0x020,
+ /* Most instructions, it seems, can do float abs/neg but not
+ * integer. The CP pass needs to know what is intended (int or
+ * float) in order to do the right thing. For this reason the
+ * abs/neg flags are split out into float and int variants. In
+ * addition, .b (bitwise) operations, the negate is actually a
+ * bitwise not, so split that out into a new flag to make it
+ * more clear.
+ */
+ IR3_REG_FNEG = 0x040,
+ IR3_REG_FABS = 0x080,
+ IR3_REG_SNEG = 0x100,
+ IR3_REG_SABS = 0x200,
+ IR3_REG_BNOT = 0x400,
+ IR3_REG_EVEN = 0x800,
+ IR3_REG_POS_INF= 0x1000,
+ /* (ei) flag, end-input? Set on last bary, presumably to signal
+ * that the shader needs no more input:
+ */
+ IR3_REG_EI = 0x2000,
+ /* meta-flags, for intermediate stages of IR, ie.
+ * before register assignment is done:
+ */
+ IR3_REG_SSA = 0x4000, /* 'instr' is ptr to assigning instr */
+ IR3_REG_ARRAY = 0x8000,
+
+ } flags;
+
+ /* normal registers:
+ * the component is in the low two bits of the reg #, so
+ * rN.x becomes: (N << 2) | x
+ */
+ int num;
+ union {
+ /* immediate: */
+ int32_t iim_val;
+ uint32_t uim_val;
+ float fim_val;
+ /* relative: */
+ struct {
+ uint16_t id;
+ int16_t offset;
+ } array;
+ };
+
+ /* For IR3_REG_SSA, src registers contain ptr back to assigning
+ * instruction.
+ *
+ * For IR3_REG_ARRAY, the pointer is back to the last dependent
+ * array access (although the net effect is the same, it points
+ * back to a previous instruction that we depend on).
+ */
+ struct ir3_instruction *instr;
+
+ union {
+ /* used for cat5 instructions, but also for internal/IR level
+ * tracking of what registers are read/written by an instruction.
+ * wrmask may be a bad name since it is used to represent both
+ * src and dst that touch multiple adjacent registers.
+ */
+ unsigned wrmask;
+ /* for relative addressing, 32bits for array size is too small,
+ * but otoh we don't need to deal with disjoint sets, so instead
+ * use a simple size field (number of scalar components).
+ */
+ unsigned size;
+ };
+};
+
+/*
+ * Stupid/simple growable array implementation:
+ *
+ * DECLARE_ARRAY(type, name) declares three members: 'name' (pointer to
+ * the elements), 'name_count' (number of elements in use) and 'name_sz'
+ * (allocated capacity, in elements).
+ *
+ * array_insert(ctx, arr, val) appends 'val'.  When the array is full
+ * it is first grown with reralloc_size(ctx, ...) to the larger of
+ * twice the current capacity and 16 elements.  'arr' is expanded
+ * several times, so it must be an expression without side effects.
+ * NOTE(review): the result of reralloc_size() is not checked before
+ * the store.
+ */
+#define DECLARE_ARRAY(type, name) \
+ unsigned name ## _count, name ## _sz; \
+ type * name;
+
+#define array_insert(ctx, arr, val) do { \
+ if (arr ## _count == arr ## _sz) { \
+ arr ## _sz = MAX2(2 * arr ## _sz, 16); \
+ arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \
+ } \
+ arr[arr ##_count++] = val; \
+ } while (0)
+/* One instruction in the IR.  It is linked into its block's instruction
+ * list through 'node' and carries regs_count register operands in
+ * 'regs'.
+ */
+struct ir3_instruction {
+ struct ir3_block *block;
+ opc_t opc;
+ enum {
+ /* (sy) flag is set on first instruction, and after sample
+ * instructions (probably just on RAW hazard).
+ */
+ IR3_INSTR_SY = 0x001,
+ /* (ss) flag is set on first instruction, and first instruction
+ * to depend on the result of "long" instructions (RAW hazard):
+ *
+ * rcp, rsq, log2, exp2, sin, cos, sqrt
+ *
+ * It seems to synchronize until all in-flight instructions are
+ * completed, for example:
+ *
+ * rsq hr1.w, hr1.w
+ * add.f hr2.z, (neg)hr2.z, hc0.y
+ * mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
+ * rsq hr2.x, hr2.x
+ * (rpt1)nop
+ * mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
+ * nop
+ * mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
+ * (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
+ * (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
+ *
+ * The last mul.f does not have (ss) set, presumably because the
+ * (ss) on the previous instruction does the job.
+ *
+ * The blob driver also seems to set it on WAR hazards, although
+ * not really clear if this is needed or just blob compiler being
+ * sloppy. So far I haven't found a case where removing the (ss)
+ * causes problems for WAR hazard, but I could just be getting
+ * lucky:
+ *
+ * rcp r1.y, r3.y
+ * (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
+ *
+ */
+ IR3_INSTR_SS = 0x002,
+ /* (jp) flag is set on jump targets:
+ */
+ IR3_INSTR_JP = 0x004,
+ IR3_INSTR_UL = 0x008,
+ IR3_INSTR_3D = 0x010,
+ IR3_INSTR_A = 0x020,
+ IR3_INSTR_O = 0x040,
+ IR3_INSTR_P = 0x080,
+ IR3_INSTR_S = 0x100,
+ IR3_INSTR_S2EN = 0x200,
+ IR3_INSTR_G = 0x400,
+ IR3_INSTR_SAT = 0x800,
+ /* meta-flags, for intermediate stages of IR, ie.
+ * before register assignment is done:
+ */
+ IR3_INSTR_MARK = 0x1000,
+ IR3_INSTR_UNUSED= 0x2000,
+ } flags;
+ int repeat;
+#ifdef DEBUG
+ unsigned regs_max;
+#endif
+ unsigned regs_count;
+ struct ir3_register **regs;
+ union {
+ struct {
+ char inv;
+ char comp;
+ int immed;
+ struct ir3_block *target;
+ } cat0;
+ struct {
+ type_t src_type, dst_type;
+ } cat1;
+ struct {
+ enum {
+ IR3_COND_LT = 0,
+ IR3_COND_LE = 1,
+ IR3_COND_GT = 2,
+ IR3_COND_GE = 3,
+ IR3_COND_EQ = 4,
+ IR3_COND_NE = 5,
+ } condition;
+ } cat2;
+ struct {
+ unsigned samp, tex;
+ type_t type;
+ } cat5;
+ struct {
+ type_t type;
+ int src_offset;
+ int dst_offset;
+ int iim_val : 3; /* for ldgb/stgb, # of components */
+ int d : 3;
+ bool typed : 1;
+ } cat6;
+ struct {
+ unsigned w : 1; /* write */
+ unsigned r : 1; /* read */
+ unsigned l : 1; /* local */
+ unsigned g : 1; /* global */
+ } cat7;
+ /* for meta-instructions, just used to hold extra data
+ * before instruction scheduling, etc
+ */
+ struct {
+ int off; /* component/offset */
+ } fo;
+ struct {
+ struct ir3_block *block;
+ } inout;
+ };
+
+ /* transient values used during various algorithms: */
+ union {
+ /* The instruction depth is the max dependency distance to output.
+ *
+ * You can also think of it as the "cost", if we did any sort of
+ * optimization for register footprint. Ie. a value that is just
+ * result of moving a const to a reg would have a low cost, so
+ * it could make sense to duplicate the instruction at various
+ * points where the result is needed to reduce register footprint.
+ */
+ unsigned depth;
+ /* When we get to the RA stage, we no longer need depth, but
+ * we do need instruction's position/name:
+ */
+ struct {
+ uint16_t ip;
+ uint16_t name;
+ };
+ };
+
+ /* used for per-pass extra instruction data.
+ */
+ void *data;
+
+ /* Used during CP and RA stages. For fanin and shader inputs/
+ * outputs where we need a sequence of consecutive registers,
+ * keep track of each src instructions left (ie 'n-1') and right
+ * (ie 'n+1') neighbor. The front-end must insert enough mov's
+ * to ensure that each instruction has at most one left and at
+ * most one right neighbor. During the copy-propagation pass,
+ * we only remove mov's when we can preserve this constraint.
+ * And during the RA stage, we use the neighbor information to
+ * allocate a block of registers in one shot.
+ *
+ * TODO: maybe just add something like:
+ * struct ir3_instruction_ref {
+ * struct ir3_instruction *instr;
+ * unsigned cnt;
+ * }
+ *
+ * Or can we get away without the refcnt stuff? It seems like
+ * it should be overkill.. the problem is if, potentially after
+ * already eliminating some mov's, if you have a single mov that
+ * needs to be grouped with its neighbors in two different
+ * places (ex. shader output and a fanin).
+ */
+ struct {
+ struct ir3_instruction *left, *right;
+ uint16_t left_cnt, right_cnt;
+ } cp;
+
+ /* an instruction can reference at most one address register amongst
+ * its src/dst registers. Beyond that, you need to insert mov's.
+ *
+ * NOTE: do not write this directly, use ir3_instr_set_address()
+ */
+ struct ir3_instruction *address;
+
+ /* Tracking for additional dependent instructions. Used to handle
+ * barriers, WAR hazards for arrays/SSBOs/etc.
+ */
+ DECLARE_ARRAY(struct ir3_instruction *, deps);
+
+ /*
+ * From PoV of instruction scheduling, not execution (ie. ignores global/
+ * local distinction):
+ * shared image atomic SSBO everything
+ * barrier()/ - R/W R/W R/W R/W X
+ * groupMemoryBarrier()
+ * memoryBarrier() - R/W R/W
+ * (but only images declared coherent?)
+ * memoryBarrierAtomic() - R/W
+ * memoryBarrierBuffer() - R/W
+ * memoryBarrierImage() - R/W
+ * memoryBarrierShared() - R/W
+ *
+ * TODO I think for SSBO/image/shared, in cases where we can determine
+ * which variable is accessed, we don't need to care about accesses to
+ * different variables (unless declared coherent??)
+ */
+ enum {
+ IR3_BARRIER_EVERYTHING = 1 << 0,
+ IR3_BARRIER_SHARED_R = 1 << 1,
+ IR3_BARRIER_SHARED_W = 1 << 2,
+ IR3_BARRIER_IMAGE_R = 1 << 3,
+ IR3_BARRIER_IMAGE_W = 1 << 4,
+ IR3_BARRIER_BUFFER_R = 1 << 5,
+ IR3_BARRIER_BUFFER_W = 1 << 6,
+ IR3_BARRIER_ARRAY_R = 1 << 7,
+ IR3_BARRIER_ARRAY_W = 1 << 8,
+ } barrier_class, barrier_conflict;
+
+ /* Entry in ir3_block's instruction list: */
+ struct list_head node;
+
+ int use_count; /* currently just updated/used by cp */
+
+#ifdef DEBUG
+ uint32_t serialno;
+#endif
+};
+
+/* Follow the cp.left links from 'instr' and return the left-most
+ * neighbor.  The walk is capped (with a debug assert) after more than
+ * 0xffff steps to guard against a cycle.
+ */
+static inline struct ir3_instruction *
+ir3_neighbor_first(struct ir3_instruction *instr)
+{
+	int steps;
+
+	for (steps = 1; instr->cp.left; steps++) {
+		instr = instr->cp.left;
+		if (steps > 0xffff) {
+			debug_assert(0);
+			break;
+		}
+	}
+
+	return instr;
+}
+
+/* Count the instructions in a neighbor chain, starting from its
+ * left-most member (asserted in debug builds) and following cp.right.
+ * Counting stops (with a debug assert) once the count exceeds 0xffff.
+ */
+static inline int ir3_neighbor_count(struct ir3_instruction *instr)
+{
+	int total = 1;
+
+	debug_assert(!instr->cp.left);
+
+	while (instr->cp.right) {
+		instr = instr->cp.right;
+		if (++total > 0xffff) {
+			debug_assert(0);
+			break;
+		}
+	}
+
+	return total;
+}
+/* Top-level container for one shader: its blocks, its arrays and a few per-shader instruction tracking lists. */
+struct ir3 {
+ struct ir3_compiler *compiler;
+
+ unsigned ninputs, noutputs;
+ struct ir3_instruction **inputs;
+ struct ir3_instruction **outputs;
+
+ /* Track bary.f (and ldlv) instructions.. this is needed in
+ * scheduling to ensure that all varying fetches happen before
+ * any potential kill instructions. The hw gets grumpy if all
+ * threads in a group are killed before the last bary.f gets
+ * a chance to signal end of input (ei).
+ */
+ DECLARE_ARRAY(struct ir3_instruction *, baryfs);
+
+ /* Track all indirect instructions (read and write). To avoid
+ * deadlock scenario where an address register gets scheduled,
+ * but other dependent src instructions cannot be scheduled due
+ * to dependency on a *different* address register value, the
+ * scheduler needs to ensure that all dependencies of the
+ * instruction other than the address register are scheduled
+ * before the one that writes the address register. Having a
+ * convenient list of instructions that reference some address
+ * register simplifies this.
+ */
+ DECLARE_ARRAY(struct ir3_instruction *, indirects);
+
+ /* and same for instructions that consume predicate register: */
+ DECLARE_ARRAY(struct ir3_instruction *, predicates);
+
+ /* Track texture sample instructions which need texture state
+ * patched in (for astc-srgb workaround):
+ */
+ DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);
+
+ /* List of blocks: */
+ struct list_head block_list;
+
+ /* List of ir3_array's: */
+ struct list_head array_list;
+
+#ifdef DEBUG
+ unsigned block_count, instr_count; /* last serial numbers handed out to blocks/instructions */
+#endif
+};
+/* An array tracked by the shader; linked into ir3::array_list and looked up by 'id' via ir3_lookup_array(). */
+struct ir3_array {
+ struct list_head node;
+ unsigned length;
+ unsigned id;
+
+ struct nir_register *r;
+
+ /* To avoid array write's from getting DCE'd, keep track of the
+ * most recent write. Any array access depends on the most
+ * recent write. This way, nothing depends on writes after the
+ * last read. But all the writes that happen before that have
+ * something depending on them
+ */
+ struct ir3_instruction *last_write;
+
+ /* extra stuff used in RA pass: */
+ unsigned base; /* base vreg name */
+ unsigned reg; /* base physical reg */
+ uint16_t start_ip, end_ip;
+};
+
+struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
+/* A block of instructions, linked into ir3::block_list through 'node'. */
+struct ir3_block {
+ struct list_head node;
+ struct ir3 *shader;
+
+ const struct nir_block *nblock;
+
+ struct list_head instr_list; /* list of ir3_instruction */
+
+ /* each block has either one or two successors.. in case of
+ * two successors, 'condition' decides which one to follow.
+ * A block preceding an if/else has two successors.
+ */
+ struct ir3_instruction *condition;
+ struct ir3_block *successors[2];
+
+ unsigned predecessors_count;
+ struct ir3_block **predecessors;
+
+ uint16_t start_ip, end_ip; /* set by ir3_count_instructions() */
+
+ /* Track instructions which do not write a register but other-
+ * wise must not be discarded (such as kill, stg, etc)
+ */
+ DECLARE_ARRAY(struct ir3_instruction *, keeps);
+
+ /* used for per-pass extra block data. Mainly used right
+ * now in RA step to track livein/liveout.
+ */
+ void *data;
+
+#ifdef DEBUG
+ uint32_t serialno;
+#endif
+};
+
+/* Return a 32-bit identifier for 'block': its serial number in DEBUG
+ * builds, otherwise the low 32 bits of its address.
+ */
+static inline uint32_t
+block_id(struct ir3_block *block)
+{
+#ifdef DEBUG
+	return block->serialno;
+#else
+	/* go through uintptr_t: 'unsigned long' is narrower than a
+	 * pointer on LLP64 targets
+	 */
+	return (uint32_t)(uintptr_t)block;
+#endif
+}
+
+struct ir3 * ir3_create(struct ir3_compiler *compiler,
+ unsigned nin, unsigned nout);
+void ir3_destroy(struct ir3 *shader);
+void * ir3_assemble(struct ir3 *shader,
+ struct ir3_info *info, uint32_t gpu_id);
+void * ir3_alloc(struct ir3 *shader, int sz);
+
+struct ir3_block * ir3_block_create(struct ir3 *shader);
+
+struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc);
+struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
+ opc_t opc, int nreg);
+struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
+void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep);
+const char *ir3_instr_name(struct ir3_instruction *instr);
+
+struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
+ int num, int flags);
+struct ir3_register * ir3_reg_clone(struct ir3 *shader,
+ struct ir3_register *reg);
+
+void ir3_instr_set_address(struct ir3_instruction *instr,
+ struct ir3_instruction *addr);
+
+/* Test-and-set of the visited mark: returns true if IR3_INSTR_MARK was
+ * already set on the instruction, otherwise sets it and returns false.
+ */
+static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
+{
+	const bool visited = !!(instr->flags & IR3_INSTR_MARK);
+
+	if (!visited)
+		instr->flags |= IR3_INSTR_MARK;
+
+	return visited;
+}
+
+void ir3_block_clear_mark(struct ir3_block *block);
+void ir3_clear_mark(struct ir3 *shader);
+
+unsigned ir3_count_instructions(struct ir3 *ir);
+
+/* Returns the index of 'reg' within instr->regs[], or -1 if the
+ * instruction does not reference that register object.
+ */
+static inline int ir3_instr_regno(struct ir3_instruction *instr,
+		struct ir3_register *reg)
+{
+	unsigned idx = 0;
+
+	while (idx < instr->regs_count) {
+		if (instr->regs[idx] == reg)
+			return idx;
+		idx++;
+	}
+
+	return -1;
+}
+
+
+#define MAX_ARRAYS 16
+
+/* Pack a register number and a component into one register id: the
+ * component occupies the low two bits, the register number the bits
+ * above them.
+ *
+ * comp:
+ *   0 - x
+ *   1 - y
+ *   2 - z
+ *   3 - w
+ *
+ * The arithmetic is done on unsigned values so the shift is well
+ * defined for every 'num' (left-shifting a negative int is undefined
+ * behavior); results for non-negative inputs are unchanged.
+ */
+static inline uint32_t regid(int num, int comp)
+{
+	return ((uint32_t)num << 2) | ((uint32_t)comp & 0x3);
+}
+
+/* Register-number part of reg->num, i.e. everything above the two
+ * component bits (see regid()).
+ */
+static inline uint32_t reg_num(struct ir3_register *reg)
+{
+	const uint32_t number = reg->num >> 2;
+
+	return number;
+}
+
+/* Component part (0..3) of reg->num, i.e. its low two bits (see
+ * regid()).
+ */
+static inline uint32_t reg_comp(struct ir3_register *reg)
+{
+	const uint32_t component = reg->num & 0x3;
+
+	return component;
+}
+
+/* True when the instruction's opcode belongs to category 0. */
+static inline bool is_flow(struct ir3_instruction *instr)
+{
+	if (opc_cat(instr->opc) == 0)
+		return true;
+	return false;
+}
+
+/* True when the instruction is a kill (OPC_KILL). */
+static inline bool is_kill(struct ir3_instruction *instr)
+{
+	if (instr->opc != OPC_KILL)
+		return false;
+	return true;
+}
+
+/* True when the instruction is a nop (OPC_NOP). */
+static inline bool is_nop(struct ir3_instruction *instr)
+{
+	return (instr->opc == OPC_NOP) ? true : false;
+}
+
+/* Returns true for a mov that does not change type, or for an
+ * absneg.s/absneg.f without saturate (which for the most part can be
+ * treated as a single-source mov).  Instructions whose destination is
+ * a0.x or p0.x, or is relative/array, are never considered plain movs.
+ */
+static inline bool is_same_type_mov(struct ir3_instruction *instr)
+{
+	struct ir3_register *dst;
+
+	if (instr->opc == OPC_MOV) {
+		if (instr->cat1.src_type != instr->cat1.dst_type)
+			return false;
+	} else if ((instr->opc == OPC_ABSNEG_F) ||
+			(instr->opc == OPC_ABSNEG_S)) {
+		if (instr->flags & IR3_INSTR_SAT)
+			return false;
+	} else {
+		return false;
+	}
+
+	dst = instr->regs[0];
+
+	/* writes to p0.x or a0.x are special: */
+	if ((dst->num == regid(REG_P0, 0)) || (dst->num == regid(REG_A0, 0)))
+		return false;
+
+	return !(dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY));
+}
+
+/* True when the instruction's opcode category is 1, 2 or 3. */
+static inline bool is_alu(struct ir3_instruction *instr)
+{
+	switch (opc_cat(instr->opc)) {
+	case 1:
+	case 2:
+	case 3:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/* True when the instruction's opcode belongs to category 4. */
+static inline bool is_sfu(struct ir3_instruction *instr)
+{
+	if (opc_cat(instr->opc) != 4)
+		return false;
+	return true;
+}
+
+/* True when the instruction's opcode belongs to category 5. */
+static inline bool is_tex(struct ir3_instruction *instr)
+{
+	if (opc_cat(instr->opc) == 5)
+		return true;
+	return false;
+}
+
+/* True when the instruction's opcode belongs to category 6. */
+static inline bool is_mem(struct ir3_instruction *instr)
+{
+	if (opc_cat(instr->opc) != 6)
+		return false;
+	return true;
+}
+
+/* True when the instruction's opcode belongs to category 7. */
+static inline bool is_barrier(struct ir3_instruction *instr)
+{
+	return (opc_cat(instr->opc) == 7) ? true : false;
+}
+
+/* True for the store-like opcodes.  For these instructions the
+ * "destination" register is actually a source: the address to store
+ * to.
+ */
+static inline bool
+is_store(struct ir3_instruction *instr)
+{
+	bool store;
+
+	switch (instr->opc) {
+	case OPC_STG:
+	case OPC_STGB:
+	case OPC_STIB:
+	case OPC_STP:
+	case OPC_STL:
+	case OPC_STLW:
+	case OPC_L2G:
+	case OPC_G2L:
+		store = true;
+		break;
+	default:
+		store = false;
+		break;
+	}
+
+	return store;
+}
+
+/* True for the load-like opcodes listed below (the list is probably
+ * not exhaustive).
+ */
+static inline bool is_load(struct ir3_instruction *instr)
+{
+	bool load;
+
+	switch (instr->opc) {
+	case OPC_LDG:
+	case OPC_LDGB:
+	case OPC_LDL:
+	case OPC_LDP:
+	case OPC_L2G:
+	case OPC_LDLW:
+	case OPC_LDC:
+	case OPC_LDLV:
+		/* probably some others too.. */
+		load = true;
+		break;
+	default:
+		load = false;
+		break;
+	}
+
+	return load;
+}
+
+/* True for instructions that fetch a varying: bary.f, or ldlv (which
+ * in some cases is used to fetch a varying without interpolation).
+ * Fortunately inloc is the first src register in either case.
+ */
+static inline bool is_input(struct ir3_instruction *instr)
+{
+	return (instr->opc == OPC_LDLV) || (instr->opc == OPC_BARY_F);
+}
+
+/* True for the cmps.f / cmps.s / cmps.u compare opcodes. */
+static inline bool is_bool(struct ir3_instruction *instr)
+{
+	return (instr->opc == OPC_CMPS_F) ||
+		(instr->opc == OPC_CMPS_S) ||
+		(instr->opc == OPC_CMPS_U);
+}
+
+/* True when the instruction's opcode category is -1.
+ *
+ * TODO how should we count PHI (and maybe fan-in/out) which might
+ * actually contribute some instructions to the final result?
+ */
+static inline bool is_meta(struct ir3_instruction *instr)
+{
+	if (opc_cat(instr->opc) != -1)
+		return false;
+	return true;
+}
+
+/* True when the instruction has a first register (regs[0], the
+ * destination) whose register number is REG_A0.
+ */
+static inline bool writes_addr(struct ir3_instruction *instr)
+{
+	if (!(instr->regs_count > 0))
+		return false;
+
+	return reg_num(instr->regs[0]) == REG_A0;
+}
+
+/* True when the instruction has a first register (regs[0], the
+ * destination) whose register number is REG_P0.
+ */
+static inline bool writes_pred(struct ir3_instruction *instr)
+{
+	if (!(instr->regs_count > 0))
+		return false;
+
+	return reg_num(instr->regs[0]) == REG_P0;
+}
+
+/* Returns the instruction recorded in reg->instr when the register is
+ * flagged SSA or ARRAY, otherwise NULL.
+ */
+/* TODO better name */
+static inline struct ir3_instruction *ssa(struct ir3_register *reg)
+{
+	return (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) ? reg->instr : NULL;
+}
+
+/* Two instructions conflict when both are non-NULL and they are not
+ * the same instruction.
+ */
+static inline bool conflicts(struct ir3_instruction *a,
+		struct ir3_instruction *b)
+{
+	if (!a || !b)
+		return false;
+
+	return a != b;
+}
+
+/* True unless the register is a const or immediate, or its register
+ * number is REG_A0 or REG_P0.
+ */
+static inline bool reg_gpr(struct ir3_register *r)
+{
+	uint32_t num;
+
+	if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+		return false;
+
+	num = reg_num(r);
+
+	return !((num == REG_A0) || (num == REG_P0));
+}
+
+/* Maps a 32-bit type to its 16-bit counterpart; 16-bit types map to
+ * themselves.  Any other type trips the assert (and yields ~0 when
+ * asserts are compiled out).
+ */
+static inline type_t half_type(type_t type)
+{
+	switch (type) {
+	case TYPE_F16:
+	case TYPE_U16:
+	case TYPE_S16:
+		return type;
+	case TYPE_F32:
+		return TYPE_F16;
+	case TYPE_U32:
+		return TYPE_U16;
+	case TYPE_S32:
+		return TYPE_S16;
+	default:
+		break;
+	}
+
+	assert(0);
+	return ~0;
+}
+
+/* Returns true for the cat2 opcodes that can embed an immediate (ie.
+ * those which are not float, plus bary.f).
+ */
+static inline bool ir3_cat2_int(opc_t opc)
+{
+	bool can_embed;
+
+	switch (opc) {
+	case OPC_ADD_U:
+	case OPC_ADD_S:
+	case OPC_SUB_U:
+	case OPC_SUB_S:
+	case OPC_CMPS_U:
+	case OPC_CMPS_S:
+	case OPC_MIN_U:
+	case OPC_MIN_S:
+	case OPC_MAX_U:
+	case OPC_MAX_S:
+	case OPC_CMPV_U:
+	case OPC_CMPV_S:
+	case OPC_MUL_U:
+	case OPC_MUL_S:
+	case OPC_MULL_U:
+	case OPC_CLZ_S:
+	case OPC_ABSNEG_S:
+	case OPC_AND_B:
+	case OPC_OR_B:
+	case OPC_NOT_B:
+	case OPC_XOR_B:
+	case OPC_BFREV_B:
+	case OPC_CLZ_B:
+	case OPC_SHL_B:
+	case OPC_SHR_B:
+	case OPC_ASHR_B:
+	case OPC_MGEN_B:
+	case OPC_GETBIT_B:
+	case OPC_CBITS_B:
+	case OPC_BARY_F:
+		can_embed = true;
+		break;
+	default:
+		can_embed = false;
+		break;
+	}
+
+	return can_embed;
+}
+
+
+
+/* Returns the set of source-modifier flags (abs/neg/not) that are
+ * valid for a given cat2 opcode, or 0 if none are.
+ */
+static inline unsigned ir3_cat2_absneg(opc_t opc)
+{
+	unsigned valid = 0;
+
+	switch (opc) {
+	/* float ops (and bary.f) take float abs/neg: */
+	case OPC_ADD_F:
+	case OPC_MIN_F:
+	case OPC_MAX_F:
+	case OPC_MUL_F:
+	case OPC_SIGN_F:
+	case OPC_CMPS_F:
+	case OPC_ABSNEG_F:
+	case OPC_CMPV_F:
+	case OPC_FLOOR_F:
+	case OPC_CEIL_F:
+	case OPC_RNDNE_F:
+	case OPC_RNDAZ_F:
+	case OPC_TRUNC_F:
+	case OPC_BARY_F:
+		valid = IR3_REG_FABS | IR3_REG_FNEG;
+		break;
+
+	/* absneg.s is the only integer op taking integer abs/neg: */
+	case OPC_ABSNEG_S:
+		valid = IR3_REG_SABS | IR3_REG_SNEG;
+		break;
+
+	/* bitwise ops take bitwise-not: */
+	case OPC_AND_B:
+	case OPC_OR_B:
+	case OPC_NOT_B:
+	case OPC_XOR_B:
+	case OPC_BFREV_B:
+	case OPC_CLZ_B:
+	case OPC_SHL_B:
+	case OPC_SHR_B:
+	case OPC_ASHR_B:
+	case OPC_MGEN_B:
+	case OPC_GETBIT_B:
+	case OPC_CBITS_B:
+		valid = IR3_REG_BNOT;
+		break;
+
+	/* the integer add/sub/cmps/min/max/cmpv/mul/mull/clz.s ops, and
+	 * anything not listed above, take no modifiers:
+	 */
+	default:
+		break;
+	}
+
+	return valid;
+}
+
+/* Returns the set of source-modifier flags that are valid for a given
+ * cat3 opcode, or 0 if none are.
+ */
+static inline unsigned ir3_cat3_absneg(opc_t opc)
+{
+	switch (opc) {
+	case OPC_MAD_F16:
+	case OPC_MAD_F32:
+	case OPC_SEL_F16:
+	case OPC_SEL_F32:
+		return IR3_REG_FNEG;
+
+	default:
+		/* Everything else takes no modifiers.  That includes the
+		 * integer mad/madsh/sel/sad variants (neg *may* work on
+		 * 3rd src..) and the sel.b16/sel.b32 variants.
+		 */
+		return 0;
+	}
+}
+
+/* Mask with the low 'n' bits set.  The shift is done on a plain int,
+ * so the result is only well defined while n is smaller than the
+ * width of int minus one.
+ */
+#define MASK(n) ((1 << (n)) - 1)
+
+/* iterator for an instruction's sources (reg), also returns src #:
+ *
+ * Walks regs[1] .. regs[regs_count - 1] (regs[0] is skipped), assigning
+ * each non-NULL entry to __srcreg and running the following statement
+ * with __n as the zero-based source index.  __n is declared by the
+ * macro (as unsigned); __srcreg must be declared by the caller.  Does
+ * nothing when regs_count is zero.
+ *
+ * The expansion is an unbraced if/for/if chain, so an 'else' placed
+ * right after the loop body binds to the macro's innermost 'if'.
+ */
+#define foreach_src_n(__srcreg, __n, __instr) \
+ if ((__instr)->regs_count) \
+ for (unsigned __cnt = (__instr)->regs_count - 1, __n = 0; __n < __cnt; __n++) \
+ if ((__srcreg = (__instr)->regs[__n + 1]))
+
+/* iterator for an instruction's sources (reg):
+ *
+ * Same as foreach_src_n() but with the source index hidden in a
+ * variable named __i.
+ */
+#define foreach_src(__srcreg, __instr) \
+ foreach_src_n(__srcreg, __i, __instr)
+
+/* Number of slots walked by foreach_ssa_src_n(): one per entry in
+ * regs[], one per entry in deps[], plus one more when the instruction
+ * has an address.
+ */
+static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
+{
+	unsigned total = instr->regs_count + instr->deps_count;
+
+	return instr->address ? total + 1 : total;
+}
+
+/* Returns the n'th SSA source of an instruction (may be NULL):
+ *   n <  regs_count               -> ssa() of regs[n]
+ *   n <  regs_count + deps_count  -> deps[n - regs_count]
+ *   n == regs_count + deps_count  -> the address instruction
+ */
+static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
+{
+	struct ir3_instruction *src;
+
+	if (n == (instr->regs_count + instr->deps_count))
+		src = instr->address;
+	else if (n >= instr->regs_count)
+		src = instr->deps[n - instr->regs_count];
+	else
+		src = ssa(instr->regs[n]);
+
+	return src;
+}
+
+/* True when SSA source slot 'n' (numbered as in __ssa_src_n()) refers
+ * to an entry of deps[] rather than to a register or the address.
+ */
+static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
+{
+	return (n != (instr->regs_count + instr->deps_count)) &&
+		(n >= instr->regs_count);
+}
+
+/* regs_count minus one (ie. not counting regs[0]), plus one if the
+ * instruction has an address.  Note that __instr is evaluated more
+ * than once.
+ */
+#define __src_cnt(__instr) ((__instr)->address ? (__instr)->regs_count : (__instr)->regs_count - 1)
+
+/* iterator for an instruction's SSA sources (instr), also returns src #:
+ *
+ * Walks every slot counted by __ssa_src_cnt(), assigning each non-NULL
+ * result of __ssa_src_n() to __srcinst.  __n is declared by the macro
+ * (as unsigned); __srcinst must be declared by the caller.  The
+ * expansion ends in an unbraced 'if'.
+ */
+#define foreach_ssa_src_n(__srcinst, __n, __instr) \
+ for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
+ if ((__srcinst = __ssa_src_n(__instr, __n)))
+
+/* iterator for an instruction's SSA sources (instr):
+ *
+ * Same as foreach_ssa_src_n() but with the source index hidden in a
+ * variable named __i.
+ */
+#define foreach_ssa_src(__srcinst, __instr) \
+ foreach_ssa_src_n(__srcinst, __i, __instr)
+
+
+/* dump: */
+void ir3_print(struct ir3 *ir);
+void ir3_print_instr(struct ir3_instruction *instr);
+
+/* depth calculation: */
+int ir3_delayslots(struct ir3_instruction *assigner,
+ struct ir3_instruction *consumer, unsigned n);
+void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
+void ir3_depth(struct ir3 *ir);
+
+/* copy-propagate: */
+struct ir3_shader_variant;
+void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
+
+/* group neighbors and insert mov's to resolve conflicts: */
+void ir3_group(struct ir3 *ir);
+
+/* scheduling: */
+void ir3_sched_add_deps(struct ir3 *ir);
+int ir3_sched(struct ir3 *ir);
+
+/* register assignment: */
+struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler);
+int ir3_ra(struct ir3 *ir3, gl_shader_stage type,
+ bool frag_coord, bool frag_face);
+
+/* legalize: */
+void ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary);
+
+/* ************************************************************************* */
+/* instruction helpers */
+
+/* Appends an SSA source register to 'instr' that refers to 'src'.  The
+ * new register inherits half precision from src's first register
+ * (regs[0]), in addition to whatever 'flags' the caller passes.
+ */
+static inline struct ir3_register * __ssa_src(struct ir3_instruction *instr,
+		struct ir3_instruction *src, unsigned flags)
+{
+	const unsigned half =
+		(src->regs[0]->flags & IR3_REG_HALF) ? IR3_REG_HALF : 0;
+	struct ir3_register *new_src =
+		ir3_reg_create(instr, 0, IR3_REG_SSA | (flags | half));
+
+	new_src->instr = src;
+
+	return new_src;
+}
+
+/* Creates a same-type mov of 'src' in 'block': both cat1 src_type and
+ * dst_type are set to 'type'.  When src's first register is an array
+ * register, the new source is flagged ARRAY and copies its array info.
+ */
+static inline struct ir3_instruction *
+ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
+{
+	struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV);
+	struct ir3_register *mov_src;
+
+	ir3_reg_create(mov, 0, 0);   /* dst */
+
+	if (!(src->regs[0]->flags & IR3_REG_ARRAY)) {
+		__ssa_src(mov, src, 0);
+	} else {
+		mov_src = __ssa_src(mov, src, IR3_REG_ARRAY);
+		mov_src->array = src->regs[0]->array;
+	}
+
+	debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV));
+
+	mov->cat1.dst_type = type;
+	mov->cat1.src_type = type;
+
+	return mov;
+}
+
+/* Creates a converting mov of 'src' in 'block', from src_type to
+ * dst_type.  The destination is flagged half when dst_type is narrower
+ * than 32 bits.  In debug builds, asserts that src's half-ness matches
+ * src_type and that src is not an array register.
+ */
+static inline struct ir3_instruction *
+ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
+		type_t src_type, type_t dst_type)
+{
+	struct ir3_instruction *cov = ir3_instr_create(block, OPC_MOV);
+	unsigned want_dst = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
+	unsigned want_src = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;
+
+	debug_assert((src->regs[0]->flags & IR3_REG_HALF) == want_src);
+
+	ir3_reg_create(cov, 0, want_dst);   /* dst */
+	__ssa_src(cov, src, 0);
+
+	cov->cat1.src_type = src_type;
+	cov->cat1.dst_type = dst_type;
+
+	debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY));
+
+	return cov;
+}
+
+/* Creates a nop instruction (no registers) in 'block'. */
+static inline struct ir3_instruction *
+ir3_NOP(struct ir3_block *block)
+{
+	struct ir3_instruction *nop = ir3_instr_create(block, OPC_NOP);
+
+	return nop;
+}
+
+/* Defines ir3_<name>(block): creates an OPC_<name> instruction in
+ * 'block' without adding any registers to it.
+ */
+#define INSTR0(name) \
+static inline struct ir3_instruction * \
+ir3_##name(struct ir3_block *block) \
+{ \
+ struct ir3_instruction *instr = \
+ ir3_instr_create(block, OPC_##name); \
+ return instr; \
+}
+
+/* Defines ir3_<name>(block, a, aflags): creates an OPC_<name>
+ * instruction with a dst register and one SSA source referring to 'a'
+ * (created via __ssa_src() with 'aflags').
+ */
+#define INSTR1(name) \
+static inline struct ir3_instruction * \
+ir3_##name(struct ir3_block *block, \
+ struct ir3_instruction *a, unsigned aflags) \
+{ \
+ struct ir3_instruction *instr = \
+ ir3_instr_create(block, OPC_##name); \
+ ir3_reg_create(instr, 0, 0); /* dst */ \
+ __ssa_src(instr, a, aflags); \
+ return instr; \
+}
+
+/* Defines ir3_<name>(block, a, aflags, b, bflags): creates an
+ * OPC_<name> instruction with a dst register and two SSA sources, in
+ * the order a, b.
+ */
+#define INSTR2(name) \
+static inline struct ir3_instruction * \
+ir3_##name(struct ir3_block *block, \
+ struct ir3_instruction *a, unsigned aflags, \
+ struct ir3_instruction *b, unsigned bflags) \
+{ \
+ struct ir3_instruction *instr = \
+ ir3_instr_create(block, OPC_##name); \
+ ir3_reg_create(instr, 0, 0); /* dst */ \
+ __ssa_src(instr, a, aflags); \
+ __ssa_src(instr, b, bflags); \
+ return instr; \
+}
+
+/* Defines ir3_<name>(block, a, aflags, b, bflags, c, cflags): creates
+ * an OPC_<name> instruction with a dst register and three SSA sources,
+ * in the order a, b, c.
+ */
+#define INSTR3(name) \
+static inline struct ir3_instruction * \
+ir3_##name(struct ir3_block *block, \
+ struct ir3_instruction *a, unsigned aflags, \
+ struct ir3_instruction *b, unsigned bflags, \
+ struct ir3_instruction *c, unsigned cflags) \
+{ \
+ struct ir3_instruction *instr = \
+ ir3_instr_create(block, OPC_##name); \
+ ir3_reg_create(instr, 0, 0); /* dst */ \
+ __ssa_src(instr, a, aflags); \
+ __ssa_src(instr, b, bflags); \
+ __ssa_src(instr, c, cflags); \
+ return instr; \
+}
+
+/* Defines ir3_<name>(block, a, aflags, ..., d, dflags): creates an
+ * OPC_<name> instruction with a dst register and four SSA sources, in
+ * the order a, b, c, d.  Unlike the smaller variants this goes through
+ * ir3_instr_create2(), asking for 5 registers (dst + 4 sources).
+ */
+#define INSTR4(name) \
+static inline struct ir3_instruction * \
+ir3_##name(struct ir3_block *block, \
+ struct ir3_instruction *a, unsigned aflags, \
+ struct ir3_instruction *b, unsigned bflags, \
+ struct ir3_instruction *c, unsigned cflags, \
+ struct ir3_instruction *d, unsigned dflags) \
+{ \
+ struct ir3_instruction *instr = \
+ ir3_instr_create2(block, OPC_##name, 5); \
+ ir3_reg_create(instr, 0, 0); /* dst */ \
+ __ssa_src(instr, a, aflags); \
+ __ssa_src(instr, b, bflags); \
+ __ssa_src(instr, c, cflags); \
+ __ssa_src(instr, d, dflags); \
+ return instr; \
+}
+
+/* Defines ir3_<name>_<f>(block, a, aflags, ..., d, dflags): same as
+ * INSTR4(), but the generated function name gets an _<f> suffix and
+ * the instruction additionally has IR3_INSTR_<f> set in its flags.
+ */
+#define INSTR4F(f, name) \
+static inline struct ir3_instruction * \
+ir3_##name##_##f(struct ir3_block *block, \
+ struct ir3_instruction *a, unsigned aflags, \
+ struct ir3_instruction *b, unsigned bflags, \
+ struct ir3_instruction *c, unsigned cflags, \
+ struct ir3_instruction *d, unsigned dflags) \
+{ \
+ struct ir3_instruction *instr = \
+ ir3_instr_create2(block, OPC_##name, 5); \
+ ir3_reg_create(instr, 0, 0); /* dst */ \
+ __ssa_src(instr, a, aflags); \
+ __ssa_src(instr, b, bflags); \
+ __ssa_src(instr, c, cflags); \
+ __ssa_src(instr, d, dflags); \
+ instr->flags |= IR3_INSTR_##f; \
+ return instr; \
+}
+
+/* cat0 instructions: */
+INSTR0(BR)
+INSTR0(JUMP)
+INSTR1(KILL)
+INSTR0(END)
+
+/* cat2 instructions, most 2 src but some 1 src: */
+INSTR2(ADD_F)
+INSTR2(MIN_F)
+INSTR2(MAX_F)
+INSTR2(MUL_F)
+INSTR1(SIGN_F)
+INSTR2(CMPS_F)
+INSTR1(ABSNEG_F)
+INSTR2(CMPV_F)
+INSTR1(FLOOR_F)
+INSTR1(CEIL_F)
+INSTR1(RNDNE_F)
+INSTR1(RNDAZ_F)
+INSTR1(TRUNC_F)
+INSTR2(ADD_U)
+INSTR2(ADD_S)
+INSTR2(SUB_U)
+INSTR2(SUB_S)
+INSTR2(CMPS_U)
+INSTR2(CMPS_S)
+INSTR2(MIN_U)
+INSTR2(MIN_S)
+INSTR2(MAX_U)
+INSTR2(MAX_S)
+INSTR1(ABSNEG_S)
+INSTR2(AND_B)
+INSTR2(OR_B)
+INSTR1(NOT_B)
+INSTR2(XOR_B)
+INSTR2(CMPV_U)
+INSTR2(CMPV_S)
+INSTR2(MUL_U)
+INSTR2(MUL_S)
+INSTR2(MULL_U)
+INSTR1(BFREV_B)
+INSTR1(CLZ_S)
+INSTR1(CLZ_B)
+INSTR2(SHL_B)
+INSTR2(SHR_B)
+INSTR2(ASHR_B)
+INSTR2(BARY_F)
+INSTR2(MGEN_B)
+INSTR2(GETBIT_B)
+INSTR1(SETRM)
+INSTR1(CBITS_B)
+INSTR2(SHB)
+INSTR2(MSAD)
+
+/* cat3 instructions: */
+INSTR3(MAD_U16)
+INSTR3(MADSH_U16)
+INSTR3(MAD_S16)
+INSTR3(MADSH_M16)
+INSTR3(MAD_U24)
+INSTR3(MAD_S24)
+INSTR3(MAD_F16)
+INSTR3(MAD_F32)
+INSTR3(SEL_B16)
+INSTR3(SEL_B32)
+INSTR3(SEL_S16)
+INSTR3(SEL_S32)
+INSTR3(SEL_F16)
+INSTR3(SEL_F32)
+INSTR3(SAD_S16)
+INSTR3(SAD_S32)
+
+/* cat4 instructions: */
+INSTR1(RCP)
+INSTR1(RSQ)
+INSTR1(LOG2)
+INSTR1(EXP2)
+INSTR1(SIN)
+INSTR1(COS)
+INSTR1(SQRT)
+
+/* cat5 instructions: */
+INSTR1(DSX)
+INSTR1(DSY)
+
+/* Creates a cat5 instruction with opcode 'opc' in 'block'.
+ *
+ * The dst register gets 'wrmask'.  Each of src0/src1 that is non-NULL
+ * is added (in that order) as an SSA source whose wrmask has one bit
+ * per register of the source instruction beyond its first.  'flags'
+ * are or'd into the instruction flags; samp/tex/type are stored in the
+ * cat5 fields.
+ */
+static inline struct ir3_instruction *
+ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
+		unsigned wrmask, unsigned flags, unsigned samp, unsigned tex,
+		struct ir3_instruction *src0, struct ir3_instruction *src1)
+{
+	struct ir3_instruction *srcs[2] = { src0, src1 };
+	struct ir3_instruction *sam = ir3_instr_create(block, opc);
+
+	sam->flags |= flags;
+	ir3_reg_create(sam, 0, 0)->wrmask = wrmask;
+
+	for (unsigned n = 0; n < 2; n++) {
+		struct ir3_register *reg;
+
+		if (!srcs[n])
+			continue;
+
+		reg = ir3_reg_create(sam, 0, IR3_REG_SSA);
+		reg->wrmask = (1 << (srcs[n]->regs_count - 1)) - 1;
+		reg->instr = srcs[n];
+	}
+
+	sam->cat5.type = type;
+	sam->cat5.samp = samp;
+	sam->cat5.tex = tex;
+
+	return sam;
+}
+
+/* cat6 instructions: */
+INSTR2(LDLV)
+INSTR2(LDG)
+INSTR2(LDL)
+INSTR3(STG)
+INSTR3(STL)
+INSTR3(LDGB)
+INSTR4(STGB)
+INSTR4(STIB)
+INSTR1(RESINFO)
+INSTR1(RESFMT)
+INSTR2(ATOMIC_ADD)
+INSTR2(ATOMIC_SUB)
+INSTR2(ATOMIC_XCHG)
+INSTR2(ATOMIC_INC)
+INSTR2(ATOMIC_DEC)
+INSTR2(ATOMIC_CMPXCHG)
+INSTR2(ATOMIC_MIN)
+INSTR2(ATOMIC_MAX)
+INSTR2(ATOMIC_AND)
+INSTR2(ATOMIC_OR)
+INSTR2(ATOMIC_XOR)
+INSTR4F(G, ATOMIC_ADD)
+INSTR4F(G, ATOMIC_SUB)
+INSTR4F(G, ATOMIC_XCHG)
+INSTR4F(G, ATOMIC_INC)
+INSTR4F(G, ATOMIC_DEC)
+INSTR4F(G, ATOMIC_CMPXCHG)
+INSTR4F(G, ATOMIC_MIN)
+INSTR4F(G, ATOMIC_MAX)
+INSTR4F(G, ATOMIC_AND)
+INSTR4F(G, ATOMIC_OR)
+INSTR4F(G, ATOMIC_XOR)
+
+/* cat7 instructions: */
+INSTR0(BAR)
+INSTR0(FENCE)
+
+/* ************************************************************************* */
+/* split this out or find some helper to use.. like main/bitset.h.. */
+
+#include <string.h>
+
+#define MAX_REG 256
+
+typedef uint8_t regmask_t[2 * MAX_REG / 8];
+
+/* Returns the bit index of a register within a regmask_t: the array
+ * offset for relative registers, reg->num otherwise, with half
+ * registers placed MAX_REG bits further up.
+ */
+static inline unsigned regmask_idx(struct ir3_register *reg)
+{
+	unsigned base;
+
+	if (reg->flags & IR3_REG_RELATIV)
+		base = reg->array.offset;
+	else
+		base = reg->num;
+
+	debug_assert(base < MAX_REG);
+
+	return (reg->flags & IR3_REG_HALF) ? base + MAX_REG : base;
+}
+
+/* Clears every bit of the regmask. */
+static inline void regmask_init(regmask_t *regmask)
+{
+	memset(*regmask, 0, sizeof(regmask_t));
+}
+
+/* Sets the bits covered by 'reg' in the regmask: reg->size consecutive
+ * bits for a relative register, otherwise one bit per set bit of
+ * reg->wrmask, starting at regmask_idx(reg).
+ */
+static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
+{
+	unsigned bit = regmask_idx(reg);
+
+	if (!(reg->flags & IR3_REG_RELATIV)) {
+		unsigned pending = reg->wrmask;
+
+		while (pending) {
+			if (pending & 1)
+				(*regmask)[bit / 8] |= 1 << (bit % 8);
+			pending >>= 1;
+			bit++;
+		}
+		return;
+	}
+
+	for (unsigned n = 0; n < reg->size; n++, bit++)
+		(*regmask)[bit / 8] |= 1 << (bit % 8);
+}
+
+/* dst = a | b, byte by byte.  dst may be the same mask as a or b. */
+static inline void regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
+{
+	for (unsigned byte = 0; byte < ARRAY_SIZE(*dst); byte++) {
+		const uint8_t merged = (*a)[byte] | (*b)[byte];
+
+		(*dst)[byte] = merged;
+	}
+}
+
+/* For every bit covered by 'reg' (see regmask_set()), set it in 'a'
+ * unless it is already set in 'b'.  Conceptually:
+ *   a |= (reg & ~b)
+ */
+static inline void regmask_set_if_not(regmask_t *a,
+		struct ir3_register *reg, regmask_t *b)
+{
+	unsigned bit = regmask_idx(reg);
+
+	if (!(reg->flags & IR3_REG_RELATIV)) {
+		unsigned pending = reg->wrmask;
+
+		while (pending) {
+			if ((pending & 1) && !((*b)[bit / 8] & (1 << (bit % 8))))
+				(*a)[bit / 8] |= 1 << (bit % 8);
+			pending >>= 1;
+			bit++;
+		}
+		return;
+	}
+
+	for (unsigned n = 0; n < reg->size; n++, bit++) {
+		if ((*b)[bit / 8] & (1 << (bit % 8)))
+			continue;
+		(*a)[bit / 8] |= 1 << (bit % 8);
+	}
+}
+
+/* Returns true if any bit covered by 'reg' (see regmask_set()) is set
+ * in the regmask.
+ */
+static inline bool regmask_get(regmask_t *regmask,
+		struct ir3_register *reg)
+{
+	unsigned bit = regmask_idx(reg);
+
+	if (!(reg->flags & IR3_REG_RELATIV)) {
+		unsigned pending = reg->wrmask;
+
+		while (pending) {
+			if ((pending & 1) && ((*regmask)[bit / 8] & (1 << (bit % 8))))
+				return true;
+			pending >>= 1;
+			bit++;
+		}
+		return false;
+	}
+
+	for (unsigned n = 0; n < reg->size; n++, bit++) {
+		if ((*regmask)[bit / 8] & (1 << (bit % 8)))
+			return true;
+	}
+
+	return false;
+}
+
+/* ************************************************************************* */
+
+#endif /* IR3_H_ */
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
new file mode 100644
index 00000000000..f00daebabf5
--- /dev/null
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2015 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include "util/ralloc.h"
+
+#include "ir3_compiler.h"
+
+/* Names accepted in the IR3_SHADER_DEBUG option, the ir3_shader_debug
+ * flag each one maps to, and its help text.
+ */
+static const struct debug_named_value shader_debug_options[] = {
+ {"vs", IR3_DBG_SHADER_VS, "Print shader disasm for vertex shaders"},
+ {"fs", IR3_DBG_SHADER_FS, "Print shader disasm for fragment shaders"},
+ {"cs", IR3_DBG_SHADER_CS, "Print shader disasm for compute shaders"},
+ {"disasm", IR3_DBG_DISASM, "Dump NIR and adreno shader disassembly"},
+ {"optmsgs", IR3_DBG_OPTMSGS,"Enable optimizer debug messages"},
+ DEBUG_NAMED_VALUE_END
+};
+
+DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG", shader_debug_options, 0)
+
+enum ir3_shader_debug ir3_shader_debug = 0;
+
+/* Allocates and initializes a compiler instance for the given device
+ * and GPU id.
+ *
+ * Also loads the IR3_SHADER_DEBUG flags into the global
+ * ir3_shader_debug, and picks the per-generation configuration: GPU
+ * ids >= 400 get the "a4xx and later" settings, everything else the
+ * a3xx ones.
+ *
+ * Returns the new compiler, or NULL if the allocation failed.
+ */
+struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
+{
+	struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
+
+	ir3_shader_debug = debug_get_option_ir3_shader_debug();
+
+	/* don't dereference a failed allocation: */
+	if (!compiler)
+		return NULL;
+
+	compiler->dev = dev;
+	compiler->gpu_id = gpu_id;
+	compiler->set = ir3_ra_alloc_reg_set(compiler);
+
+	if (compiler->gpu_id >= 400) {
+		/* need special handling for "flat" */
+		compiler->flat_bypass = true;
+		compiler->levels_add_one = false;
+		compiler->unminify_coords = false;
+		compiler->txf_ms_with_isaml = false;
+		compiler->array_index_add_half = true;
+	} else {
+		/* no special handling for "flat" */
+		compiler->flat_bypass = false;
+		compiler->levels_add_one = true;
+		compiler->unminify_coords = true;
+		compiler->txf_ms_with_isaml = true;
+		compiler->array_index_add_half = false;
+	}
+
+	return compiler;
+}
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
new file mode 100644
index 00000000000..e2336062b29
--- /dev/null
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#ifndef IR3_COMPILER_H_
+#define IR3_COMPILER_H_
+
+#include "ir3_shader.h"
+
+struct ir3_ra_reg_set;
+
+/* Compiler instance: the device it was created for plus the
+ * per-generation configuration chosen by ir3_compiler_create().
+ */
+struct ir3_compiler {
+ struct fd_device *dev;
+ /* GPU id; ir3_compiler_create() selects the a4xx-and-later settings
+ * below when this is >= 400.
+ */
+ uint32_t gpu_id;
+ /* register set obtained from ir3_ra_alloc_reg_set() at creation */
+ struct ir3_ra_reg_set *set;
+ /* NOTE(review): presumably a running count of shaders created with
+ * this compiler -- not used in this header, confirm at the users.
+ */
+ uint32_t shader_count;
+
+ /*
+ * Configuration options for things that are handled differently on
+ * different generations:
+ */
+
+ /* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate
+ * so we need to use ldlv.u32 to load the varying directly:
+ */
+ bool flat_bypass;
+
+ /* on a3xx, we need to add one to # of array levels:
+ */
+ bool levels_add_one;
+
+ /* on a3xx, we need to scale up integer coords for isaml based
+ * on LoD:
+ */
+ bool unminify_coords;
+
+ /* on a3xx do txf_ms w/ isaml and scaled coords: */
+ bool txf_ms_with_isaml;
+
+ /* on a4xx, for array textures we need to add 0.5 to the array
+ * index coordinate:
+ */
+ bool array_index_add_half;
+};
+
+struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id);
+
+int ir3_compile_shader_nir(struct ir3_compiler *compiler,
+ struct ir3_shader_variant *so);
+
+/* Bit flags stored in the global ir3_shader_debug (set from the
+ * IR3_SHADER_DEBUG option):
+ */
+enum ir3_shader_debug {
+ IR3_DBG_SHADER_VS = 0x01, /* print shader disasm for vertex shaders */
+ IR3_DBG_SHADER_FS = 0x02, /* print shader disasm for fragment shaders */
+ IR3_DBG_SHADER_CS = 0x04, /* print shader disasm for compute shaders */
+ IR3_DBG_DISASM = 0x08, /* dump NIR and adreno shader disassembly */
+ IR3_DBG_OPTMSGS = 0x10, /* enable optimizer debug messages */
+};
+
+extern enum ir3_shader_debug ir3_shader_debug;
+
+/* Returns true when the per-stage debug flag for the given shader
+ * stage (vertex, fragment or compute) is set in ir3_shader_debug.
+ * Any other stage trips a debug assert and returns false.
+ */
+static inline bool
+shader_debug_enabled(gl_shader_stage type)
+{
+	enum ir3_shader_debug stage_flag;
+
+	switch (type) {
+	case MESA_SHADER_VERTEX:
+		stage_flag = IR3_DBG_SHADER_VS;
+		break;
+	case MESA_SHADER_FRAGMENT:
+		stage_flag = IR3_DBG_SHADER_FS;
+		break;
+	case MESA_SHADER_COMPUTE:
+		stage_flag = IR3_DBG_SHADER_CS;
+		break;
+	default:
+		debug_assert(0);
+		return false;
+	}
+
+	return !!(ir3_shader_debug & stage_flag);
+}
+
+#endif /* IR3_COMPILER_H_ */
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
new file mode 100644
index 00000000000..445a2b291e9
--- /dev/null
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -0,0 +1,3818 @@
+/*
+ * Copyright (C) 2015 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include <stdarg.h>
+
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "ir3_compiler.h"
+#include "ir3_shader.h"
+#include "ir3_nir.h"
+
+#include "instr-a3xx.h"
+#include "ir3.h"
+
+/* for conditionally setting boolean flag(s): evaluates to 'val' when the
+ * first argument is true, and to 0 otherwise:
+ */
+#define COND(bool, val) ((bool) ? (val) : 0)
+
+/* debug print, prefixed with the calling function name and line number;
+ * a trailing newline is appended to the format:
+ */
+#define DBG(fmt, ...) \
+ do { debug_printf("%s:%d: "fmt "\n", \
+ __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
+
+/* State for translating one shader variant from NIR to ir3.  Allocated by
+ * compile_init() and released by compile_free().
+ */
+struct ir3_context {
+ struct ir3_compiler *compiler;
+
+ struct nir_shader *s;
+
+ struct nir_instr *cur_instr; /* current instruction, just for debug */
+
+ struct ir3 *ir;
+ struct ir3_shader_variant *so;
+
+ struct ir3_block *block; /* the current block */
+ struct ir3_block *in_block; /* block created for shader inputs */
+
+ nir_function_impl *impl;
+
+ /* For fragment shaders, varyings are not actual shader inputs,
+ * instead the hw passes a varying-coord which is used with
+ * bary.f.
+ *
+ * But NIR doesn't know that, it still declares varyings as
+ * inputs. So we do all the input tracking normally and fix
+ * things up after compile_instructions()
+ *
+ * NOTE that frag_vcoord is the hardware position (possibly it
+ * is actually an index or tag or some such.. it is *not*
+ * values that can be directly used for gl_FragCoord..)
+ */
+ struct ir3_instruction *frag_vcoord;
+
+ /* for fragment shaders, for gl_FrontFacing and gl_FragCoord: */
+ struct ir3_instruction *frag_face, *frag_coord;
+
+ /* For vertex shaders, keep track of the system values sources */
+ struct ir3_instruction *vertex_id, *basevertex, *instance_id;
+
+ /* For fragment shaders: */
+ struct ir3_instruction *samp_id, *samp_mask_in;
+
+ /* Compute shader inputs: */
+ struct ir3_instruction *local_invocation_id, *work_group_id;
+
+ /* mapping from nir_register to defining instruction: */
+ struct hash_table *def_ht;
+
+ unsigned num_arrays;
+
+ /* a common pattern for indirect addressing is to request the
+ * same address register multiple times. To avoid generating
+ * duplicate instruction sequences (which our backend does not
+ * try to clean up, since that should be done at the NIR stage)
+ * we cache the address value generated for a given src value:
+ *
+ * Note that we have to cache these per alignment, since same
+ * src used for an array of vec1 cannot be also used for an
+ * array of vec4.
+ */
+ struct hash_table *addr_ht[4];
+
+ /* last dst array, for indirect we need to insert a var-store.
+ */
+ struct ir3_instruction **last_dst;
+ unsigned last_dst_n;
+
+ /* maps nir_block to ir3_block, mostly for the purposes of
+ * figuring out the block's successors
+ */
+ struct hash_table *block_ht;
+
+ /* on a4xx, bitmask of samplers which need astc+srgb workaround: */
+ unsigned astc_srgb;
+
+ unsigned samples; /* bitmask of x,y sample shifts */
+
+ unsigned max_texture_index;
+
+ /* set if we encounter something we can't handle yet, so we
+ * can bail cleanly and fallback to TGSI compiler f/e
+ */
+ bool error;
+};
+
+/* Size of a gpu pointer, counted in 32bit registers/slots: two slots when
+ * gpu_id is 500 or above, a single slot otherwise.
+ */
+static unsigned pointer_size(struct ir3_context *ctx)
+{
+	if (ctx->compiler->gpu_id >= 500)
+		return 2;
+
+	return 1;
+}
+
+static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
+static struct ir3_block * get_block(struct ir3_context *ctx, const nir_block *nblock);
+
+
+/* Allocate and initialize the context used to compile shader variant 'so'.
+ *
+ * Picks the NIR shader to compile (a key-lowered clone, or the shader's own
+ * NIR when the key lowers nothing), runs the final NIR passes, optionally
+ * dumps the NIR for debugging, and lays out the constant register file in
+ * so->constbase.
+ *
+ * Returns the new context, to be released with compile_free().
+ */
+static struct ir3_context *
+compile_init(struct ir3_compiler *compiler,
+		struct ir3_shader_variant *so)
+{
+	struct ir3_context *ctx = rzalloc(NULL, struct ir3_context);
+
+	if (compiler->gpu_id >= 400) {
+		if (so->type == MESA_SHADER_VERTEX) {
+			ctx->astc_srgb = so->key.vastc_srgb;
+		} else if (so->type == MESA_SHADER_FRAGMENT) {
+			ctx->astc_srgb = so->key.fastc_srgb;
+		}
+
+	} else {
+		if (so->type == MESA_SHADER_VERTEX) {
+			ctx->samples = so->key.vsamples;
+		} else if (so->type == MESA_SHADER_FRAGMENT) {
+			ctx->samples = so->key.fsamples;
+		}
+	}
+
+	ctx->compiler = compiler;
+	ctx->so = so;
+	ctx->def_ht = _mesa_hash_table_create(ctx,
+			_mesa_hash_pointer, _mesa_key_pointer_equal);
+	ctx->block_ht = _mesa_hash_table_create(ctx,
+			_mesa_hash_pointer, _mesa_key_pointer_equal);
+
+	/* TODO: maybe generate some sort of bitmask of what key
+	 * lowers vs what shader has (ie. no need to lower
+	 * texture clamp lowering if no texture sample instrs)..
+	 * although should be done further up the stack to avoid
+	 * creating duplicate variants..
+	 */
+
+	if (ir3_key_lowers_nir(&so->key)) {
+		/* the clone is allocated with ctx as its parent: */
+		nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
+		ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
+	} else {
+		/* fast-path for shader key that lowers nothing in NIR:
+		 *
+		 * NOTE(review): in this path the passes below are run directly
+		 * on so->shader->nir rather than on a clone -- confirm that
+		 * this is intended.
+		 */
+		ctx->s = so->shader->nir;
+	}
+
+	/* this needs to be the last pass run, so do this here instead of
+	 * in ir3_optimize_nir():
+	 */
+	NIR_PASS_V(ctx->s, nir_lower_locals_to_regs);
+	NIR_PASS_V(ctx->s, nir_convert_from_ssa, true);
+
+	if (ir3_shader_debug & IR3_DBG_DISASM) {
+		/* terminate the header line so that the shader dump which
+		 * follows starts on a line of its own:
+		 */
+		printf("dump nir%dv%d: type=%d, k={cts=%u,hp=%u}\n",
+			so->shader->id, so->id, so->type,
+			so->key.color_two_side, so->key.half_precision);
+		nir_print_shader(ctx->s, stdout);
+	}
+
+	if (shader_debug_enabled(so->type)) {
+		fprintf(stderr, "NIR (final form) for %s shader:\n",
+			_mesa_shader_stage_to_string(so->type));
+		nir_print_shader(ctx->s, stderr);
+	}
+
+	ir3_nir_scan_driver_consts(ctx->s, &so->const_layout);
+
+	so->num_uniforms = ctx->s->num_uniforms;
+	so->num_ubos = ctx->s->info.num_ubos;
+
+	/* Layout of constant registers, each section aligned to vec4.  Note
+	 * that pointer size (ubo, etc) changes depending on generation.
+	 *
+	 *    user consts
+	 *    UBO addresses        (if the shader has UBOs)
+	 *    SSBO sizes           (if any were found by the const scan)
+	 *    image dims           (if any were found by the const scan)
+	 *    driver params        (IR3_DP_VS_COUNT for vertex shaders,
+	 *                          IR3_DP_CS_COUNT for compute, else none)
+	 *    if (vertex shader && gpu_id < 500 &&
+	 *            stream_output.num_outputs > 0)
+	 *       stream-out addresses
+	 *    immediates
+	 *
+	 * Immediates go last mostly because they are inserted in the CP pass
+	 * after the nir -> ir3 frontend.
+	 */
+	unsigned constoff = align(ctx->s->num_uniforms, 4);
+	unsigned ptrsz = pointer_size(ctx);
+
+	/* sections that do not get assigned below keep an all-ones base: */
+	memset(&so->constbase, ~0, sizeof(so->constbase));
+
+	if (so->num_ubos > 0) {
+		so->constbase.ubo = constoff;
+		constoff += align(ctx->s->info.num_ubos * ptrsz, 4) / 4;
+	}
+
+	if (so->const_layout.ssbo_size.count > 0) {
+		unsigned cnt = so->const_layout.ssbo_size.count;
+		so->constbase.ssbo_sizes = constoff;
+		constoff += align(cnt, 4) / 4;
+	}
+
+	if (so->const_layout.image_dims.count > 0) {
+		unsigned cnt = so->const_layout.image_dims.count;
+		so->constbase.image_dims = constoff;
+		constoff += align(cnt, 4) / 4;
+	}
+
+	unsigned num_driver_params = 0;
+	if (so->type == MESA_SHADER_VERTEX) {
+		num_driver_params = IR3_DP_VS_COUNT;
+	} else if (so->type == MESA_SHADER_COMPUTE) {
+		num_driver_params = IR3_DP_CS_COUNT;
+	}
+
+	/* the base is assigned even when there are no driver params, in
+	 * which case the section is simply empty:
+	 */
+	so->constbase.driver_param = constoff;
+	constoff += align(num_driver_params, 4) / 4;
+
+	if ((so->type == MESA_SHADER_VERTEX) &&
+			(compiler->gpu_id < 500) &&
+			so->shader->stream_output.num_outputs > 0) {
+		so->constbase.tfbo = constoff;
+		constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
+	}
+
+	so->constbase.immediate = constoff;
+
+	return ctx;
+}
+
+/* Report a compile error and mark the context as failed.
+ *
+ * If there is a current NIR instruction, the formatted message is attached
+ * to it as an annotation for the shader dump; otherwise the message is
+ * printed directly.  In both cases the shader is then dumped to stdout and
+ * ctx->error is set.
+ */
+static void
+compile_error(struct ir3_context *ctx, const char *format, ...)
+{
+ struct hash_table *errors = NULL;
+ va_list ap;
+ va_start(ap, format);
+ if (ctx->cur_instr) {
+ errors = _mesa_hash_table_create(NULL,
+ _mesa_hash_pointer,
+ _mesa_key_pointer_equal);
+ /* msg is allocated with the table as parent, so the
+ * ralloc_free() below is expected to release both:
+ */
+ char *msg = ralloc_vasprintf(errors, format, ap);
+ _mesa_hash_table_insert(errors, ctx->cur_instr, msg);
+ } else {
+ _debug_vprintf(format, ap);
+ }
+ va_end(ap);
+ /* errors is still NULL here if there was no current instruction: */
+ nir_print_shader_annotated(ctx->s, stdout, errors);
+ ralloc_free(errors);
+ ctx->error = true;
+ debug_assert(0);
+}
+
+/* Report a compile error (see compile_error()) if 'cond' does not hold.
+ *
+ * The stringified condition is passed as a "%s" argument rather than being
+ * pasted into the format string, so a condition that happens to contain a
+ * '%' character is printed literally instead of being parsed as a
+ * conversion specification.
+ */
+#define compile_assert(ctx, cond) do { \
+		if (!(cond)) compile_error((ctx), "failed assert: %s\n", #cond); \
+	} while (0)
+
+/* Release the compile context.  The context is used as the ralloc parent
+ * for the hash tables, arrays and value arrays allocated while compiling,
+ * so those are expected to be released along with it.
+ */
+static void
+compile_free(struct ir3_context *ctx)
+{
+ ralloc_free(ctx);
+}
+
+/* Create the ir3 array which backs the given nir register, and append it
+ * to the shader's array list.  Array ids are handed out starting from 1.
+ */
+static void
+declare_array(struct ir3_context *ctx, nir_register *reg)
+{
+	struct ir3_array *array = rzalloc(ctx, struct ir3_array);
+
+	array->id = ++ctx->num_arrays;
+	array->r = reg;
+
+	/* NOTE: sometimes we get non array regs, for example for arrays of
+	 * length 1.  See fs-const-array-of-struct-of-array.shader_test.  So
+	 * treat a non-array as if it was an array of length 1.
+	 *
+	 * It would be nice if there was a nir pass to convert arrays of
+	 * length 1 to ssa.
+	 */
+	array->length = reg->num_components * MAX2(1, reg->num_array_elems);
+	compile_assert(ctx, array->length > 0);
+
+	list_addtail(&array->node, &ctx->ir->array_list);
+}
+
+/* Look up the ir3 array that was declared for the given nir register.
+ * Reports a compile error and returns NULL if there is none.
+ */
+static struct ir3_array *
+get_array(struct ir3_context *ctx, nir_register *reg)
+{
+	struct ir3_array *found = NULL;
+
+	list_for_each_entry (struct ir3_array, cur, &ctx->ir->array_list, node) {
+		if (cur->r == reg) {
+			found = cur;
+			break;
+		}
+	}
+
+	if (!found)
+		compile_error(ctx, "bogus reg: %s\n", reg->name);
+
+	return found;
+}
+
+/* Emit a mov which reads element 'n' of the array.  The load is relative
+ * (indirect) if address!=NULL, in which case 'address' is attached to the
+ * mov as its address register source.
+ */
+static struct ir3_instruction *
+create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
+		struct ir3_instruction *address)
+{
+	unsigned src_flags = IR3_REG_ARRAY;
+	struct ir3_instruction *load;
+	struct ir3_register *elem;
+
+	if (address)
+		src_flags |= IR3_REG_RELATIV;
+
+	load = ir3_instr_create(ctx->block, OPC_MOV);
+	load->cat1.src_type = TYPE_U32;
+	load->cat1.dst_type = TYPE_U32;
+
+	/* a read of the array conflicts with writes to arrays: */
+	load->barrier_class = IR3_BARRIER_ARRAY_R;
+	load->barrier_conflict = IR3_BARRIER_ARRAY_W;
+
+	ir3_reg_create(load, 0, 0);   /* dst */
+
+	elem = ir3_reg_create(load, 0, src_flags);
+	elem->instr = arr->last_write;
+	elem->size = arr->length;
+	elem->array.id = arr->id;
+	elem->array.offset = n;
+
+	if (address)
+		ir3_instr_set_address(load, address);
+
+	return load;
+}
+
+/* Store the value produced by 'src' into element 'n' of the array.  The
+ * store is relative (indirect) if address!=NULL.
+ *
+ * A direct store does not emit a new instruction: the dst register of
+ * 'src' itself is turned into the array element.  An indirect store emits
+ * a mov with a relative array dst.  Either way the storing instruction
+ * becomes the array's last write and is added to the block's keeps.
+ */
+static void
+create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
+		struct ir3_instruction *src, struct ir3_instruction *address)
+{
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction *store;
+	struct ir3_register *elem;
+
+	if (address) {
+		store = ir3_instr_create(block, OPC_MOV);
+		store->cat1.src_type = TYPE_U32;
+		store->cat1.dst_type = TYPE_U32;
+		store->barrier_class = IR3_BARRIER_ARRAY_W;
+		store->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
+
+		elem = ir3_reg_create(store, 0, IR3_REG_ARRAY | IR3_REG_RELATIV);
+		elem->instr = arr->last_write;
+		elem->size = arr->length;
+		elem->array.id = arr->id;
+		elem->array.offset = n;
+
+		ir3_reg_create(store, 0, IR3_REG_SSA)->instr = src;
+
+		ir3_instr_set_address(store, address);
+	} else {
+		/* if not relative store, don't create an extra mov, since that
+		 * ends up being difficult for cp to remove.
+		 */
+		store = src;
+		store->barrier_class |= IR3_BARRIER_ARRAY_W;
+		store->barrier_conflict |= IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
+
+		elem = store->regs[0];
+		elem->flags |= IR3_REG_ARRAY;
+		elem->instr = arr->last_write;
+		elem->size = arr->length;
+		elem->array.id = arr->id;
+		elem->array.offset = n;
+	}
+
+	arr->last_write = store;
+
+	/* the array store may only matter to something in an earlier
+	 * block (ie. loops), but since arrays are not in SSA, depth
+	 * pass won't know this.. so keep all array stores:
+	 */
+	array_insert(block, block->keeps, store);
+}
+
+/* Map a bit size (32, 16 or 8) to the unsigned integer type of that size.
+ * Any other bit size is a programming error.
+ */
+static inline type_t utype_for_size(unsigned bit_size)
+{
+	if (bit_size == 32)
+		return TYPE_U32;
+	if (bit_size == 16)
+		return TYPE_U16;
+	if (bit_size == 8)
+		return TYPE_U8;
+
+	unreachable("bad bitsize");
+	return ~0;
+}
+
+/* unsigned integer type matching the bit size of a nir src: */
+static inline type_t utype_src(nir_src src)
+{
+	unsigned bits = nir_src_bit_size(src);
+
+	return utype_for_size(bits);
+}
+
+/* unsigned integer type matching the bit size of a nir dest: */
+static inline type_t utype_dst(nir_dest dst)
+{
+	unsigned bits = nir_dest_bit_size(dst);
+
+	return utype_for_size(bits);
+}
+
+/* Allocate an array of n instruction pointers for the given ssa def, and
+ * register it in def_ht under that def.  The caller is expected to
+ * populate the array, which is allocated with def_ht as its parent.
+ */
+static struct ir3_instruction **
+get_dst_ssa(struct ir3_context *ctx, nir_ssa_def *dst, unsigned n)
+{
+	struct ir3_instruction **vals;
+
+	vals = ralloc_array(ctx->def_ht, struct ir3_instruction *, n);
+	_mesa_hash_table_insert(ctx->def_ht, dst, vals);
+
+	return vals;
+}
+
+/* Get an n element value array for the caller to populate with the
+ * instructions that produce 'dst'.  For an ssa dest the array is also
+ * entered in def_ht; for a register dest it is a temporary.  The array is
+ * remembered as last_dst until the matching put_dst() call.
+ */
+static struct ir3_instruction **
+get_dst(struct ir3_context *ctx, nir_dest *dst, unsigned n)
+{
+	struct ir3_instruction **vals = dst->is_ssa ?
+		get_dst_ssa(ctx, &dst->ssa, n) :
+		ralloc_array(ctx, struct ir3_instruction *, n);
+
+	/* NOTE: in non-ssa case, we don't really need to store last_dst
+	 * but this helps us catch cases where put_dst() call is forgotten
+	 */
+	compile_assert(ctx, !ctx->last_dst);
+
+	ctx->last_dst_n = n;
+	ctx->last_dst = vals;
+
+	return vals;
+}
+
+static struct ir3_instruction * get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align);
+
+/* Get the array of instructions which provide the components of 'src'.
+ *
+ * An ssa src is looked up in def_ht.  A register src gets a freshly
+ * allocated array holding one array load per component, using relative
+ * addressing if the register access is indirect.
+ */
+static struct ir3_instruction * const *
+get_src(struct ir3_context *ctx, nir_src *src)
+{
+	if (src->is_ssa) {
+		struct hash_entry *he =
+			_mesa_hash_table_search(ctx->def_ht, src->ssa);
+
+		compile_assert(ctx, he);
+		return he->data;
+	}
+
+	nir_register *reg = src->reg.reg;
+	struct ir3_array *arr = get_array(ctx, reg);
+	unsigned ncomp = arr->r->num_components;
+	struct ir3_instruction *addr = NULL;
+	struct ir3_instruction **vals =
+		ralloc_array(ctx, struct ir3_instruction *, ncomp);
+
+	if (src->reg.indirect) {
+		struct ir3_instruction *idx = get_src(ctx, src->reg.indirect)[0];
+
+		addr = get_addr(ctx, idx, reg->num_components);
+	}
+
+	/* offset of the first component within the array: */
+	unsigned first = src->reg.base_offset * reg->num_components;
+
+	for (unsigned c = 0; c < ncomp; c++) {
+		unsigned n = first + c;
+
+		compile_assert(ctx, n < arr->length);
+		vals[c] = create_array_load(ctx, arr, n, addr);
+	}
+
+	return vals;
+}
+
+/* Finish a destination started with get_dst().
+ *
+ * For dests narrower than 32 bits, the produced instructions are flagged as
+ * half precision.  For a register (non-ssa) dest, each produced value is
+ * stored to the backing array and the temporary value array is freed.
+ * Finally last_dst is cleared so that the next get_dst() can be checked.
+ *
+ * Entries of the value array may be NULL for channels which were not
+ * written (for example a mov with a partial write mask), so both loops
+ * skip NULL entries.
+ */
+static void
+put_dst(struct ir3_context *ctx, nir_dest *dst)
+{
+	unsigned bit_size = nir_dest_bit_size(*dst);
+
+	if (bit_size < 32) {
+		for (unsigned i = 0; i < ctx->last_dst_n; i++) {
+			struct ir3_instruction *instr = ctx->last_dst[i];
+
+			/* unwritten channel, nothing to flag: */
+			if (!instr)
+				continue;
+
+			instr->regs[0]->flags |= IR3_REG_HALF;
+			/* for a fanout, also flag the dst of the instruction
+			 * which it splits:
+			 */
+			if (instr->opc == OPC_META_FO)
+				instr->regs[1]->instr->regs[0]->flags |= IR3_REG_HALF;
+		}
+	}
+
+	if (!dst->is_ssa) {
+		nir_register *reg = dst->reg.reg;
+		struct ir3_array *arr = get_array(ctx, reg);
+		unsigned num_components = ctx->last_dst_n;
+		struct ir3_instruction *addr = NULL;
+
+		if (dst->reg.indirect)
+			addr = get_addr(ctx, get_src(ctx, dst->reg.indirect)[0],
+					reg->num_components);
+
+		for (unsigned i = 0; i < num_components; i++) {
+			unsigned n = dst->reg.base_offset * reg->num_components + i;
+			compile_assert(ctx, n < arr->length);
+			if (!ctx->last_dst[i])
+				continue;
+			create_array_store(ctx, arr, n, ctx->last_dst[i], addr);
+		}
+
+		/* the value array was only a temporary in the non-ssa case: */
+		ralloc_free(ctx->last_dst);
+	}
+	ctx->last_dst = NULL;
+	ctx->last_dst_n = 0;
+}
+
+/* Emit a mov of the immediate 'val', using 'type' for both the src and
+ * dst type.  The dst is flagged as half precision when the type is
+ * narrower than 32 bits.
+ */
+static struct ir3_instruction *
+create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
+{
+	unsigned dst_flags = 0;
+	struct ir3_instruction *immed;
+	struct ir3_register *src;
+
+	if (type_size(type) < 32)
+		dst_flags = IR3_REG_HALF;
+
+	immed = ir3_instr_create(block, OPC_MOV);
+	immed->cat1.src_type = type;
+	immed->cat1.dst_type = type;
+
+	ir3_reg_create(immed, 0, dst_flags);   /* dst */
+
+	src = ir3_reg_create(immed, 0, IR3_REG_IMMED);
+	src->uim_val = val;
+
+	return immed;
+}
+
+/* Emit a mov of a 32 bit unsigned immediate, see create_immed_typed(): */
+static struct ir3_instruction *
+create_immed(struct ir3_block *block, uint32_t val)
+{
+ return create_immed_typed(block, val, TYPE_U32);
+}
+
+/* Emit the instruction sequence which loads 'src * align' into the address
+ * register: a conversion to a 16 bit value, the scaling by 'align' (which
+ * must be 1, 2, 3 or 4), and a final mov to a0.x.  Returns that final mov.
+ */
+static struct ir3_instruction *
+create_addr(struct ir3_block *block, struct ir3_instruction *src, int align)
+{
+	struct ir3_instruction *val, *scale;
+
+	/* TODO in at least some cases, the backend could probably be
+	 * made clever enough to propagate IR3_REG_HALF..
+	 */
+	val = ir3_COV(block, src, TYPE_U32, TYPE_S16);
+	val->regs[0]->flags |= IR3_REG_HALF;
+
+	switch (align) {
+	case 1:
+		/* src *= 1: */
+		break;
+	case 2:
+	case 4:
+		/* src *= 2 => src <<= 1, and src *= 4 => src <<= 2: */
+		scale = create_immed(block, (align == 2) ? 1 : 2);
+		scale->regs[0]->flags |= IR3_REG_HALF;
+
+		val = ir3_SHL_B(block, val, 0, scale, 0);
+		break;
+	case 3:
+		/* src *= 3: */
+		scale = create_immed(block, 3);
+		scale->regs[0]->flags |= IR3_REG_HALF;
+
+		val = ir3_MULL_U(block, val, 0, scale, 0);
+		break;
+	default:
+		unreachable("bad align");
+		return NULL;
+	}
+
+	/* if a scaling instruction was emitted, its dst and first src are
+	 * half precision too:
+	 */
+	if (align != 1) {
+		val->regs[0]->flags |= IR3_REG_HALF;
+		val->regs[1]->flags |= IR3_REG_HALF;
+	}
+
+	val = ir3_MOV(block, val, TYPE_S16);
+	val->regs[0]->num = regid(REG_A0, 0);
+	val->regs[0]->flags |= IR3_REG_HALF;
+	val->regs[1]->flags |= IR3_REG_HALF;
+
+	return val;
+}
+
+/* Get the address register value for 'src' scaled by 'align'.  Values are
+ * cached per alignment and per src, to avoid generating multiple
+ * cov/shl/mova sequences for each use of a given NIR level src as address.
+ */
+static struct ir3_instruction *
+get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align)
+{
+	unsigned idx = align - 1;
+	struct ir3_instruction *addr;
+	struct hash_table *cache;
+
+	compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr_ht));
+
+	cache = ctx->addr_ht[idx];
+	if (cache) {
+		struct hash_entry *hit = _mesa_hash_table_search(cache, src);
+
+		if (hit)
+			return hit->data;
+	} else {
+		/* first use of this alignment, create its table: */
+		cache = _mesa_hash_table_create(ctx,
+				_mesa_hash_pointer, _mesa_key_pointer_equal);
+		ctx->addr_ht[idx] = cache;
+	}
+
+	addr = create_addr(ctx->block, src, align);
+	_mesa_hash_table_insert(cache, src, addr);
+
+	return addr;
+}
+
+/* Emit a compare of 'src' against zero (not-equal) whose result is written
+ * to the predicate register p0.x.  Returns the compare instruction.
+ */
+static struct ir3_instruction *
+get_predicate(struct ir3_context *ctx, struct ir3_instruction *src)
+{
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction *zero = create_immed(block, 0);
+	struct ir3_instruction *pred;
+
+	/* NOTE: only cmps.*.* can write p0.x: */
+	pred = ir3_CMPS_S(block, src, 0, zero, 0);
+	pred->cat2.condition = IR3_COND_NE;
+
+	/* condition always goes in predicate register: */
+	pred->regs[0]->num = regid(REG_P0, 0);
+
+	return pred;
+}
+
+/* Emit a mov which reads const register 'n' in the current block: */
+static struct ir3_instruction *
+create_uniform(struct ir3_context *ctx, unsigned n)
+{
+	struct ir3_instruction *load = ir3_instr_create(ctx->block, OPC_MOV);
+
+	/* TODO get types right? */
+	load->cat1.src_type = TYPE_F32;
+	load->cat1.dst_type = TYPE_F32;
+
+	ir3_reg_create(load, 0, 0);               /* dst */
+	ir3_reg_create(load, n, IR3_REG_CONST);   /* src */
+
+	return load;
+}
+
+/* Emit a mov which reads a const register relative to the address
+ * register value 'address', with 'n' as the offset:
+ */
+static struct ir3_instruction *
+create_uniform_indirect(struct ir3_context *ctx, int n,
+		struct ir3_instruction *address)
+{
+	struct ir3_instruction *load = ir3_instr_create(ctx->block, OPC_MOV);
+	struct ir3_register *src;
+
+	load->cat1.src_type = TYPE_U32;
+	load->cat1.dst_type = TYPE_U32;
+
+	ir3_reg_create(load, 0, 0);   /* dst */
+
+	src = ir3_reg_create(load, 0, IR3_REG_CONST | IR3_REG_RELATIV);
+	src->array.offset = n;
+
+	ir3_instr_set_address(load, address);
+
+	return load;
+}
+
+/* Emit a collect (fanin) meta instruction which groups the 'arrsz' values
+ * in 'arr' together.  Returns NULL if arrsz is zero.  All elements must
+ * agree with the first element on whether they are half precision.
+ */
+static struct ir3_instruction *
+create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
+ unsigned arrsz)
+{
+ struct ir3_block *block = ctx->block;
+ struct ir3_instruction *collect;
+
+ if (arrsz == 0)
+ return NULL;
+
+ /* the first element decides if the collect is half precision: */
+ unsigned flags = arr[0]->regs[0]->flags & IR3_REG_HALF;
+
+ collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz);
+ ir3_reg_create(collect, 0, flags); /* dst */
+ for (unsigned i = 0; i < arrsz; i++) {
+ struct ir3_instruction *elem = arr[i];
+
+ /* Since arrays are pre-colored in RA, we can't assume that
+ * things will end up in the right place. (Ie. if a collect
+ * joins elements from two different arrays.) So insert an
+ * extra mov.
+ *
+ * We could possibly skip this if all the collected elements
+ * are contiguous elements in a single array.. not sure how
+ * likely that is to happen.
+ *
+ * Fixes a problem with glamor shaders, that in effect do
+ * something like:
+ *
+ * if (foo)
+ * texcoord = ..
+ * else
+ * texcoord = ..
+ * color = texture2D(tex, texcoord);
+ *
+ * In this case, texcoord will end up as nir registers (which
+ * translate to ir3 arrays of length 1). And we can't assume
+ * the two (or more) arrays will get allocated in consecutive
+ * scalar registers.
+ *
+ */
+ if (elem->regs[0]->flags & IR3_REG_ARRAY) {
+ type_t type = (flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+ elem = ir3_MOV(block, elem, type);
+ }
+
+ compile_assert(ctx, (elem->regs[0]->flags & IR3_REG_HALF) == flags);
+ ir3_reg_create(collect, 0, IR3_REG_SSA | flags)->instr = elem;
+ }
+
+ return collect;
+}
+
+/* Emit a mov which reads, relative to the address register value
+ * 'address', from the group of 'arrsz' values produced by 'collect', with
+ * 'n' as the offset:
+ */
+static struct ir3_instruction *
+create_indirect_load(struct ir3_context *ctx, unsigned arrsz, int n,
+		struct ir3_instruction *address, struct ir3_instruction *collect)
+{
+	struct ir3_instruction *load = ir3_instr_create(ctx->block, OPC_MOV);
+	struct ir3_register *elem;
+
+	load->cat1.src_type = TYPE_U32;
+	load->cat1.dst_type = TYPE_U32;
+
+	ir3_reg_create(load, 0, 0);   /* dst */
+
+	elem = ir3_reg_create(load, 0, IR3_REG_SSA | IR3_REG_RELATIV);
+	elem->instr = collect;
+	elem->size = arrsz;
+	elem->array.offset = n;
+
+	ir3_instr_set_address(load, address);
+
+	return load;
+}
+
+/* Emit an input meta instruction in the shader's input block.  'n' is used
+ * as the register number of its dst, and 'compmask' as the dst's write
+ * mask:
+ */
+static struct ir3_instruction *
+create_input_compmask(struct ir3_context *ctx, unsigned n, unsigned compmask)
+{
+	struct ir3_block *in_block = ctx->in_block;
+	struct ir3_instruction *input;
+
+	input = ir3_instr_create(in_block, OPC_META_INPUT);
+	input->inout.block = in_block;
+
+	ir3_reg_create(input, n, 0);
+	input->regs[0]->wrmask = compmask;
+
+	return input;
+}
+
+/* Emit an input with a write mask of 0x1, see create_input_compmask(): */
+static struct ir3_instruction *
+create_input(struct ir3_context *ctx, unsigned n)
+{
+ return create_input_compmask(ctx, n, 0x1);
+}
+
+/* Emit the load of a fragment shader varying: an ldlv if 'use_ldlv' is
+ * set, otherwise a bary.f using frag_vcoord.  The varying location is
+ * emitted as a zero immediate which gets patched up later.
+ */
+static struct ir3_instruction *
+create_frag_input(struct ir3_context *ctx, bool use_ldlv)
+{
+	struct ir3_block *block = ctx->block;
+	/* actual inloc is assigned and fixed up later: */
+	struct ir3_instruction *inloc = create_immed(block, 0);
+	struct ir3_instruction *load;
+
+	if (!use_ldlv) {
+		load = ir3_BARY_F(block, inloc, 0, ctx->frag_vcoord, 0);
+		load->regs[2]->wrmask = 0x3;
+		return load;
+	}
+
+	struct ir3_instruction *one = create_immed(block, 1);
+
+	load = ir3_LDLV(block, inloc, 0, one, 0);
+	load->cat6.type = TYPE_U32;
+	load->cat6.iim_val = 1;
+
+	return load;
+}
+
+/* Emit a load of the driver param 'dp' from the const file.  The driver
+ * params live in the section of the const file which starts at
+ * constbase.driver_param.
+ */
+static struct ir3_instruction *
+create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
+{
+ /* NOTE(review): an earlier comment here said the first four vec4
+ * sysval's are reserved for UBOs; the code uses the driver_param
+ * const base instead -- confirm the old note is obsolete.
+ */
+ /* NOTE: dp is in scalar, but there can be >4 dp components: */
+ unsigned n = ctx->so->constbase.driver_param;
+ unsigned r = regid(n + dp / 4, dp % 4);
+ return create_uniform(ctx, r);
+}
+
+/* helper for instructions that produce multiple consecutive scalar
+ * outputs which need to have a split/fanout meta instruction inserted
+ *
+ * One fanout is created per component in [base, base + n), each linked to
+ * its left/right neighbor.  Only the fanouts whose component is set in the
+ * write mask of 'src' are stored, packed, into 'dst'.
+ */
+static void
+split_dest(struct ir3_block *block, struct ir3_instruction **dst,
+		struct ir3_instruction *src, unsigned base, unsigned n)
+{
+	unsigned wrmask = src->regs[0]->wrmask;
+	struct ir3_instruction *left = NULL;
+	unsigned out = 0;
+
+	/* a single component does not need a fanout: */
+	if ((n == 1) && (wrmask == 0x1)) {
+		dst[0] = src;
+		return;
+	}
+
+	for (unsigned i = 0; i < n; i++) {
+		unsigned off = i + base;
+		struct ir3_instruction *fanout =
+			ir3_instr_create(block, OPC_META_FO);
+
+		ir3_reg_create(fanout, 0, IR3_REG_SSA);
+		ir3_reg_create(fanout, 0, IR3_REG_SSA)->instr = src;
+		fanout->fo.off = off;
+
+		if (left) {
+			fanout->cp.left = left;
+			fanout->cp.left_cnt += 1;
+			left->cp.right = fanout;
+			left->cp.right_cnt += 1;
+		}
+		left = fanout;
+
+		if (wrmask & (1 << off))
+			dst[out++] = fanout;
+	}
+}
+
+/*
+ * Adreno uses uint rather than having dedicated bool type,
+ * which (potentially) requires some conversion, in particular
+ * when using output of a bool instr to int input, or vice
+ * versa.
+ *
+ * | Adreno | NIR |
+ * -------+---------+-------+-
+ * true | 1 | ~0 |
+ * false | 0 | 0 |
+ *
+ * To convert from an adreno bool (uint) to nir, use:
+ *
+ * absneg.s dst, (neg)src
+ *
+ * To convert back in the other direction:
+ *
+ * absneg.s dst, (abs)src
+ *
+ * The CP step can clean up the absneg.s that cancel each other
+ * out, and with a slight bit of extra cleverness (to recognize
+ * the instructions which produce either a 0 or 1) can eliminate
+ * the absneg.s's completely when an instruction that wants
+ * 0/1 consumes the result. For example, when a nir 'bcsel'
+ * consumes the result of 'feq'. So we should be able to get by
+ * without a boolean resolve step, and without incurring any
+ * extra penalty in instruction count.
+ */
+
+/* NIR bool -> native (adreno): an absneg.s with (abs) on the src, which
+ * turns the NIR true value (~0) into 1 and leaves 0 unchanged:
+ */
+static struct ir3_instruction *
+ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr)
+{
+ return ir3_ABSNEG_S(block, instr, IR3_REG_SABS);
+}
+
+/* native (adreno) -> NIR bool: an absneg.s with (neg) on the src, which
+ * turns the native true value (1) into ~0 and leaves 0 unchanged:
+ */
+static struct ir3_instruction *
+ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr)
+{
+ return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG);
+}
+
+/*
+ * alu/sfu instructions:
+ */
+
+/* Emit the cov (type conversion) instruction for a NIR conversion op.
+ *
+ * The source type is derived from the kind of source the op takes (float,
+ * signed or unsigned int) together with 'src_bitsize'; the destination
+ * type is fully determined by the op.  An unsupported op or source bit
+ * size is reported with compile_error().
+ *
+ * Both types start out with a defined value, since compile_error() can
+ * return, in which case the types would otherwise be read uninitialized
+ * when the cov is created below.
+ */
+static struct ir3_instruction *
+create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
+		unsigned src_bitsize, nir_op op)
+{
+	type_t src_type = TYPE_U32;
+	type_t dst_type = TYPE_U32;
+
+	/* source type, from the op's source kind and the src bit size: */
+	switch (op) {
+	case nir_op_f2f32:
+	case nir_op_f2f16_rtne:
+	case nir_op_f2f16_rtz:
+	case nir_op_f2f16:
+	case nir_op_f2i32:
+	case nir_op_f2i16:
+	case nir_op_f2i8:
+	case nir_op_f2u32:
+	case nir_op_f2u16:
+	case nir_op_f2u8:
+		switch (src_bitsize) {
+		case 32:
+			src_type = TYPE_F32;
+			break;
+		case 16:
+			src_type = TYPE_F16;
+			break;
+		default:
+			compile_error(ctx, "invalid src bit size: %u", src_bitsize);
+		}
+		break;
+
+	case nir_op_i2f32:
+	case nir_op_i2f16:
+	case nir_op_i2i32:
+	case nir_op_i2i16:
+	case nir_op_i2i8:
+		switch (src_bitsize) {
+		case 32:
+			src_type = TYPE_S32;
+			break;
+		case 16:
+			src_type = TYPE_S16;
+			break;
+		case 8:
+			src_type = TYPE_S8;
+			break;
+		default:
+			compile_error(ctx, "invalid src bit size: %u", src_bitsize);
+		}
+		break;
+
+	case nir_op_u2f32:
+	case nir_op_u2f16:
+	case nir_op_u2u32:
+	case nir_op_u2u16:
+	case nir_op_u2u8:
+		switch (src_bitsize) {
+		case 32:
+			src_type = TYPE_U32;
+			break;
+		case 16:
+			src_type = TYPE_U16;
+			break;
+		case 8:
+			src_type = TYPE_U8;
+			break;
+		default:
+			compile_error(ctx, "invalid src bit size: %u", src_bitsize);
+		}
+		break;
+
+	default:
+		compile_error(ctx, "invalid conversion op: %u", op);
+	}
+
+	/* destination type, from the op alone: */
+	switch (op) {
+	case nir_op_f2f32:
+	case nir_op_i2f32:
+	case nir_op_u2f32:
+		dst_type = TYPE_F32;
+		break;
+
+	case nir_op_f2f16_rtne:
+	case nir_op_f2f16_rtz:
+	case nir_op_f2f16:
+		/* TODO how to handle rounding mode? */
+	case nir_op_i2f16:
+	case nir_op_u2f16:
+		dst_type = TYPE_F16;
+		break;
+
+	case nir_op_f2i32:
+	case nir_op_i2i32:
+		dst_type = TYPE_S32;
+		break;
+
+	case nir_op_f2i16:
+	case nir_op_i2i16:
+		dst_type = TYPE_S16;
+		break;
+
+	case nir_op_f2i8:
+	case nir_op_i2i8:
+		dst_type = TYPE_S8;
+		break;
+
+	case nir_op_f2u32:
+	case nir_op_u2u32:
+		dst_type = TYPE_U32;
+		break;
+
+	case nir_op_f2u16:
+	case nir_op_u2u16:
+		dst_type = TYPE_U16;
+		break;
+
+	case nir_op_f2u8:
+	case nir_op_u2u8:
+		dst_type = TYPE_U8;
+		break;
+
+	default:
+		compile_error(ctx, "invalid conversion op: %u", op);
+	}
+
+	return ir3_COV(ctx->block, src, src_type, dst_type);
+}
+
+static void
+emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
+{
+ const nir_op_info *info = &nir_op_infos[alu->op];
+ struct ir3_instruction **dst, *src[info->num_inputs];
+ unsigned bs[info->num_inputs]; /* bit size */
+ struct ir3_block *b = ctx->block;
+ unsigned dst_sz, wrmask;
+
+ if (alu->dest.dest.is_ssa) {
+ dst_sz = alu->dest.dest.ssa.num_components;
+ wrmask = (1 << dst_sz) - 1;
+ } else {
+ dst_sz = alu->dest.dest.reg.reg->num_components;
+ wrmask = alu->dest.write_mask;
+ }
+
+ dst = get_dst(ctx, &alu->dest.dest, dst_sz);
+
+ /* Vectors are special in that they have non-scalarized writemasks,
+ * and just take the first swizzle channel for each argument in
+ * order into each writemask channel.
+ */
+ if ((alu->op == nir_op_vec2) ||
+ (alu->op == nir_op_vec3) ||
+ (alu->op == nir_op_vec4)) {
+
+ for (int i = 0; i < info->num_inputs; i++) {
+ nir_alu_src *asrc = &alu->src[i];
+
+ compile_assert(ctx, !asrc->abs);
+ compile_assert(ctx, !asrc->negate);
+
+ src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[0]];
+ if (!src[i])
+ src[i] = create_immed(ctx->block, 0);
+ dst[i] = ir3_MOV(b, src[i], TYPE_U32);
+ }
+
+ put_dst(ctx, &alu->dest.dest);
+ return;
+ }
+
+ /* We also get mov's with more than one component for mov's so
+ * handle those specially:
+ */
+ if ((alu->op == nir_op_imov) || (alu->op == nir_op_fmov)) {
+ type_t type = (alu->op == nir_op_imov) ? TYPE_U32 : TYPE_F32;
+ nir_alu_src *asrc = &alu->src[0];
+ struct ir3_instruction *const *src0 = get_src(ctx, &asrc->src);
+
+ for (unsigned i = 0; i < dst_sz; i++) {
+ if (wrmask & (1 << i)) {
+ dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], type);
+ } else {
+ dst[i] = NULL;
+ }
+ }
+
+ put_dst(ctx, &alu->dest.dest);
+ return;
+ }
+
+ /* General case: We can just grab the one used channel per src. */
+ for (int i = 0; i < info->num_inputs; i++) {
+ unsigned chan = ffs(alu->dest.write_mask) - 1;
+ nir_alu_src *asrc = &alu->src[i];
+
+ compile_assert(ctx, !asrc->abs);
+ compile_assert(ctx, !asrc->negate);
+
+ src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
+ bs[i] = nir_src_bit_size(asrc->src);
+
+ compile_assert(ctx, src[i]);
+ }
+
+ switch (alu->op) {
+ case nir_op_f2f32:
+ case nir_op_f2f16_rtne:
+ case nir_op_f2f16_rtz:
+ case nir_op_f2f16:
+ case nir_op_f2i32:
+ case nir_op_f2i16:
+ case nir_op_f2i8:
+ case nir_op_f2u32:
+ case nir_op_f2u16:
+ case nir_op_f2u8:
+ case nir_op_i2f32:
+ case nir_op_i2f16:
+ case nir_op_i2i32:
+ case nir_op_i2i16:
+ case nir_op_i2i8:
+ case nir_op_u2f32:
+ case nir_op_u2f16:
+ case nir_op_u2u32:
+ case nir_op_u2u16:
+ case nir_op_u2u8:
+ dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
+ break;
+ case nir_op_f2b:
+ dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0);
+ dst[0]->cat2.condition = IR3_COND_NE;
+ dst[0] = ir3_n2b(b, dst[0]);
+ break;
+ case nir_op_b2f:
+ dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32);
+ break;
+ case nir_op_b2i:
+ dst[0] = ir3_b2n(b, src[0]);
+ break;
+ case nir_op_i2b:
+ dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
+ dst[0]->cat2.condition = IR3_COND_NE;
+ dst[0] = ir3_n2b(b, dst[0]);
+ break;
+
+ case nir_op_fneg:
+ dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
+ break;
+ case nir_op_fabs:
+ dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
+ break;
+ case nir_op_fmax:
+ dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_fmin:
+ dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_fsat:
+ /* if there is just a single use of the src, and it supports
+ * (sat) bit, we can just fold the (sat) flag back to the
+ * src instruction and create a mov. This is easier for cp
+ * to eliminate.
+ *
+ * TODO probably opc_cat==4 is ok too
+ */
+ if (alu->src[0].src.is_ssa &&
+ (list_length(&alu->src[0].src.ssa->uses) == 1) &&
+ ((opc_cat(src[0]->opc) == 2) || (opc_cat(src[0]->opc) == 3))) {
+ src[0]->flags |= IR3_INSTR_SAT;
+ dst[0] = ir3_MOV(b, src[0], TYPE_U32);
+ } else {
+ /* otherwise generate a max.f that saturates.. blob does
+ * similar (generating a cat2 mov using max.f)
+ */
+ dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
+ dst[0]->flags |= IR3_INSTR_SAT;
+ }
+ break;
+ case nir_op_fmul:
+ dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_fadd:
+ dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_fsub:
+ dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
+ break;
+ case nir_op_ffma:
+ dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
+ break;
+ case nir_op_fddx:
+ dst[0] = ir3_DSX(b, src[0], 0);
+ dst[0]->cat5.type = TYPE_F32;
+ break;
+ case nir_op_fddy:
+ dst[0] = ir3_DSY(b, src[0], 0);
+ dst[0]->cat5.type = TYPE_F32;
+ break;
+ break;
+ case nir_op_flt:
+ dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+ dst[0]->cat2.condition = IR3_COND_LT;
+ dst[0] = ir3_n2b(b, dst[0]);
+ break;
+ case nir_op_fge:
+ dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+ dst[0]->cat2.condition = IR3_COND_GE;
+ dst[0] = ir3_n2b(b, dst[0]);
+ break;
+ case nir_op_feq:
+ dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+ dst[0]->cat2.condition = IR3_COND_EQ;
+ dst[0] = ir3_n2b(b, dst[0]);
+ break;
+ case nir_op_fne:
+ dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+ dst[0]->cat2.condition = IR3_COND_NE;
+ dst[0] = ir3_n2b(b, dst[0]);
+ break;
+ case nir_op_fceil:
+ dst[0] = ir3_CEIL_F(b, src[0], 0);
+ break;
+ case nir_op_ffloor:
+ dst[0] = ir3_FLOOR_F(b, src[0], 0);
+ break;
+ case nir_op_ftrunc:
+ dst[0] = ir3_TRUNC_F(b, src[0], 0);
+ break;
+ case nir_op_fround_even:
+ dst[0] = ir3_RNDNE_F(b, src[0], 0);
+ break;
+ case nir_op_fsign:
+ dst[0] = ir3_SIGN_F(b, src[0], 0);
+ break;
+
+ case nir_op_fsin:
+ dst[0] = ir3_SIN(b, src[0], 0);
+ break;
+ case nir_op_fcos:
+ dst[0] = ir3_COS(b, src[0], 0);
+ break;
+ case nir_op_frsq:
+ dst[0] = ir3_RSQ(b, src[0], 0);
+ break;
+ case nir_op_frcp:
+ dst[0] = ir3_RCP(b, src[0], 0);
+ break;
+ case nir_op_flog2:
+ dst[0] = ir3_LOG2(b, src[0], 0);
+ break;
+ case nir_op_fexp2:
+ dst[0] = ir3_EXP2(b, src[0], 0);
+ break;
+ case nir_op_fsqrt:
+ dst[0] = ir3_SQRT(b, src[0], 0);
+ break;
+
+ case nir_op_iabs:
+ dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
+ break;
+ case nir_op_iadd:
+ dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_iand:
+ dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_imax:
+ dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_umax:
+ dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_imin:
+ dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_umin:
+ dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_imul:
+ /*
+ * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16)
+ * mull.u tmp0, a, b ; mul low, i.e. al * bl
+ * madsh.m16 tmp1, a, b, tmp0 ; mul-add shift high mix, i.e. ah * bl << 16
+ * madsh.m16 dst, b, a, tmp1 ; i.e. al * bh << 16
+ */
+ dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0,
+ ir3_MADSH_M16(b, src[0], 0, src[1], 0,
+ ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0);
+ break;
+ case nir_op_ineg:
+ dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
+ break;
+ case nir_op_inot:
+ dst[0] = ir3_NOT_B(b, src[0], 0);
+ break;
+ case nir_op_ior:
+ dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_ishl:
+ dst[0] = ir3_SHL_B(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_ishr:
+ dst[0] = ir3_ASHR_B(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_isign: {
+ /* maybe this would be sane to lower in nir.. */
+ struct ir3_instruction *neg, *pos;
+
+ neg = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
+ neg->cat2.condition = IR3_COND_LT;
+
+ pos = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
+ pos->cat2.condition = IR3_COND_GT;
+
+ dst[0] = ir3_SUB_U(b, pos, 0, neg, 0);
+
+ break;
+ }
+ case nir_op_isub:
+ dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_ixor:
+ dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_ushr:
+ dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_ilt:
+ dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+ dst[0]->cat2.condition = IR3_COND_LT;
+ dst[0] = ir3_n2b(b, dst[0]);
+ break;
+ case nir_op_ige:
+ dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+ dst[0]->cat2.condition = IR3_COND_GE;
+ dst[0] = ir3_n2b(b, dst[0]);
+ break;
+ case nir_op_ieq:
+ dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+ dst[0]->cat2.condition = IR3_COND_EQ;
+ dst[0] = ir3_n2b(b, dst[0]);
+ break;
+ case nir_op_ine:
+ dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+ dst[0]->cat2.condition = IR3_COND_NE;
+ dst[0] = ir3_n2b(b, dst[0]);
+ break;
+ case nir_op_ult:
+ dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
+ dst[0]->cat2.condition = IR3_COND_LT;
+ dst[0] = ir3_n2b(b, dst[0]);
+ break;
+ case nir_op_uge:
+ dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
+ dst[0]->cat2.condition = IR3_COND_GE;
+ dst[0] = ir3_n2b(b, dst[0]);
+ break;
+
+ case nir_op_bcsel: {
+ struct ir3_instruction *cond = ir3_b2n(b, src[0]);
+ compile_assert(ctx, bs[1] == bs[2]);
+ /* the boolean condition is 32b even if src[1] and src[2] are
+ * half-precision, but sel.b16 wants all three src's to be the
+ * same type.
+ */
+ if (bs[1] < 32)
+ cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
+ dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
+ break;
+ }
+ case nir_op_bit_count:
+ dst[0] = ir3_CBITS_B(b, src[0], 0);
+ break;
+ case nir_op_ifind_msb: {
+ struct ir3_instruction *cmp;
+ dst[0] = ir3_CLZ_S(b, src[0], 0);
+ cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
+ cmp->cat2.condition = IR3_COND_GE;
+ dst[0] = ir3_SEL_B32(b,
+ ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
+ cmp, 0, dst[0], 0);
+ break;
+ }
+ case nir_op_ufind_msb:
+ dst[0] = ir3_CLZ_B(b, src[0], 0);
+ dst[0] = ir3_SEL_B32(b,
+ ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
+ src[0], 0, dst[0], 0);
+ break;
+ case nir_op_find_lsb:
+ dst[0] = ir3_BFREV_B(b, src[0], 0);
+ dst[0] = ir3_CLZ_B(b, dst[0], 0);
+ break;
+ case nir_op_bitfield_reverse:
+ dst[0] = ir3_BFREV_B(b, src[0], 0);
+ break;
+
+ default:
+ compile_error(ctx, "Unhandled ALU op: %s\n",
+ nir_op_infos[alu->op].name);
+ break;
+ }
+
+ put_dst(ctx, &alu->dest.dest);
+}
+
+/* Emit a UBO read.  The block index (src[0]) and the offset (src[1]) can
+ * each be either an immediate or a dynamic value; one ldg is emitted per
+ * destination component and stored into dst[].
+ */
+static void
+emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	struct ir3_block *blk = ctx->block;
+	struct ir3_instruction *lo, *hi, *ptr, *index;
+	nir_const_value *imm_off;
+	/* UBO addresses are the first driver params: */
+	const unsigned ubo_base = regid(ctx->so->constbase.ubo, 0);
+	const unsigned ptrsz = pointer_size(ctx);
+	int byte_off = 0;
+
+	/* src[0] is the ubo index, which could either be an immed or not: */
+	index = get_src(ctx, &intr->src[0])[0];
+	if (!is_same_type_mov(index) ||
+			!(index->regs[1]->flags & IR3_REG_IMMED)) {
+		/* dynamic index: fetch the pointer with an indirect uniform access */
+		lo = create_uniform_indirect(ctx, ubo_base, get_addr(ctx, index, 4));
+		hi = create_uniform_indirect(ctx, ubo_base + 1, get_addr(ctx, index, 4));
+	} else {
+		/* immediate index: the pointer slot is known at compile time */
+		const unsigned slot = ubo_base + (index->regs[1]->iim_val * ptrsz);
+
+		lo = create_uniform(ctx, slot);
+		hi = create_uniform(ctx, slot + 1);
+	}
+
+	/* note: on 32bit gpu's the high half is ignored and DCE'd */
+	ptr = lo;
+
+	imm_off = nir_src_as_const_value(intr->src[1]);
+	if (!imm_off) {
+		/* For load_ubo_indirect, second src is indirect offset, which
+		 * gets added to the address:
+		 */
+		struct ir3_instruction *dyn_off = get_src(ctx, &intr->src[1])[0];
+
+		ptr = ir3_ADD_S(blk, ptr, 0, dyn_off, 0);
+	} else {
+		byte_off += imm_off->u32[0];
+	}
+
+	/* if the offset is too large to encode in the ldg, split it out: */
+	if ((byte_off + (intr->num_components * 4)) > 1024) {
+		/* split out the minimal amount to improve the odds that
+		 * cp can fit the immediate in the add.s instruction:
+		 */
+		unsigned excess = byte_off + (intr->num_components * 4) - 1024;
+
+		ptr = ir3_ADD_S(blk, ptr, 0, create_immed(blk, excess), 0);
+		byte_off -= excess;
+	}
+
+	if (ptrsz == 2) {
+		/* handle 32b rollover of the low half, ie:
+		 *   if (addr < base_lo)
+		 *      base_hi++
+		 */
+		struct ir3_instruction *carry = ir3_CMPS_U(blk, ptr, 0, lo, 0);
+
+		carry->cat2.condition = IR3_COND_LT;
+		hi = ir3_ADD_S(blk, hi, 0, carry, 0);
+
+		ptr = create_collect(ctx, (struct ir3_instruction*[]){ ptr, hi }, 2);
+	}
+
+	for (int comp = 0; comp < intr->num_components; comp++) {
+		struct ir3_instruction *ldg =
+			ir3_LDG(blk, ptr, 0, create_immed(blk, 1), 0);
+
+		ldg->cat6.src_offset = byte_off + comp * 4;     /* byte offset */
+		ldg->cat6.type = TYPE_U32;
+		dst[comp] = ldg;
+	}
+}
+
+/* src[] = { buffer_index, offset }. No const_index
+ *
+ * Emits a single ldgb covering all components and splits the result
+ * into dst[].
+ */
+static void
+emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	struct ir3_block *blk = ctx->block;
+	struct ir3_instruction *load, *byte_off, *addr64, *dword_off, *zero;
+	nir_const_value *const_offset;
+
+	/* can this be non-const buffer_index? how do we handle that? */
+	const_offset = nir_src_as_const_value(intr->src[0]);
+	compile_assert(ctx, const_offset);
+
+	byte_off = get_src(ctx, &intr->src[1])[0];
+
+	/* nir already *= 4 the offset, so it is passed both as
+	 * uvec2(offset, 0) and, shifted back down by two bits, as the
+	 * second address operand:
+	 */
+	zero = create_immed(blk, 0);
+	addr64 = create_collect(ctx, (struct ir3_instruction*[]){
+			byte_off,
+			zero,
+		}, 2);
+	dword_off = ir3_SHR_B(blk, byte_off, 0, create_immed(blk, 2), 0);
+
+	load = ir3_LDGB(blk, create_immed(blk, const_offset->u32[0]), 0,
+			addr64, 0, dword_off, 0);
+	load->cat6.type = TYPE_U32;
+	load->cat6.d = 4;
+	load->cat6.iim_val = intr->num_components;
+	load->regs[0]->wrmask = MASK(intr->num_components);
+	load->barrier_conflict = IR3_BARRIER_BUFFER_W;
+	load->barrier_class = IR3_BARRIER_BUFFER_R;
+
+	split_dest(blk, dst, load, 0, intr->num_components);
+}
+
+/* src[] = { value, block_index, offset }. const_index[] = { write_mask }
+ *
+ * Emits a single stgb and adds it to the block's keeps list.
+ */
+static void
+emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *blk = ctx->block;
+	struct ir3_instruction *store, *data, *dword_off, *addr64, *byte_off;
+	nir_const_value *const_offset;
+	/* TODO handle wrmask properly, see _store_shared().. but I think
+	 * it is more a PITA than that, since blob ends up loading the
+	 * masked components and writing them back out.
+	 */
+	const unsigned mask = intr->const_index[0];
+	/* index of the lowest clear bit, ie. length of the leading run of
+	 * enabled components:
+	 */
+	const unsigned ncomp = ffs(~mask) - 1;
+
+	/* can this be non-const buffer_index? how do we handle that? */
+	const_offset = nir_src_as_const_value(intr->src[1]);
+	compile_assert(ctx, const_offset);
+
+	byte_off = get_src(ctx, &intr->src[2])[0];
+
+	/* operands: the value, the offset shifted down by two bits, and
+	 * uvec2(offset, 0).. nir already *= 4 the offset:
+	 */
+	data = create_collect(ctx, get_src(ctx, &intr->src[0]), ncomp);
+	dword_off = ir3_SHR_B(blk, byte_off, 0, create_immed(blk, 2), 0);
+	addr64 = create_collect(ctx, (struct ir3_instruction*[]){
+			byte_off,
+			create_immed(blk, 0),
+		}, 2);
+
+	store = ir3_STGB(blk, create_immed(blk, const_offset->u32[0]), 0,
+			data, 0, dword_off, 0, addr64, 0);
+	store->cat6.type = TYPE_U32;
+	store->cat6.d = 4;
+	store->cat6.iim_val = ncomp;
+	store->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+	store->barrier_class = IR3_BARRIER_BUFFER_W;
+
+	/* the store has no consumers, so keep it explicitly: */
+	array_insert(blk, blk->keeps, store);
+}
+
+/* src[] = { block_index }
+ *
+ * The size is not computed here; it is read back from a const register.
+ */
+static void
+emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	const unsigned blk_idx = nir_src_as_const_value(intr->src[0])->u32[0];
+
+	debug_assert(ctx->so->const_layout.ssbo_size.mask & (1 << blk_idx));
+
+	/* SSBO size stored as a const starting at ssbo_sizes, at the offset
+	 * recorded for this block in the const layout:
+	 */
+	const unsigned base = regid(ctx->so->constbase.ssbo_sizes, 0);
+	const unsigned slot = ctx->so->const_layout.ssbo_size.off[blk_idx];
+
+	dst[0] = create_uniform(ctx, base + slot);
+}
+
+/*
+ * SSBO atomic intrinsics
+ *
+ * All of the SSBO atomic memory operations read a value from memory,
+ * compute a new value using one of the operations below, write the new
+ * value to memory, and return the original value read.
+ *
+ * All operations take 3 sources except CompSwap that takes 4. These
+ * sources represent:
+ *
+ * 0: The SSBO buffer index.
+ * 1: The offset into the SSBO buffer of the variable that the atomic
+ * operation will operate on.
+ * 2: The data parameter to the atomic function (i.e. the value to add
+ * in ssbo_atomic_add, etc).
+ * 3: For CompSwap only: the second data parameter.
+ *
+ * Returns the emitted atomic instruction, which has also been added to
+ * the block's keeps list.
+ */
+static struct ir3_instruction *
+emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+ struct ir3_block *b = ctx->block;
+ struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *offset;
+ nir_const_value *const_offset;
+ type_t type = TYPE_U32; /* default; the signed min/max cases override it */
+
+ /* can this be non-const buffer_index? how do we handle that? */
+ const_offset = nir_src_as_const_value(intr->src[0]);
+ compile_assert(ctx, const_offset);
+ ssbo = create_immed(b, const_offset->u32[0]);
+
+ offset = get_src(ctx, &intr->src[1])[0];
+
+ /* src0 is data (or uvec2(data, compare))
+ * src1 is offset
+ * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset)
+ *
+ * Note that nir already multiplies the offset by four, so src1 is
+ * the nir offset shifted back down by two bits and src2 carries
+ * the nir offset unchanged.
+ */
+ src0 = get_src(ctx, &intr->src[2])[0];
+ src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+ src2 = create_collect(ctx, (struct ir3_instruction*[]){
+ offset,
+ create_immed(b, 0),
+ }, 2);
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_ssbo_atomic_add:
+ atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_ssbo_atomic_imin:
+ atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ type = TYPE_S32; /* signed variant */
+ break;
+ case nir_intrinsic_ssbo_atomic_umin:
+ atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_ssbo_atomic_imax:
+ atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ type = TYPE_S32; /* signed variant */
+ break;
+ case nir_intrinsic_ssbo_atomic_umax:
+ atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_ssbo_atomic_and:
+ atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_ssbo_atomic_or:
+ atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_ssbo_atomic_xor:
+ atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_ssbo_atomic_exchange:
+ atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ /* for cmpxchg, src0 is [ui]vec2(data, compare); the collect is
+ * built from nir src[3] followed by nir src[2]:
+ */
+ src0 = create_collect(ctx, (struct ir3_instruction*[]){
+ get_src(ctx, &intr->src[3])[0],
+ src0,
+ }, 2);
+ atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ default:
+ unreachable("boo");
+ }
+
+ atomic->cat6.iim_val = 1;
+ atomic->cat6.d = 4;
+ atomic->cat6.type = type;
+ atomic->barrier_class = IR3_BARRIER_BUFFER_W;
+ atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+
+ /* even if nothing consumes the result, we can't DCE the instruction: */
+ array_insert(b, b->keeps, atomic);
+
+ return atomic;
+}
+
+/* src[] = { offset }. const_index[] = { base }
+ *
+ * Emits one ldl for all components and splits the result into dst[].
+ */
+static void
+emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	struct ir3_block *blk = ctx->block;
+	struct ir3_instruction *addr = get_src(ctx, &intr->src[0])[0];
+	const unsigned base = nir_intrinsic_base(intr);
+	struct ir3_instruction *load;
+
+	load = ir3_LDL(blk, addr, 0, create_immed(blk, intr->num_components), 0);
+	load->regs[0]->wrmask = MASK(intr->num_components);
+	load->cat6.type = utype_dst(intr->dest);
+	/* the constant base goes into the instruction's offset field: */
+	load->cat6.src_offset = base;
+
+	load->barrier_conflict = IR3_BARRIER_SHARED_W;
+	load->barrier_class = IR3_BARRIER_SHARED_R;
+
+	split_dest(blk, dst, load, 0, intr->num_components);
+}
+
+/* src[] = { value, offset }. const_index[] = { base, write_mask }
+ *
+ * Emits one stl per run of consecutive enabled components in the write
+ * mask.  Each store is added to the block's keeps list, since nothing
+ * consumes its result.
+ */
+static void
+emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *stl, *offset;
+	struct ir3_instruction * const *value;
+	unsigned base, wrmask;
+
+	value = get_src(ctx, &intr->src[0]);
+	offset = get_src(ctx, &intr->src[1])[0];
+
+	base = nir_intrinsic_base(intr);
+	wrmask = nir_intrinsic_write_mask(intr);
+
+	/* Combine groups of consecutive enabled channels in one write
+	 * message. We use ffs to find the first enabled channel and then ffs on
+	 * the bit-inverse, down-shifted writemask to determine the length of
+	 * the block of enabled bits.
+	 *
+	 * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic())
+	 */
+	while (wrmask) {
+		unsigned first_component = ffs(wrmask) - 1;
+		unsigned length = ffs(~(wrmask >> first_component)) - 1;
+
+		stl = ir3_STL(b, offset, 0,
+			create_collect(ctx, &value[first_component], length), 0,
+			create_immed(b, length), 0);
+		stl->cat6.dst_offset = first_component + base;
+		stl->cat6.type = utype_src(intr->src[0]);
+		stl->barrier_class = IR3_BARRIER_SHARED_W;
+		stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+
+		array_insert(b, b->keeps, stl);
+
+		/* Clear exactly the run of bits that we just wrote, then try
+		 * again to see if more channels are left.  Masking out only the
+		 * written run (instead of and'ing with a shifted 4-bit constant)
+		 * gives the same result for masks of up to four components and
+		 * does not drop higher enabled channels for wider masks.
+		 */
+		wrmask &= ~(((1u << length) - 1) << first_component);
+	}
+}
+
+/*
+ * CS shared variable atomic intrinsics
+ *
+ * All of the shared variable atomic memory operations read a value from
+ * memory, compute a new value using one of the operations below, write the
+ * new value to memory, and return the original value read.
+ *
+ * All operations take 2 sources except CompSwap that takes 3. These
+ * sources represent:
+ *
+ * 0: The offset into the shared variable storage region that the atomic
+ * operation will operate on.
+ * 1: The data parameter to the atomic function (i.e. the value to add
+ * in shared_atomic_add, etc).
+ * 2: For CompSwap only: the second data parameter.
+ *
+ * Returns the emitted atomic instruction, which has also been added to
+ * the block's keeps list.
+ */
+static struct ir3_instruction *
+emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+ struct ir3_block *b = ctx->block;
+ struct ir3_instruction *atomic, *src0, *src1;
+ type_t type = TYPE_U32; /* default; the signed min/max cases override it */
+
+ src0 = get_src(ctx, &intr->src[0])[0]; /* offset */
+ src1 = get_src(ctx, &intr->src[1])[0]; /* value */
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_shared_atomic_add:
+ atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
+ break;
+ case nir_intrinsic_shared_atomic_imin:
+ atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
+ type = TYPE_S32; /* signed variant */
+ break;
+ case nir_intrinsic_shared_atomic_umin:
+ atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
+ break;
+ case nir_intrinsic_shared_atomic_imax:
+ atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
+ type = TYPE_S32; /* signed variant */
+ break;
+ case nir_intrinsic_shared_atomic_umax:
+ atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
+ break;
+ case nir_intrinsic_shared_atomic_and:
+ atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
+ break;
+ case nir_intrinsic_shared_atomic_or:
+ atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
+ break;
+ case nir_intrinsic_shared_atomic_xor:
+ atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
+ break;
+ case nir_intrinsic_shared_atomic_exchange:
+ atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
+ break;
+ case nir_intrinsic_shared_atomic_comp_swap:
+ /* for cmpxchg, src1 is [ui]vec2(data, compare); the collect is
+ * built from nir src[2] followed by nir src[1]:
+ */
+ src1 = create_collect(ctx, (struct ir3_instruction*[]){
+ get_src(ctx, &intr->src[2])[0],
+ src1,
+ }, 2);
+ atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
+ break;
+ default:
+ unreachable("boo");
+ }
+
+ atomic->cat6.iim_val = 1;
+ atomic->cat6.d = 1;
+ atomic->cat6.type = type;
+ atomic->barrier_class = IR3_BARRIER_SHARED_W;
+ atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+
+ /* even if nothing consumes the result, we can't DCE the instruction: */
+ array_insert(b, b->keeps, atomic);
+
+ return atomic;
+}
+
+/* Images get mapped into SSBO/image state (for store/atomic) and texture
+ * state block (for load).  To simplify things, invert the image id and
+ * map it from end of state block, ie. image 0 becomes num-1, image 1
+ * becomes num-2, etc.  This potentially avoids needing to re-emit texture
+ * state when switching shaders.
+ *
+ * TODO is max # of samplers and SSBOs the same.  This shouldn't be hard-
+ * coded.  Also, since all the gl shader stages (ie. everything but CS)
+ * share the same SSBO/image state block, this might require some more
+ * logic if we supported images in anything other than FS..
+ */
+static unsigned
+get_image_slot(struct ir3_context *ctx, nir_deref_instr *deref)
+{
+	/* TODO figure out real limit per generation, and don't hardcode: */
+	const unsigned max_samplers = 16;
+	unsigned int slot = 0;
+	unsigned stride = 1;
+
+	/* Walk the deref chain up to the variable, flattening the constant
+	 * array indices into a single element offset:
+	 */
+	while (deref->deref_type != nir_deref_type_var) {
+		nir_const_value *const_index;
+		unsigned array_len, elem;
+
+		assert(deref->deref_type == nir_deref_type_array);
+		const_index = nir_src_as_const_value(deref->arr.index);
+		assert(const_index);
+
+		/* Step to the parent, whose type is the array being indexed: */
+		deref = nir_deref_instr_parent(deref);
+		assert(glsl_type_is_array(deref->type));
+
+		array_len = glsl_get_length(deref->type);
+		/* clamp the index to the last element of the array: */
+		elem = MIN2(const_index->u32[0], array_len - 1);
+
+		slot += elem * stride;
+		stride *= array_len;
+	}
+
+	slot += deref->var->data.driver_location;
+
+	return (max_samplers - 1) - slot;
+}
+
+/* Returns the number of coordinate components used to address the image
+ * (counting the array index for array images) and, if flagsp is not NULL,
+ * stores the matching IR3_INSTR_3D / IR3_INSTR_A flags through it.
+ *
+ * see tex_info() for equiv logic for texture instructions.. it would be
+ * nice if this could be better unified..
+ */
+static unsigned
+get_image_coords(const nir_variable *var, unsigned *flagsp)
+{
+	const struct glsl_type *type = glsl_without_array(var->type);
+	unsigned instr_flags = 0;
+	unsigned ncoords;
+
+	switch (glsl_get_sampler_dim(type)) {
+	case GLSL_SAMPLER_DIM_BUF:
+	case GLSL_SAMPLER_DIM_1D:
+		ncoords = 1;
+		break;
+	case GLSL_SAMPLER_DIM_MS:
+	case GLSL_SAMPLER_DIM_EXTERNAL:
+	case GLSL_SAMPLER_DIM_RECT:
+	case GLSL_SAMPLER_DIM_2D:
+		ncoords = 2;
+		break;
+	case GLSL_SAMPLER_DIM_CUBE:
+	case GLSL_SAMPLER_DIM_3D:
+		ncoords = 3;
+		instr_flags |= IR3_INSTR_3D;
+		break;
+	default:
+		unreachable("bad sampler dim");
+		return 0;
+	}
+
+	if (glsl_sampler_type_is_array(type)) {
+		/* note: unlike tex_info(), the array idx counts as a coord: */
+		instr_flags |= IR3_INSTR_A;
+		ncoords += 1;
+	}
+
+	if (flagsp != NULL)
+		*flagsp = instr_flags;
+
+	return ncoords;
+}
+
+/* Map the image's sampler result type (with any array wrapping removed)
+ * to the corresponding 32b ir3 type.
+ */
+static type_t
+get_image_type(const nir_variable *var)
+{
+	const struct glsl_type *elem = glsl_without_array(var->type);
+	type_t result;
+
+	switch (glsl_get_sampler_result_type(elem)) {
+	case GLSL_TYPE_FLOAT:
+		result = TYPE_F32;
+		break;
+	case GLSL_TYPE_INT:
+		result = TYPE_S32;
+		break;
+	case GLSL_TYPE_UINT:
+		result = TYPE_U32;
+		break;
+	default:
+		unreachable("bad sampler type.");
+		return 0;
+	}
+
+	return result;
+}
+
+/* Compute the linear offset of a texel from its coordinates, returned as
+ * a two component collect of (offset, 0).  With byteoff the offset is in
+ * bytes, otherwise it is shifted right by two bits.
+ */
+static struct ir3_instruction *
+get_image_offset(struct ir3_context *ctx, const nir_variable *var,
+		struct ir3_instruction * const *coords, bool byteoff)
+{
+	struct ir3_block *blk = ctx->block;
+	struct ir3_instruction *linear;
+	const unsigned ncoords = get_image_coords(var, NULL);
+
+	/* to calculate the byte offset (yes, uggg) we need (up to) three
+	 * const values to know the bytes per pixel, and y and z stride:
+	 */
+	const unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
+		ctx->so->const_layout.image_dims.off[var->data.driver_location];
+
+	debug_assert(ctx->so->const_layout.image_dims.mask &
+			(1 << var->data.driver_location));
+
+	/* offset = coords.x * bytes_per_pixel: */
+	linear = ir3_MUL_S(blk, coords[0], 0, create_uniform(ctx, cb + 0), 0);
+
+	/* offset += coords.y * y_pitch, then offset += coords.z * z_pitch;
+	 * any coordinate past the third does not contribute:
+	 */
+	for (unsigned axis = 1; (axis < ncoords) && (axis <= 2); axis++) {
+		linear = ir3_MAD_S24(blk, create_uniform(ctx, cb + axis), 0,
+				coords[axis], 0, linear, 0);
+	}
+
+	if (!byteoff) {
+		/* Some cases, like atomics, seem to use dword offset instead
+		 * of byte offsets.. blob just puts an extra shr.b in there
+		 * in those cases:
+		 */
+		linear = ir3_SHR_B(blk, linear, 0, create_immed(blk, 2), 0);
+	}
+
+	return create_collect(ctx, (struct ir3_instruction*[]){
+			linear,
+			create_immed(blk, 0),
+		}, 2);
+}
+
+/* src[] = { deref, coord, sample_index }. const_index[] = {}
+ *
+ * Emits an isam on the image's slot and splits the four result
+ * components into dst[].
+ */
+static void
+emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	struct ir3_block *blk = ctx->block;
+	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+	struct ir3_instruction * const *nir_coords = get_src(ctx, &intr->src[1]);
+	struct ir3_instruction *packed[4];
+	struct ir3_instruction *isam;
+	unsigned flags;
+	unsigned ncoords = get_image_coords(var, &flags);
+	const unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
+	const type_t type = get_image_type(var);
+	unsigned n = 0;
+
+	/* hmm, this seems a bit odd, but it is what blob does and (at least
+	 * a5xx) just faults on bogus addresses otherwise:
+	 */
+	if (flags & IR3_INSTR_3D)
+		flags = (flags & ~IR3_INSTR_3D) | IR3_INSTR_A;
+
+	while (n < ncoords) {
+		packed[n] = nir_coords[n];
+		n++;
+	}
+
+	/* a single coordinate gets padded out with a zero second component: */
+	if (ncoords == 1) {
+		packed[ncoords] = create_immed(blk, 0);
+		ncoords++;
+	}
+
+	isam = ir3_SAM(blk, OPC_ISAM, type, 0xf, flags,
+			tex_idx, tex_idx, create_collect(ctx, packed, ncoords), NULL);
+
+	isam->barrier_conflict = IR3_BARRIER_IMAGE_W;
+	isam->barrier_class = IR3_BARRIER_IMAGE_R;
+
+	split_dest(blk, dst, isam, 0, 4);
+}
+
+/* src[] = { deref, coord, sample_index, value }. const_index[] = {}
+ *
+ * Emits a typed stib of four components and adds it to the block's
+ * keeps list.
+ */
+static void
+emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *blk = ctx->block;
+	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+	struct ir3_instruction * const *texel = get_src(ctx, &intr->src[3]);
+	struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]);
+	const unsigned ncoords = get_image_coords(var, NULL);
+	const unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
+	struct ir3_instruction *store, *byte_off;
+
+	/* stib operands:
+	 *   src0 is value
+	 *   src1 is coords
+	 *   src2 is 64b byte offset
+	 */
+	byte_off = get_image_offset(ctx, var, coords, true);
+
+	/* NOTE: stib seems to take byte offset, but stgb.typed can be used
+	 * too and takes a dword offset.. not quite sure yet why blob uses
+	 * one over the other in various cases.
+	 */
+	store = ir3_STIB(blk, create_immed(blk, tex_idx), 0,
+			create_collect(ctx, texel, 4), 0,
+			create_collect(ctx, coords, ncoords), 0,
+			byte_off, 0);
+	store->cat6.typed = true;
+	store->cat6.type = get_image_type(var);
+	store->cat6.d = ncoords;
+	store->cat6.iim_val = 4;
+	store->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
+	store->barrier_class = IR3_BARRIER_IMAGE_W;
+
+	/* the store has no consumers, so keep it explicitly: */
+	array_insert(blk, blk->keeps, store);
+}
+
+/* Emit an image size query: a getsize at lod 0, followed by the fixups
+ * needed to turn the hardware result into what nir expects in dst[].
+ */
+static void
+emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	struct ir3_block *blk = ctx->block;
+	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+	const unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
+	struct ir3_instruction *getsize;
+	unsigned flags;
+	const unsigned ncoords = get_image_coords(var, &flags);
+	/* Array size actually ends up in .w rather than .z.  This doesn't
+	 * matter for miplevel 0, but for higher mips the value in z is
+	 * minified whereas w stays.  Also, the value in TEX_CONST_3_DEPTH is
+	 * returned, which means that we have to add 1 to it for arrays for
+	 * a3xx.
+	 *
+	 * Note use a temporary dst and then copy, since the size of the dst
+	 * array that is passed in is based on nir's understanding of the
+	 * result size, not the hardware's
+	 */
+	struct ir3_instruction *size[4];
+
+	getsize = ir3_SAM(blk, OPC_GETSIZE, TYPE_U32, 0xf, flags,
+			tex_idx, tex_idx, create_immed(blk, 0), NULL);
+
+	split_dest(blk, size, getsize, 0, 4);
+
+	/* get_size instruction returns size in bytes instead of texels
+	 * for imageBuffer, so we need to divide it by the pixel size
+	 * of the image format.
+	 *
+	 * TODO: This is at least true on a5xx. Check other gens.
+	 */
+	if (glsl_get_sampler_dim(glsl_without_array(var->type)) ==
+			GLSL_SAMPLER_DIM_BUF) {
+		/* Since all the possible values the divisor can take are
+		 * power-of-two (4, 8, or 16), the division is implemented
+		 * as a shift-right.
+		 * During shader setup, the log2 of the image format's
+		 * bytes-per-pixel should have been emitted in 2nd slot of
+		 * image_dims. See ir3_shader::emit_image_dims().
+		 */
+		const unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
+			ctx->so->const_layout.image_dims.off[var->data.driver_location];
+
+		size[0] = ir3_SHR_B(blk, size[0], 0, create_uniform(ctx, cb + 1), 0);
+	}
+
+	for (unsigned c = 0; c < ncoords; c++)
+		dst[c] = size[c];
+
+	/* for arrays the last component is replaced by the value from .w: */
+	if (flags & IR3_INSTR_A) {
+		dst[ncoords-1] = ctx->compiler->levels_add_one ?
+			ir3_ADD_U(blk, size[3], 0, create_immed(blk, 1), 0) :
+			ir3_MOV(blk, size[3], TYPE_U32);
+	}
+}
+
+/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {}
+ *
+ * Emits the typed global atomic matching the image atomic intrinsic and
+ * returns it; the instruction is also added to the block's keeps list.
+ */
+static struct ir3_instruction *
+emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+ struct ir3_block *b = ctx->block;
+ const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+ struct ir3_instruction *atomic, *image, *src0, *src1, *src2;
+ struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]);
+ unsigned ncoords = get_image_coords(var, NULL);
+
+ image = create_immed(b, get_image_slot(ctx, nir_src_as_deref(intr->src[0])));
+
+ /* src0 is value (or uvec2(value, compare))
+ * src1 is coords
+ * src2 is 64b offset; byteoff=false is passed below, so
+ * get_image_offset() shifts the byte offset right by two bits
+ */
+ src0 = get_src(ctx, &intr->src[3])[0];
+ src1 = create_collect(ctx, coords, ncoords);
+ src2 = get_image_offset(ctx, var, coords, false);
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_image_deref_atomic_add:
+ atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_image_deref_atomic_min:
+ atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_image_deref_atomic_max:
+ atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_image_deref_atomic_and:
+ atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_image_deref_atomic_or:
+ atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_image_deref_atomic_xor:
+ atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_image_deref_atomic_exchange:
+ atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_image_deref_atomic_comp_swap:
+ /* for cmpxchg, src0 is [ui]vec2(data, compare); the collect is
+ * built from nir src[4] followed by nir src[3]:
+ */
+ src0 = create_collect(ctx, (struct ir3_instruction*[]){
+ get_src(ctx, &intr->src[4])[0],
+ src0,
+ }, 2);
+ atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ default:
+ unreachable("boo");
+ }
+
+ atomic->cat6.iim_val = 1;
+ atomic->cat6.d = ncoords;
+ atomic->cat6.type = get_image_type(var); /* signedness comes from the image type */
+ atomic->cat6.typed = true;
+ atomic->barrier_class = IR3_BARRIER_IMAGE_W;
+ atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
+
+ /* even if nothing consumes the result, we can't DCE the instruction: */
+ array_insert(b, b->keeps, atomic);
+
+ return atomic;
+}
+
+/* Emit the instruction for a barrier / memory-barrier intrinsic: a bar
+ * for nir_intrinsic_barrier, otherwise a fence whose barrier class and
+ * conflict masks depend on which kinds of memory the intrinsic covers.
+ */
+static void
+emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *blk = ctx->block;
+	struct ir3_instruction *barrier;
+
+	if (intr->intrinsic == nir_intrinsic_barrier) {
+		barrier = ir3_BAR(blk);
+		barrier->cat7.g = true;
+		barrier->cat7.l = true;
+		barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
+		barrier->barrier_class = IR3_BARRIER_EVERYTHING;
+	} else {
+		/* read+write masks per kind of memory: */
+		const unsigned buffer_rw = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+		const unsigned image_rw = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
+		const unsigned shared_rw = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+		unsigned cls, conflict;
+		bool local = false;
+
+		switch (intr->intrinsic) {
+		case nir_intrinsic_memory_barrier:
+			cls = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
+			conflict = image_rw | buffer_rw;
+			break;
+		case nir_intrinsic_memory_barrier_atomic_counter:
+		case nir_intrinsic_memory_barrier_buffer:
+			cls = IR3_BARRIER_BUFFER_W;
+			conflict = buffer_rw;
+			break;
+		case nir_intrinsic_memory_barrier_image:
+			// TODO double check if this should have .g set
+			cls = IR3_BARRIER_IMAGE_W;
+			conflict = image_rw;
+			break;
+		case nir_intrinsic_memory_barrier_shared:
+			local = true;
+			cls = IR3_BARRIER_SHARED_W;
+			conflict = shared_rw;
+			break;
+		case nir_intrinsic_group_memory_barrier:
+			local = true;
+			cls = IR3_BARRIER_SHARED_W |
+				IR3_BARRIER_IMAGE_W |
+				IR3_BARRIER_BUFFER_W;
+			conflict = shared_rw | image_rw | buffer_rw;
+			break;
+		default:
+			unreachable("boo");
+		}
+
+		/* every fence variant sets .g/.r/.w, only the shared-memory
+		 * ones additionally set .l:
+		 */
+		barrier = ir3_FENCE(blk);
+		barrier->cat7.g = true;
+		if (local)
+			barrier->cat7.l = true;
+		barrier->cat7.r = true;
+		barrier->cat7.w = true;
+		barrier->barrier_class = cls;
+		barrier->barrier_conflict = conflict;
+	}
+
+	/* make sure barrier doesn't get DCE'd */
+	array_insert(blk, blk->keeps, barrier);
+}
+
+static void add_sysval_input_compmask(struct ir3_context *ctx,
+		gl_system_value slot, unsigned compmask,
+		struct ir3_instruction *instr)
+{
+	/* Append a flat-interpolated system-value entry to the variant's
+	 * input table and record instr as the ir3 input for its register.
+	 */
+	struct ir3_shader_variant *variant = ctx->so;
+	struct ir3 *ir = ctx->ir;
+	unsigned reg = regid(variant->inputs_count, 0);
+	unsigned entry = variant->inputs_count++;
+
+	variant->inputs[entry].regid = reg;
+	variant->inputs[entry].slot = slot;
+	variant->inputs[entry].compmask = compmask;
+	variant->inputs[entry].sysval = true;
+	variant->inputs[entry].interpolate = INTERP_MODE_FLAT;
+	variant->total_in++;
+
+	/* grow the input count if this register lies past the current end */
+	ir->ninputs = MAX2(ir->ninputs, reg + 1);
+	ir->inputs[reg] = instr;
+}
+
+static void add_sysval_input(struct ir3_context *ctx, gl_system_value slot,
+		struct ir3_instruction *instr)
+{
+	/* single-component convenience wrapper: only bit 0 of the mask set */
+	const unsigned first_comp_only = 0x1;
+
+	add_sysval_input_compmask(ctx, slot, first_comp_only, instr);
+}
+
+static void
+emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{ /* Translate one NIR intrinsic into ir3 instructions.
+ *
+ * When the intrinsic has a destination (info->has_dest) the dst array
+ * is fetched with get_dst() before the switch and committed with
+ * put_dst() after it; otherwise dst stays NULL.  Intrinsics that have
+ * no case in the switch are reported through compile_error().
+ */
+ const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
+ struct ir3_instruction **dst;
+ struct ir3_instruction * const *src;
+ struct ir3_block *b = ctx->block;
+ nir_const_value *const_offset;
+ int idx, comp;
+
+ if (info->has_dest) {
+ unsigned n = nir_intrinsic_dest_components(intr);
+ dst = get_dst(ctx, &intr->dest, n);
+ } else {
+ dst = NULL;
+ }
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_uniform:
+ idx = nir_intrinsic_base(intr);
+ const_offset = nir_src_as_const_value(intr->src[0]);
+ if (const_offset) { /* constant offset: plain uniform reads */
+ idx += const_offset->u32[0];
+ for (int i = 0; i < intr->num_components; i++) {
+ unsigned n = idx * 4 + i;
+ dst[i] = create_uniform(ctx, n);
+ }
+ } else { /* dynamic offset: indirect read through get_addr() */
+ src = get_src(ctx, &intr->src[0]);
+ for (int i = 0; i < intr->num_components; i++) {
+ int n = idx * 4 + i;
+ dst[i] = create_uniform_indirect(ctx, n,
+ get_addr(ctx, src[0], 4));
+ }
+ /* NOTE: if relative addressing is used, we set
+ * constlen in the compiler (to worst-case value)
+ * since we don't know in the assembler what the max
+ * addr reg value can be:
+ */
+ ctx->so->constlen = ctx->s->num_uniforms;
+ }
+ break;
+ case nir_intrinsic_load_ubo:
+ emit_intrinsic_load_ubo(ctx, intr, dst);
+ break;
+ case nir_intrinsic_load_input:
+ idx = nir_intrinsic_base(intr);
+ comp = nir_intrinsic_component(intr);
+ const_offset = nir_src_as_const_value(intr->src[0]);
+ if (const_offset) { /* constant offset: reuse the existing input instrs */
+ idx += const_offset->u32[0];
+ for (int i = 0; i < intr->num_components; i++) {
+ unsigned n = idx * 4 + i + comp;
+ dst[i] = ctx->ir->inputs[n];
+ }
+ } else { /* dynamic offset: collect all inputs, then load indirectly */
+ src = get_src(ctx, &intr->src[0]);
+ struct ir3_instruction *collect =
+ create_collect(ctx, ctx->ir->inputs, ctx->ir->ninputs);
+ struct ir3_instruction *addr = get_addr(ctx, src[0], 4);
+ for (int i = 0; i < intr->num_components; i++) {
+ unsigned n = idx * 4 + i + comp;
+ dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
+ n, addr, collect);
+ }
+ }
+ break;
+ case nir_intrinsic_load_ssbo:
+ emit_intrinsic_load_ssbo(ctx, intr, dst);
+ break;
+ case nir_intrinsic_store_ssbo:
+ emit_intrinsic_store_ssbo(ctx, intr);
+ break;
+ case nir_intrinsic_get_buffer_size:
+ emit_intrinsic_ssbo_size(ctx, intr, dst);
+ break;
+ case nir_intrinsic_ssbo_atomic_add:
+ case nir_intrinsic_ssbo_atomic_imin:
+ case nir_intrinsic_ssbo_atomic_umin:
+ case nir_intrinsic_ssbo_atomic_imax:
+ case nir_intrinsic_ssbo_atomic_umax:
+ case nir_intrinsic_ssbo_atomic_and:
+ case nir_intrinsic_ssbo_atomic_or:
+ case nir_intrinsic_ssbo_atomic_xor:
+ case nir_intrinsic_ssbo_atomic_exchange:
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ dst[0] = emit_intrinsic_atomic_ssbo(ctx, intr);
+ break;
+ case nir_intrinsic_load_shared:
+ emit_intrinsic_load_shared(ctx, intr, dst);
+ break;
+ case nir_intrinsic_store_shared:
+ emit_intrinsic_store_shared(ctx, intr);
+ break;
+ case nir_intrinsic_shared_atomic_add:
+ case nir_intrinsic_shared_atomic_imin:
+ case nir_intrinsic_shared_atomic_umin:
+ case nir_intrinsic_shared_atomic_imax:
+ case nir_intrinsic_shared_atomic_umax:
+ case nir_intrinsic_shared_atomic_and:
+ case nir_intrinsic_shared_atomic_or:
+ case nir_intrinsic_shared_atomic_xor:
+ case nir_intrinsic_shared_atomic_exchange:
+ case nir_intrinsic_shared_atomic_comp_swap:
+ dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
+ break;
+ case nir_intrinsic_image_deref_load:
+ emit_intrinsic_load_image(ctx, intr, dst);
+ break;
+ case nir_intrinsic_image_deref_store:
+ emit_intrinsic_store_image(ctx, intr);
+ break;
+ case nir_intrinsic_image_deref_size:
+ emit_intrinsic_image_size(ctx, intr, dst);
+ break;
+ case nir_intrinsic_image_deref_atomic_add:
+ case nir_intrinsic_image_deref_atomic_min:
+ case nir_intrinsic_image_deref_atomic_max:
+ case nir_intrinsic_image_deref_atomic_and:
+ case nir_intrinsic_image_deref_atomic_or:
+ case nir_intrinsic_image_deref_atomic_xor:
+ case nir_intrinsic_image_deref_atomic_exchange:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
+ dst[0] = emit_intrinsic_atomic_image(ctx, intr);
+ break;
+ case nir_intrinsic_barrier:
+ case nir_intrinsic_memory_barrier:
+ case nir_intrinsic_group_memory_barrier:
+ case nir_intrinsic_memory_barrier_atomic_counter:
+ case nir_intrinsic_memory_barrier_buffer:
+ case nir_intrinsic_memory_barrier_image:
+ case nir_intrinsic_memory_barrier_shared:
+ emit_intrinsic_barrier(ctx, intr);
+ /* note that blk ptr no longer valid, make that obvious: */
+ b = NULL;
+ break;
+ case nir_intrinsic_store_output:
+ idx = nir_intrinsic_base(intr);
+ comp = nir_intrinsic_component(intr);
+ const_offset = nir_src_as_const_value(intr->src[1]);
+ compile_assert(ctx, const_offset != NULL);
+ idx += const_offset->u32[0]; /* NOTE(review): const_offset is
+ * dereferenced right after the compile_assert(); confirm that a
+ * failed compile_assert() cannot fall through to this line. */
+
+ src = get_src(ctx, &intr->src[0]);
+ for (int i = 0; i < intr->num_components; i++) {
+ unsigned n = idx * 4 + i + comp;
+ ctx->ir->outputs[n] = src[i];
+ }
+ break;
+ case nir_intrinsic_load_base_vertex:
+ case nir_intrinsic_load_first_vertex:
+ if (!ctx->basevertex) { /* created on first use, then cached in ctx */
+ ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
+ add_sysval_input(ctx, SYSTEM_VALUE_FIRST_VERTEX, ctx->basevertex);
+ }
+ dst[0] = ctx->basevertex;
+ break;
+ case nir_intrinsic_load_vertex_id_zero_base:
+ case nir_intrinsic_load_vertex_id:
+ if (!ctx->vertex_id) { /* sysval slot is chosen by whichever variant is seen first */
+ gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id) ?
+ SYSTEM_VALUE_VERTEX_ID : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
+ ctx->vertex_id = create_input(ctx, 0);
+ add_sysval_input(ctx, sv, ctx->vertex_id);
+ }
+ dst[0] = ctx->vertex_id;
+ break;
+ case nir_intrinsic_load_instance_id:
+ if (!ctx->instance_id) {
+ ctx->instance_id = create_input(ctx, 0);
+ add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID,
+ ctx->instance_id);
+ }
+ dst[0] = ctx->instance_id;
+ break;
+ case nir_intrinsic_load_sample_id:
+ case nir_intrinsic_load_sample_id_no_per_sample:
+ if (!ctx->samp_id) {
+ ctx->samp_id = create_input(ctx, 0);
+ ctx->samp_id->regs[0]->flags |= IR3_REG_HALF;
+ add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID,
+ ctx->samp_id);
+ }
+ dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32); /* half input widened to 32 bits */
+ break;
+ case nir_intrinsic_load_sample_mask_in:
+ if (!ctx->samp_mask_in) {
+ ctx->samp_mask_in = create_input(ctx, 0);
+ add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN,
+ ctx->samp_mask_in);
+ }
+ dst[0] = ctx->samp_mask_in;
+ break;
+ case nir_intrinsic_load_user_clip_plane:
+ idx = nir_intrinsic_ucp_id(intr);
+ for (int i = 0; i < intr->num_components; i++) {
+ unsigned n = idx * 4 + i;
+ dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
+ }
+ break;
+ case nir_intrinsic_load_front_face:
+ if (!ctx->frag_face) {
+ ctx->so->frag_face = true;
+ ctx->frag_face = create_input(ctx, 0);
+ add_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, ctx->frag_face);
+ ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
+ }
+ /* for fragface, we get -1 for back and 0 for front. However this is
+ * the inverse of what nir expects (where ~0 is true).
+ */
+ dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32);
+ dst[0] = ir3_NOT_B(b, dst[0], 0);
+ break;
+ case nir_intrinsic_load_local_invocation_id:
+ if (!ctx->local_invocation_id) { /* three-component (mask 0x7) input */
+ ctx->local_invocation_id = create_input_compmask(ctx, 0, 0x7);
+ add_sysval_input_compmask(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID,
+ 0x7, ctx->local_invocation_id);
+ }
+ split_dest(b, dst, ctx->local_invocation_id, 0, 3);
+ break;
+ case nir_intrinsic_load_work_group_id:
+ if (!ctx->work_group_id) {
+ ctx->work_group_id = create_input_compmask(ctx, 0, 0x7);
+ add_sysval_input_compmask(ctx, SYSTEM_VALUE_WORK_GROUP_ID,
+ 0x7, ctx->work_group_id);
+ ctx->work_group_id->regs[0]->flags |= IR3_REG_HIGH;
+ }
+ split_dest(b, dst, ctx->work_group_id, 0, 3);
+ break;
+ case nir_intrinsic_load_num_work_groups:
+ for (int i = 0; i < intr->num_components; i++) {
+ dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
+ }
+ break;
+ case nir_intrinsic_load_local_group_size:
+ for (int i = 0; i < intr->num_components; i++) {
+ dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
+ }
+ break;
+ case nir_intrinsic_discard_if:
+ case nir_intrinsic_discard: {
+ struct ir3_instruction *cond, *kill;
+
+ if (intr->intrinsic == nir_intrinsic_discard_if) {
+ /* conditional discard: */
+ src = get_src(ctx, &intr->src[0]);
+ cond = ir3_b2n(b, src[0]);
+ } else {
+ /* unconditional discard: */
+ cond = create_immed(b, 1);
+ }
+
+ /* NOTE: only cmps.*.* can write p0.x: */
+ cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
+ cond->cat2.condition = IR3_COND_NE;
+
+ /* condition always goes in predicate register: */
+ cond->regs[0]->num = regid(REG_P0, 0);
+
+ kill = ir3_KILL(b, cond, 0);
+ array_insert(ctx->ir, ctx->ir->predicates, kill);
+
+ array_insert(b, b->keeps, kill); /* keep the kill even though nothing reads it */
+ ctx->so->has_kill = true;
+
+ break;
+ }
+ default:
+ compile_error(ctx, "Unhandled intrinsic type: %s\n",
+ nir_intrinsic_infos[intr->intrinsic].name);
+ break;
+ }
+
+ if (info->has_dest)
+ put_dst(ctx, &intr->dest);
+}
+
+static void
+emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
+{
+	/* Turn each component of a NIR constant into a typed immediate;
+	 * constants narrower than 32 bits are emitted as TYPE_U16.
+	 */
+	struct ir3_instruction **out = get_dst_ssa(ctx, &instr->def,
+			instr->def.num_components);
+	type_t imm_type;
+
+	if (instr->def.bit_size < 32) {
+		imm_type = TYPE_U16;
+	} else {
+		imm_type = TYPE_U32;
+	}
+
+	for (int c = 0; c < instr->def.num_components; c++) {
+		out[c] = create_immed_typed(ctx->block, instr->value.u32[c],
+				imm_type);
+	}
+}
+
+static void
+emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef)
+{
+	/* The backend has no notion of an undefined value, so every
+	 * component of the undef is replaced by an immediate 0.0.
+	 */
+	struct ir3_instruction **out = get_dst_ssa(ctx, &undef->def,
+			undef->def.num_components);
+	type_t imm_type;
+
+	if (undef->def.bit_size < 32) {
+		imm_type = TYPE_U16;
+	} else {
+		imm_type = TYPE_U32;
+	}
+
+	for (int c = 0; c < undef->def.num_components; c++) {
+		out[c] = create_immed_typed(ctx->block, fui(0.0), imm_type);
+	}
+}
+
+/*
+ * texture fetch/sample instructions:
+ */
+
+static void
+tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
+{
+	/* Derive the coordinate count and the IR3_INSTR_* flags for a
+	 * texture instruction from its sampler dimensionality and its
+	 * shadow/array properties.
+	 *
+	 * tex->coord_components is not used because of txs, and because
+	 * the array index (which goes after the shadow ref) must not be
+	 * counted.
+	 */
+	const bool is_lod_query = (tex->op == nir_texop_lod);
+	unsigned instr_flags = 0;
+	unsigned ncoord;
+
+	switch (tex->sampler_dim) {
+	case GLSL_SAMPLER_DIM_3D:
+	case GLSL_SAMPLER_DIM_CUBE:
+		instr_flags |= IR3_INSTR_3D;
+		ncoord = 3;
+		break;
+	case GLSL_SAMPLER_DIM_2D:
+	case GLSL_SAMPLER_DIM_RECT:
+	case GLSL_SAMPLER_DIM_EXTERNAL:
+	case GLSL_SAMPLER_DIM_MS:
+		ncoord = 2;
+		break;
+	case GLSL_SAMPLER_DIM_1D:
+	case GLSL_SAMPLER_DIM_BUF:
+		ncoord = 1;
+		break;
+	default:
+		unreachable("bad sampler_dim");
+	}
+
+	/* shadow/array flags are not set for lod queries */
+	if (!is_lod_query) {
+		if (tex->is_shadow) {
+			instr_flags |= IR3_INSTR_S;
+		}
+		if (tex->is_array) {
+			instr_flags |= IR3_INSTR_A;
+		}
+	}
+
+	*flagsp = instr_flags;
+	*coordsp = ncoord;
+}
+
+static void
+emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
+{ /* Emit the ir3 sam-family instruction for a NIR texture instruction.
+ *
+ * Steps, in order: gather the NIR sources, pick the opcode from
+ * tex->op, build the two argument vectors (src0: coords and friends,
+ * src1: offsets and lod/bias), collect them, emit the sample
+ * instruction(s) and split the result into the four dst components.
+ * tex ops this function does not handle are reported through
+ * compile_error() and the function returns early.
+ */
+ struct ir3_block *b = ctx->block;
+ struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
+ struct ir3_instruction * const *coord, * const *off, * const *ddx, * const *ddy;
+ struct ir3_instruction *lod, *compare, *proj, *sample_index;
+ bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
+ unsigned i, coords, flags;
+ unsigned nsrc0 = 0, nsrc1 = 0;
+ type_t type;
+ opc_t opc = 0;
+
+ coord = off = ddx = ddy = NULL;
+ lod = proj = compare = sample_index = NULL;
+
+ /* TODO: might just be one component for gathers? */
+ dst = get_dst(ctx, &tex->dest, 4);
+
+ for (unsigned i = 0; i < tex->num_srcs; i++) { /* this i shadows the function-scope i */
+ switch (tex->src[i].src_type) {
+ case nir_tex_src_coord:
+ coord = get_src(ctx, &tex->src[i].src);
+ break;
+ case nir_tex_src_bias: /* bias and lod share the lod variable */
+ lod = get_src(ctx, &tex->src[i].src)[0];
+ has_bias = true;
+ break;
+ case nir_tex_src_lod:
+ lod = get_src(ctx, &tex->src[i].src)[0];
+ has_lod = true;
+ break;
+ case nir_tex_src_comparator: /* shadow comparator */
+ compare = get_src(ctx, &tex->src[i].src)[0];
+ break;
+ case nir_tex_src_projector:
+ proj = get_src(ctx, &tex->src[i].src)[0];
+ has_proj = true;
+ break;
+ case nir_tex_src_offset:
+ off = get_src(ctx, &tex->src[i].src);
+ has_off = true;
+ break;
+ case nir_tex_src_ddx:
+ ddx = get_src(ctx, &tex->src[i].src);
+ break;
+ case nir_tex_src_ddy:
+ ddy = get_src(ctx, &tex->src[i].src);
+ break;
+ case nir_tex_src_ms_index:
+ sample_index = get_src(ctx, &tex->src[i].src)[0];
+ break;
+ default:
+ compile_error(ctx, "Unhandled NIR tex src type: %d\n",
+ tex->src[i].src_type);
+ return;
+ }
+ }
+
+ switch (tex->op) {
+ case nir_texop_tex: opc = has_lod ? OPC_SAML : OPC_SAM; break;
+ case nir_texop_txb: opc = OPC_SAMB; break;
+ case nir_texop_txl: opc = OPC_SAML; break;
+ case nir_texop_txd: opc = OPC_SAMGQ; break;
+ case nir_texop_txf: opc = OPC_ISAML; break;
+ case nir_texop_lod: opc = OPC_GETLOD; break;
+ case nir_texop_tg4:
+ /* NOTE: a4xx might need to emulate gather w/ txf (this is
+ * what blob does, seems gather is broken?), and a3xx did
+ * not support it (but probably could also emulate).
+ */
+ switch (tex->component) { /* no default: opc keeps its initial 0 for other values */
+ case 0: opc = OPC_GATHER4R; break;
+ case 1: opc = OPC_GATHER4G; break;
+ case 2: opc = OPC_GATHER4B; break;
+ case 3: opc = OPC_GATHER4A; break;
+ }
+ break;
+ case nir_texop_txf_ms: opc = OPC_ISAMM; break;
+ case nir_texop_txs:
+ case nir_texop_query_levels:
+ case nir_texop_texture_samples:
+ case nir_texop_samples_identical:
+ case nir_texop_txf_ms_mcs:
+ compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
+ return;
+ }
+
+ tex_info(tex, &flags, &coords);
+
+ /*
+ * lay out the first argument in the proper order:
+ * - actual coordinates first
+ * - shadow reference
+ * - array index
+ * - projection w
+ * - starting at offset 4, dpdx.xy, dpdy.xy
+ *
+ * bias/lod go into the second arg
+ */
+
+ /* insert tex coords: */
+ for (i = 0; i < coords; i++) /* NOTE(review): assumes a nir_tex_src_coord source was seen (coord != NULL) -- confirm */
+ src0[i] = coord[i];
+
+ nsrc0 = i;
+
+ /* NOTE a3xx (and possibly a4xx?) might be different, using isaml
+ * with scaled x coord according to requested sample:
+ */
+ if (tex->op == nir_texop_txf_ms) {
+ if (ctx->compiler->txf_ms_with_isaml) {
+ /* the samples are laid out in x dimension as
+ * 0 1 2 3
+ * x_ms = (x << ms) + sample_index;
+ */
+ struct ir3_instruction *ms;
+ ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3); /* two bits of ctx->samples per texture index */
+
+ src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
+ src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
+
+ opc = OPC_ISAML;
+ } else {
+ src0[nsrc0++] = sample_index;
+ }
+ }
+
+ /* scale up integer coords for TXF based on the LOD */
+ if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
+ assert(has_lod);
+ for (i = 0; i < coords; i++)
+ src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
+ }
+
+ if (coords == 1) {
+ /* hw doesn't do 1d, so we treat it as 2d with
+ * height of 1, and patch up the y coord.
+ * TODO: y coord should be (int)0 in some cases..
+ */
+ src0[nsrc0++] = create_immed(b, fui(0.5));
+ }
+
+ if (tex->is_shadow && tex->op != nir_texop_lod)
+ src0[nsrc0++] = compare;
+
+ if (tex->is_array && tex->op != nir_texop_lod) {
+ struct ir3_instruction *idx = coord[coords]; /* array index is the coord component right after the last real coordinate */
+
+ /* the array coord for cube arrays needs 0.5 added to it */
+ if (ctx->compiler->array_index_add_half && (opc != OPC_ISAML))
+ idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0);
+
+ src0[nsrc0++] = idx;
+ }
+
+ if (has_proj) {
+ src0[nsrc0++] = proj;
+ flags |= IR3_INSTR_P;
+ }
+
+ /* pad to 4, then ddx/ddy: */
+ if (tex->op == nir_texop_txd) {
+ while (nsrc0 < 4)
+ src0[nsrc0++] = create_immed(b, fui(0.0));
+ for (i = 0; i < coords; i++)
+ src0[nsrc0++] = ddx[i];
+ if (coords < 2)
+ src0[nsrc0++] = create_immed(b, fui(0.0));
+ for (i = 0; i < coords; i++)
+ src0[nsrc0++] = ddy[i];
+ if (coords < 2)
+ src0[nsrc0++] = create_immed(b, fui(0.0));
+ }
+
+ /*
+ * second argument (if applicable):
+ * - offsets
+ * - lod
+ * - bias
+ */
+ if (has_off | has_lod | has_bias) {
+ if (has_off) {
+ unsigned off_coords = coords;
+ if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
+ off_coords--;
+ for (i = 0; i < off_coords; i++)
+ src1[nsrc1++] = off[i];
+ if (off_coords < 2)
+ src1[nsrc1++] = create_immed(b, fui(0.0));
+ flags |= IR3_INSTR_O;
+ }
+
+ if (has_lod | has_bias)
+ src1[nsrc1++] = lod;
+ }
+
+ switch (tex->dest_type) {
+ case nir_type_invalid:
+ case nir_type_float:
+ type = TYPE_F32;
+ break;
+ case nir_type_int:
+ type = TYPE_S32;
+ break;
+ case nir_type_uint:
+ case nir_type_bool:
+ type = TYPE_U32;
+ break;
+ default:
+ unreachable("bad dest_type");
+ }
+
+ if (opc == OPC_GETLOD)
+ type = TYPE_U32;
+
+ unsigned tex_idx = tex->texture_index;
+
+ ctx->max_texture_index = MAX2(ctx->max_texture_index, tex_idx);
+
+ struct ir3_instruction *col0 = create_collect(ctx, src0, nsrc0);
+ struct ir3_instruction *col1 = create_collect(ctx, src1, nsrc1);
+
+ sam = ir3_SAM(b, opc, type, 0b1111, flags,
+ tex_idx, tex_idx, col0, col1);
+
+ if ((ctx->astc_srgb & (1 << tex_idx)) && !nir_tex_instr_is_query(tex)) {
+ /* only need first 3 components: */
+ sam->regs[0]->wrmask = 0x7;
+ split_dest(b, dst, sam, 0, 3);
+
+ /* we need to sample the alpha separately with a non-ASTC
+ * texture state:
+ */
+ sam = ir3_SAM(b, opc, type, 0b1000, flags,
+ tex_idx, tex_idx, col0, col1);
+
+ array_insert(ctx->ir, ctx->ir->astc_srgb, sam); /* recorded in ir->astc_srgb alongside the other alpha re-samples */
+
+ /* fixup .w component: */
+ split_dest(b, &dst[3], sam, 3, 1);
+ } else {
+ /* normal (non-workaround) case: */
+ split_dest(b, dst, sam, 0, 4);
+ }
+
+ /* GETLOD returns results in 4.8 fixed point */
+ if (opc == OPC_GETLOD) {
+ struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));
+
+ compile_assert(ctx, tex->dest_type == nir_type_float);
+ for (i = 0; i < 2; i++) { /* only the first two components are rescaled */
+ dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_U32, TYPE_F32), 0,
+ factor, 0);
+ }
+ }
+
+ put_dst(ctx, &tex->dest);
+}
+
+static void
+emit_tex_query_levels(struct ir3_context *ctx, nir_tex_instr *tex)
+{
+	/* Query the number of mip levels of a texture with getinfo. */
+	struct ir3_block *blk = ctx->block;
+	struct ir3_instruction **levels;
+	struct ir3_instruction *getinfo;
+
+	levels = get_dst(ctx, &tex->dest, 1);
+
+	getinfo = ir3_SAM(blk, OPC_GETINFO, TYPE_U32, 0b0100, 0,
+			tex->texture_index, tex->texture_index, NULL, NULL);
+
+	/* The single result component lands in .z rather than .x, so a
+	 * split_dest() is still required to pick it out.
+	 */
+	split_dest(blk, levels, getinfo, 0, 3);
+
+	/* getinfo.z holds the level count.  The value in TEX_CONST_0 is
+	 * zero-based, hence the +1 where the compiler asks for it.
+	 */
+	if (ctx->compiler->levels_add_one) {
+		struct ir3_instruction *one = create_immed(blk, 1);
+
+		levels[0] = ir3_ADD_U(blk, levels[0], 0, one, 0);
+	}
+
+	put_dst(ctx, &tex->dest);
+}
+
+static void
+emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
+{
+	/* Emit a texture size query (getsize) for the given lod. */
+	struct ir3_block *blk = ctx->block;
+	struct ir3_instruction **size;
+	struct ir3_instruction *getsize, *lod_src;
+	unsigned instr_flags, ndims;
+
+	tex_info(tex, &instr_flags, &ndims);
+
+	/* What is wanted here is the number of dimensions rather than
+	 * the number of coordinates; the two only differ for cubes.
+	 */
+	if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
+		ndims = 2;
+	}
+
+	size = get_dst(ctx, &tex->dest, 4);
+
+	compile_assert(ctx, tex->num_srcs == 1);
+	compile_assert(ctx, tex->src[0].src_type == nir_tex_src_lod);
+
+	lod_src = get_src(ctx, &tex->src[0].src)[0];
+
+	getsize = ir3_SAM(blk, OPC_GETSIZE, TYPE_U32, 0b1111, instr_flags,
+			tex->texture_index, tex->texture_index, lod_src, NULL);
+
+	split_dest(blk, size, getsize, 0, 4);
+
+	/* The array size is reported in .w, not .z.  At miplevel 0 the two
+	 * agree, but for higher mips z is minified while w is not.  The
+	 * value returned is the one in TEX_CONST_3_DEPTH, which means 1
+	 * has to be added to it for arrays.
+	 */
+	if (tex->is_array) {
+		if (!ctx->compiler->levels_add_one) {
+			size[ndims] = ir3_MOV(blk, size[3], TYPE_U32);
+		} else {
+			struct ir3_instruction *one = create_immed(blk, 1);
+
+			size[ndims] = ir3_ADD_U(blk, size[3], 0, one, 0);
+		}
+	}
+
+	put_dst(ctx, &tex->dest);
+}
+
+static void
+emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
+{
+	/* break/continue/return emit nothing here: the thinking is that
+	 * the successor block links are enough to work out where
+	 * break/continue need to jump to.
+	 */
+	if (jump->type == nir_jump_break ||
+			jump->type == nir_jump_continue ||
+			jump->type == nir_jump_return) {
+		return;
+	}
+
+	compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
+}
+
+static void
+emit_instr(struct ir3_context *ctx, nir_instr *instr)
+{
+	/* Dispatch a NIR instruction to the emitter for its type. */
+	switch (instr->type) {
+	case nir_instr_type_deref:
+		/* nothing to emit: derefs are consumed by the intrinsic
+		 * that uses them as a source
+		 */
+		break;
+	case nir_instr_type_alu:
+		emit_alu(ctx, nir_instr_as_alu(instr));
+		break;
+	case nir_instr_type_intrinsic:
+		emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
+		break;
+	case nir_instr_type_load_const:
+		emit_load_const(ctx, nir_instr_as_load_const(instr));
+		break;
+	case nir_instr_type_ssa_undef:
+		emit_undef(ctx, nir_instr_as_ssa_undef(instr));
+		break;
+	case nir_instr_type_jump:
+		emit_jump(ctx, nir_instr_as_jump(instr));
+		break;
+	case nir_instr_type_tex: {
+		nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+		/* txs and query_levels have dedicated emitters, every
+		 * other tex op goes through emit_tex():
+		 */
+		if (tex->op == nir_texop_txs) {
+			emit_tex_txs(ctx, tex);
+		} else if (tex->op == nir_texop_query_levels) {
+			emit_tex_query_levels(ctx, tex);
+		} else {
+			emit_tex(ctx, tex);
+		}
+		break;
+	}
+	case nir_instr_type_phi:
+		/* phi webs were already converted to regs in NIR */
+		compile_error(ctx, "Unexpected NIR instruction type: %d\n", instr->type);
+		break;
+	case nir_instr_type_call:
+	case nir_instr_type_parallel_copy:
+		compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
+		break;
+	}
+}
+
+static struct ir3_block *
+get_block(struct ir3_context *ctx, const nir_block *nblock)
+{
+	/* Return the ir3 block for a NIR block, creating it (and,
+	 * recursively, its predecessors) on first request.
+	 */
+	struct hash_entry *found = _mesa_hash_table_search(ctx->block_ht, nblock);
+	struct ir3_block *blk;
+	unsigned npred = 0;
+
+	if (found) {
+		return found->data;
+	}
+
+	blk = ir3_block_create(ctx->ir);
+	blk->nblock = nblock;
+	/* register the block before walking predecessors, so that the
+	 * recursive lookups below can find it
+	 */
+	_mesa_hash_table_insert(ctx->block_ht, nblock, blk);
+
+	blk->predecessors_count = nblock->predecessors->entries;
+	blk->predecessors = ralloc_array_size(blk,
+			sizeof(blk->predecessors[0]), blk->predecessors_count);
+
+	set_foreach(nblock->predecessors, pred_entry) {
+		blk->predecessors[npred++] = get_block(ctx, pred_entry->key);
+	}
+
+	return blk;
+}
+
+static void
+emit_block(struct ir3_context *ctx, nir_block *nblock)
+{
+	/* Emit one NIR block: link its successors, make it the current
+	 * block, append it to the ir block list and emit its
+	 * instructions, stopping at the first compile error.
+	 */
+	struct ir3_block *blk = get_block(ctx, nblock);
+
+	for (int s = 0; s < ARRAY_SIZE(blk->successors); s++) {
+		if (!nblock->successors[s]) {
+			continue;
+		}
+		blk->successors[s] = get_block(ctx, nblock->successors[s]);
+	}
+
+	ctx->block = blk;
+	list_addtail(&blk->node, &ctx->ir->block_list);
+
+	/* drop the per-block addr register tables, so the addr register
+	 * gets re-emitted in this block if it is needed:
+	 */
+	for (int t = 0; t < ARRAY_SIZE(ctx->addr_ht); t++) {
+		_mesa_hash_table_destroy(ctx->addr_ht[t], NULL);
+		ctx->addr_ht[t] = NULL;
+	}
+
+	nir_foreach_instr(ninstr, nblock) {
+		ctx->cur_instr = ninstr;
+		emit_instr(ctx, ninstr);
+		ctx->cur_instr = NULL;
+		if (ctx->error) {
+			return;
+		}
+	}
+}
+
+static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
+
+static void
+emit_if(struct ir3_context *ctx, nir_if *nif)
+{
+	/* Store the if-condition as a predicate on the current block,
+	 * then emit the then-list followed by the else-list.
+	 */
+	struct ir3_instruction *cond_src = get_src(ctx, &nif->condition)[0];
+	struct ir3_instruction *cond_b2n = ir3_b2n(cond_src->block, cond_src);
+
+	ctx->block->condition = get_predicate(ctx, cond_b2n);
+
+	emit_cf_list(ctx, &nif->then_list);
+	emit_cf_list(ctx, &nif->else_list);
+}
+
+static void
+emit_loop(struct ir3_context *ctx, nir_loop *nloop)
+{
+	/* a loop is emitted simply as the cf list of its body */
+	struct exec_list *body = &nloop->body;
+
+	emit_cf_list(ctx, body);
+}
+
+static void
+emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
+{
+	/* Walk a NIR control-flow list in order and emit each node
+	 * according to its type.
+	 */
+	foreach_list_typed(nir_cf_node, cf, node, list) {
+		if (cf->type == nir_cf_node_block) {
+			emit_block(ctx, nir_cf_node_as_block(cf));
+		} else if (cf->type == nir_cf_node_if) {
+			emit_if(ctx, nir_cf_node_as_if(cf));
+		} else if (cf->type == nir_cf_node_loop) {
+			emit_loop(ctx, nir_cf_node_as_loop(cf));
+		} else if (cf->type == nir_cf_node_function) {
+			compile_error(ctx, "TODO\n");
+		}
+	}
+}
+
+/* emit stream-out code. At this point, the current block is the original
+ * (nir) end block, and nir ensures that all flow control paths terminate
+ * into the end block. We re-purpose the original end block to generate
+ * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
+ * block holding stream-out write instructions, followed by the new end
+ * block:
+ *
+ * blockOrigEnd {
+ * p0.x = (vtxcnt < maxvtxcnt)
+ * // succs: blockStreamOut, blockNewEnd
+ * }
+ * blockStreamOut {
+ * ... stream-out instructions ...
+ * // succs: blockNewEnd
+ * }
+ * blockNewEnd {
+ * }
+ *
+ * On return ctx->block is the new end block.  The new blocks are
+ * linked through successors[] only; their predecessors are not filled
+ * in (see the TODO in the body).
+ */
+static void
+emit_stream_out(struct ir3_context *ctx)
+{
+ struct ir3_shader_variant *v = ctx->so;
+ struct ir3 *ir = ctx->ir;
+ struct ir3_stream_output_info *strmout =
+ &ctx->so->shader->stream_output;
+ struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
+ struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
+ struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];
+
+ /* create vtxcnt input in input block at top of shader,
+ * so that it is seen as live over the entire duration
+ * of the shader:
+ */
+ vtxcnt = create_input(ctx, 0);
+ add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, vtxcnt);
+
+ maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
+
+ /* at this point, we are at the original 'end' block,
+ * re-purpose this block to stream-out condition, then
+ * append stream-out block and new-end block
+ */
+ orig_end_block = ctx->block;
+
+// TODO these blocks need to update predecessors..
+// maybe w/ store_global intrinsic, we could do this
+// stuff in nir->nir pass
+
+ stream_out_block = ir3_block_create(ir);
+ list_addtail(&stream_out_block->node, &ir->block_list);
+
+ new_end_block = ir3_block_create(ir);
+ list_addtail(&new_end_block->node, &ir->block_list);
+
+ orig_end_block->successors[0] = stream_out_block; /* taken when the condition holds, per the diagram above */
+ orig_end_block->successors[1] = new_end_block;
+ stream_out_block->successors[0] = new_end_block;
+
+ /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
+ cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0); /* ctx->block is still orig_end_block here */
+ cond->regs[0]->num = regid(REG_P0, 0);
+ cond->cat2.condition = IR3_COND_LT;
+
+ /* condition goes on previous block to the conditional,
+ * since it is used to pick which of the two successor
+ * paths to take:
+ */
+ orig_end_block->condition = cond;
+
+ /* switch to stream_out_block to generate the stream-out
+ * instructions:
+ */
+ ctx->block = stream_out_block;
+
+ /* Calculate base addresses based on vtxcnt. Instructions
+ * generated for bases not used in following loop will be
+ * stripped out in the backend.
+ */
+ for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
+ unsigned stride = strmout->stride[i];
+ struct ir3_instruction *base, *off;
+
+ base = create_uniform(ctx, regid(v->constbase.tfbo, i));
+
+ /* 24-bit should be enough: */
+ off = ir3_MUL_U(ctx->block, vtxcnt, 0,
+ create_immed(ctx->block, stride * 4), 0); /* presumably stride is in dwords, hence * 4 for bytes -- TODO confirm */
+
+ bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
+ }
+
+ /* Generate the per-output store instructions: */
+ for (unsigned i = 0; i < strmout->num_outputs; i++) {
+ for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
+ unsigned c = j + strmout->output[i].start_component;
+ struct ir3_instruction *base, *out, *stg;
+
+ base = bases[strmout->output[i].output_buffer];
+ out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)];
+
+ stg = ir3_STG(ctx->block, base, 0, out, 0,
+ create_immed(ctx->block, 1), 0);
+ stg->cat6.type = TYPE_U32;
+ stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4;
+
+ array_insert(ctx->block, ctx->block->keeps, stg); /* stores have no consumer, so pin them in keeps */
+ }
+ }
+
+ /* and finally switch to the new_end_block: */
+ ctx->block = new_end_block;
+}
+
+static void
+emit_function(struct ir3_context *ctx, nir_function_impl *impl)
+{
+	/* Emit a whole NIR function: its body, its end block, optional
+	 * stream-out code, and the final 'end' instruction.
+	 */
+	nir_metadata_require(impl, nir_metadata_block_index);
+
+	emit_cf_list(ctx, &impl->body);
+	emit_block(ctx, impl->end_block);
+
+	/* The current block is expected to be a single empty block at
+	 * this point; the 'end' instruction is emitted into it.
+	 */
+	compile_assert(ctx, list_empty(&ctx->block->instr_list));
+
+	/* With stream-out (aka transform-feedback) enabled, the
+	 * stream-out instructions are emitted next, followed by a fresh
+	 * empty block that receives the 'end' instruction.
+	 *
+	 * NOTE: this is done after emitting end_block rather than
+	 * before, because NIR guarantees that every block flows into
+	 * end_block and that end_block has no successors.  Re-purposing
+	 * end_block as the first stream-out block therefore guarantees
+	 * that all exit paths reach the stream-out instructions.
+	 */
+	if (ctx->compiler->gpu_id < 500) {
+		if ((ctx->so->shader->stream_output.num_outputs > 0) &&
+				!ctx->so->binning_pass) {
+			debug_assert(ctx->so->type == MESA_SHADER_VERTEX);
+			emit_stream_out(ctx);
+		}
+	}
+
+	ir3_END(ctx->block);
+}
+
+static struct ir3_instruction *
+create_frag_coord(struct ir3_context *ctx, unsigned comp)
+{
+	/* Return the instruction producing component comp of the
+	 * fragment coordinate, creating the shared four-component input
+	 * on first use.
+	 */
+	struct ir3_block *blk = ctx->block;
+	struct ir3_instruction *val;
+
+	if (!ctx->frag_coord) {
+		/* add_sysval_input() is deferred until all inputs exist */
+		ctx->frag_coord = create_input_compmask(ctx, 0, 0xf);
+	}
+
+	split_dest(blk, &val, ctx->frag_coord, comp, 1);
+
+	/* .z, .w (and anything beyond) appear to be usable as-is: */
+	if (comp > 1) {
+		return val;
+	}
+
+	/* .x and .y arrive as unsigned values: subtract (integer) 8,
+	 * divide by 16 (right-shift by 4), then convert to float:
+	 *
+	 *    sub.s tmp, src, 8
+	 *    shr.b tmp, tmp, 4
+	 *    mov.u32f32 dst, tmp
+	 */
+	struct ir3_instruction *eight = create_immed(blk, 8);
+	val = ir3_SUB_S(blk, val, 0, eight, 0);
+
+	struct ir3_instruction *four = create_immed(blk, 4);
+	val = ir3_SHR_B(blk, val, 0, four, 0);
+
+	return ir3_COV(blk, val, TYPE_U32, TYPE_F32);
+}
+
+static void
+setup_input(struct ir3_context *ctx, nir_variable *in)
+{ /* Register one NIR input variable with the shader variant and create
+ * the ir3 input instructions for its components.
+ *
+ * Fills so->inputs[n] (n = in->data.driver_location) and stores one
+ * instruction per component in ctx->ir->inputs[n * 4 + i].  Fragment
+ * and vertex shaders are handled; any other shader type is reported
+ * through compile_error().
+ */
+ struct ir3_shader_variant *so = ctx->so;
+ unsigned ncomp = glsl_get_components(in->type);
+ unsigned n = in->data.driver_location;
+ unsigned slot = in->data.location;
+
+ /* let's pretend things other than vec4 don't exist: */
+ ncomp = MAX2(ncomp, 4); /* rounds anything smaller up to 4 components */
+
+ /* skip unread inputs, we could end up with (for example), unsplit
+ * matrix/etc inputs in the case they are not read, so just silently
+ * skip these.
+ */
+ if (ncomp > 4)
+ return;
+
+ compile_assert(ctx, ncomp == 4);
+
+ so->inputs[n].slot = slot;
+ so->inputs[n].compmask = (1 << ncomp) - 1; /* ncomp is 4 here, so this is 0xf */
+ so->inputs_count = MAX2(so->inputs_count, n + 1);
+ so->inputs[n].interpolate = in->data.interpolation;
+
+ if (ctx->so->type == MESA_SHADER_FRAGMENT) {
+ for (int i = 0; i < ncomp; i++) {
+ struct ir3_instruction *instr = NULL;
+ unsigned idx = (n * 4) + i;
+
+ if (slot == VARYING_SLOT_POS) { /* position comes from frag_coord, not a varying fetch */
+ so->inputs[n].bary = false;
+ so->frag_coord = true;
+ instr = create_frag_coord(ctx, i);
+ } else if (slot == VARYING_SLOT_PNTC) {
+ /* see for example st_nir_fixup_varying_slots().. this is
+ * maybe a bit mesa/st specific. But we need things to line
+ * up for this in fdN_program:
+ * unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0);
+ * if (emit->sprite_coord_enable & texmask) {
+ * ...
+ * }
+ */
+ so->inputs[n].slot = VARYING_SLOT_VAR8;
+ so->inputs[n].bary = true;
+ instr = create_frag_input(ctx, false);
+ } else {
+ bool use_ldlv = false;
+
+ /* detect the special case for front/back colors where
+ * we need to do flat vs smooth shading depending on
+ * rast state:
+ */
+ if (in->data.interpolation == INTERP_MODE_NONE) {
+ switch (slot) {
+ case VARYING_SLOT_COL0:
+ case VARYING_SLOT_COL1:
+ case VARYING_SLOT_BFC0:
+ case VARYING_SLOT_BFC1:
+ so->inputs[n].rasterflat = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (ctx->compiler->flat_bypass) { /* use_ldlv is only ever set when flat_bypass is enabled */
+ if ((so->inputs[n].interpolate == INTERP_MODE_FLAT) ||
+ (so->inputs[n].rasterflat && ctx->so->key.rasterflat))
+ use_ldlv = true;
+ }
+
+ so->inputs[n].bary = true;
+
+ instr = create_frag_input(ctx, use_ldlv);
+ }
+
+ compile_assert(ctx, idx < ctx->ir->ninputs); /* NOTE(review): the store
+ * below is not guarded; confirm a failed compile_assert() cannot
+ * fall through to it. */
+
+ ctx->ir->inputs[idx] = instr;
+ }
+ } else if (ctx->so->type == MESA_SHADER_VERTEX) {
+ for (int i = 0; i < ncomp; i++) {
+ unsigned idx = (n * 4) + i;
+ compile_assert(ctx, idx < ctx->ir->ninputs);
+ ctx->ir->inputs[idx] = create_input(ctx, idx);
+ }
+ } else {
+ compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
+ }
+
+ if (so->inputs[n].bary || (ctx->so->type == MESA_SHADER_VERTEX)) { /* frag_coord inputs (bary == false) are not counted */
+ so->total_in += ncomp;
+ }
+}
+
+static void
+setup_output(struct ir3_context *ctx, nir_variable *out)
+{ /* Record one NIR output variable in the variant's outputs[] table and seed its four IR outputs. */
+ struct ir3_shader_variant *so = ctx->so;
+ unsigned ncomp = glsl_get_components(out->type);
+ unsigned n = out->data.driver_location; /* index into so->outputs[] */
+ unsigned slot = out->data.location;
+ unsigned comp = 0; /* component used for the initial regid; only depth changes it */
+
+ /* let's pretend things other than vec4 don't exist: */
+ ncomp = MAX2(ncomp, 4);
+ compile_assert(ctx, ncomp == 4);
+
+ if (ctx->so->type == MESA_SHADER_FRAGMENT) {
+ switch (slot) {
+ case FRAG_RESULT_DEPTH:
+ comp = 2; /* tgsi will write to .z component */
+ so->writes_pos = true;
+ break;
+ case FRAG_RESULT_COLOR:
+ so->color0_mrt = 1;
+ break;
+ default:
+ if (slot >= FRAG_RESULT_DATA0) /* per-render-target data outputs need no extra setup */
+ break;
+ compile_error(ctx, "unknown FS output name: %s\n",
+ gl_frag_result_name(slot));
+ }
+ } else if (ctx->so->type == MESA_SHADER_VERTEX) {
+ switch (slot) {
+ case VARYING_SLOT_POS:
+ so->writes_pos = true;
+ break;
+ case VARYING_SLOT_PSIZ:
+ so->writes_psize = true;
+ break;
+ case VARYING_SLOT_COL0:
+ case VARYING_SLOT_COL1:
+ case VARYING_SLOT_BFC0:
+ case VARYING_SLOT_BFC1:
+ case VARYING_SLOT_FOGC:
+ case VARYING_SLOT_CLIP_DIST0:
+ case VARYING_SLOT_CLIP_DIST1:
+ case VARYING_SLOT_CLIP_VERTEX:
+ break; /* accepted, no variant flag to set */
+ default:
+ if (slot >= VARYING_SLOT_VAR0)
+ break;
+ if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
+ break;
+ compile_error(ctx, "unknown VS output name: %s\n",
+ gl_varying_slot_name(slot));
+ }
+ } else {
+ compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
+ }
+
+ compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
+
+ so->outputs[n].slot = slot;
+ so->outputs[n].regid = regid(n, comp); /* overwritten in ir3_compile_shader_nir() after ir3_ra() */
+ so->outputs_count = MAX2(so->outputs_count, n + 1);
+
+ for (int i = 0; i < ncomp; i++) {
+ unsigned idx = (n * 4) + i;
+ compile_assert(ctx, idx < ctx->ir->noutputs);
+ ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0)); /* every output component starts as immediate 0.0 */
+ }
+}
+
+static int
+max_drvloc(struct exec_list *vars)
+{ /* Return the largest data.driver_location among 'vars', or -1 if the list is empty. */
+ int drvloc = -1;
+ nir_foreach_variable(var, vars) {
+ drvloc = MAX2(drvloc, (int)var->data.driver_location);
+ }
+ return drvloc;
+}
+
+static const unsigned max_sysvals[] = { /* extra IR input slots reserved for sysvals, indexed by shader type */
+ [MESA_SHADER_FRAGMENT] = 24, // TODO
+ [MESA_SHADER_VERTEX] = 16,
+ [MESA_SHADER_COMPUTE] = 16, // TODO how many do we actually need?
+};
+
+static void
+emit_instructions(struct ir3_context *ctx)
+{ /* Create ctx->ir, set up inputs/outputs/arrays, then emit the body of the shader entrypoint. */
+ unsigned ninputs, noutputs;
+ nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
+
+ ninputs = (max_drvloc(&ctx->s->inputs) + 1) * 4; /* four scalars per driver location */
+ noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4;
+
+ /* we need to leave room for sysvals:
+ */
+ ninputs += max_sysvals[ctx->so->type];
+
+ ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
+
+ /* Create inputs in first block: */
+ ctx->block = get_block(ctx, nir_start_block(fxn));
+ ctx->in_block = ctx->block;
+ list_addtail(&ctx->block->node, &ctx->ir->block_list);
+
+ ninputs -= max_sysvals[ctx->so->type]; /* NOTE(review): ninputs is not read again in this function */
+
+ /* for fragment shader, the vcoord input register is used as the
+ * base for bary.f varying fetch instrs:
+ */
+ struct ir3_instruction *vcoord = NULL;
+ if (ctx->so->type == MESA_SHADER_FRAGMENT) {
+ struct ir3_instruction *xy[2];
+
+ vcoord = create_input_compmask(ctx, 0, 0x3); /* two components (mask 0x3) */
+ split_dest(ctx->block, xy, vcoord, 0, 2);
+
+ ctx->frag_vcoord = create_collect(ctx, xy, 2);
+ }
+
+ /* Setup inputs: */
+ nir_foreach_variable(var, &ctx->s->inputs) {
+ setup_input(ctx, var);
+ }
+
+ /* Defer add_sysval_input() stuff until after setup_inputs(),
+ * because sysvals need to be appended after varyings:
+ */
+ if (vcoord) {
+ add_sysval_input_compmask(ctx, SYSTEM_VALUE_VARYING_COORD,
+ 0x3, vcoord);
+ }
+
+ if (ctx->frag_coord) { /* only set if create_frag_coord() ran during input setup */
+ add_sysval_input_compmask(ctx, SYSTEM_VALUE_FRAG_COORD,
+ 0xf, ctx->frag_coord);
+ }
+
+ /* Setup outputs: */
+ nir_foreach_variable(var, &ctx->s->outputs) {
+ setup_output(ctx, var);
+ }
+
+ /* Setup registers (which should only be arrays): */
+ nir_foreach_register(reg, &ctx->s->registers) {
+ declare_array(ctx, reg);
+ }
+
+ /* NOTE: need to do something more clever when we support >1 fxn */
+ nir_foreach_register(reg, &fxn->registers) {
+ declare_array(ctx, reg);
+ }
+ /* And emit the body: */
+ ctx->impl = fxn;
+ emit_function(ctx, fxn);
+}
+
+/* from NIR perspective, we actually have varying inputs. But the varying
+ * inputs, from an IR standpoint, are just bary.f/ldlv instructions. The
+ * only actual inputs are the sysvals.
+ */
+static void
+fixup_frag_inputs(struct ir3_context *ctx)
+{ /* Advance ir->inputs past the leading non-sysval entries and shrink ir->ninputs to match. */
+ struct ir3_shader_variant *so = ctx->so;
+ struct ir3 *ir = ctx->ir;
+ unsigned i = 0;
+
+ /* sysvals should appear at the end of the inputs, drop everything else: */
+ while ((i < so->inputs_count) && !so->inputs[i].sysval)
+ i++;
+
+ /* at IR level, inputs are always blocks of 4 scalars: */
+ i *= 4;
+
+ ir->inputs = &ir->inputs[i]; /* the caller saves the original pointer before calling this */
+ ir->ninputs -= i;
+}
+
+/* Fixup tex sampler state for astc/srgb workaround instructions. We
+ * need to assign the tex state indexes for these after we know the
+ * max tex index.
+ */
+static void
+fixup_astc_srgb(struct ir3_context *ctx)
+{
+ struct ir3_shader_variant *so = ctx->so;
+ /* indexed by original tex idx, value is newly assigned alpha sampler
+ * state tex idx. Zero is invalid since there is at least one sampler
+ * if we get here.
+ */
+ unsigned alt_tex_state[16] = {0};
+ unsigned tex_idx = ctx->max_texture_index + 1; /* first index past the ones already in use */
+ unsigned idx = 0; /* next free entry in so->astc_srgb.orig_idx[] */
+
+ so->astc_srgb.base = tex_idx;
+
+ for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
+ struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
+
+ compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
+
+ if (alt_tex_state[sam->cat5.tex] == 0) {
+ /* assign new alternate/alpha tex state slot: */
+ alt_tex_state[sam->cat5.tex] = tex_idx++;
+ so->astc_srgb.orig_idx[idx++] = sam->cat5.tex; /* NOTE(review): no bounds check on orig_idx[] is visible here */
+ so->astc_srgb.count++;
+ }
+
+ sam->cat5.tex = alt_tex_state[sam->cat5.tex]; /* retarget the instruction to the alternate slot */
+ }
+}
+
+static void
+fixup_binning_pass(struct ir3_context *ctx)
+{ /* Compact outputs in place so that only position and point-size outputs remain. */
+ struct ir3_shader_variant *so = ctx->so;
+ struct ir3 *ir = ctx->ir;
+ unsigned i, j; /* i scans all outputs, j is the next kept slot */
+
+ for (i = 0, j = 0; i < so->outputs_count; i++) {
+ unsigned slot = so->outputs[i].slot;
+
+ /* throw away everything but first position/psize */
+ if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) {
+ if (i != j) { /* move the entry and its four IR outputs down to slot j */
+ so->outputs[j] = so->outputs[i];
+ ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0];
+ ir->outputs[(j*4)+1] = ir->outputs[(i*4)+1];
+ ir->outputs[(j*4)+2] = ir->outputs[(i*4)+2];
+ ir->outputs[(j*4)+3] = ir->outputs[(i*4)+3];
+ }
+ j++;
+ }
+ }
+ so->outputs_count = j;
+ ir->noutputs = j * 4;
+}
+
+int
+ir3_compile_shader_nir(struct ir3_compiler *compiler,
+ struct ir3_shader_variant *so)
+{ /* Compile variant 'so' into so->ir. Returns 0 on success, nonzero on failure (so->ir is then NULL). */
+ struct ir3_context *ctx;
+ struct ir3 *ir;
+ struct ir3_instruction **inputs;
+ unsigned i, actual_in, inloc;
+ int ret = 0, max_bary; /* max_bary is filled in by ir3_legalize() */
+
+ assert(!so->ir);
+
+ ctx = compile_init(compiler, so);
+ if (!ctx) {
+ DBG("INIT failed!");
+ ret = -1;
+ goto out;
+ }
+
+ emit_instructions(ctx);
+
+ if (ctx->error) {
+ DBG("EMIT failed!");
+ ret = -1;
+ goto out;
+ }
+
+ ir = so->ir = ctx->ir;
+
+ /* keep track of the inputs from TGSI perspective.. */
+ inputs = ir->inputs; /* saved because fixup_frag_inputs() moves ir->inputs */
+
+ /* but fixup actual inputs for frag shader: */
+ if (so->type == MESA_SHADER_FRAGMENT)
+ fixup_frag_inputs(ctx);
+
+ /* at this point, for binning pass, throw away unneeded outputs: */
+ if (so->binning_pass && (ctx->compiler->gpu_id < 600))
+ fixup_binning_pass(ctx);
+
+ /* if we want half-precision outputs, mark the output registers
+ * as half:
+ */
+ if (so->key.half_precision) {
+ for (i = 0; i < ir->noutputs; i++) {
+ struct ir3_instruction *out = ir->outputs[i];
+
+ if (!out)
+ continue;
+
+ /* if frag shader writes z, that needs to be full precision: */
+ if (so->outputs[i/4].slot == FRAG_RESULT_DEPTH)
+ continue;
+
+ out->regs[0]->flags |= IR3_REG_HALF;
+ /* output could be a fanout (ie. texture fetch output)
+ * in which case we need to propagate the half-reg flag
+ * up to the definer so that RA sees it:
+ */
+ if (out->opc == OPC_META_FO) {
+ out = out->regs[1]->instr;
+ out->regs[0]->flags |= IR3_REG_HALF;
+ }
+
+ if (out->opc == OPC_MOV) {
+ out->cat1.dst_type = half_type(out->cat1.dst_type);
+ }
+ }
+ }
+
+ if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+ printf("BEFORE CP:\n");
+ ir3_print(ir);
+ }
+
+ ir3_cp(ir, so);
+
+ /* at this point, for binning pass, throw away unneeded outputs:
+ * Note that for a6xx and later, we do this after ir3_cp to ensure
+ * that the uniform/constant layout for BS and VS matches, so that
+ * we can re-use same VS_CONST state group.
+ */
+ if (so->binning_pass && (ctx->compiler->gpu_id >= 600))
+ fixup_binning_pass(ctx);
+
+ /* Insert mov if there's same instruction for each output.
+ * eg. dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_expression.vertex.sampler2dshadow
+ */
+ for (int i = ir->noutputs - 1; i >= 0; i--) { /* this signed 'i' shadows the unsigned 'i' declared above */
+ if (!ir->outputs[i])
+ continue;
+ for (unsigned j = 0; j < i; j++) {
+ if (ir->outputs[i] == ir->outputs[j]) {
+ ir->outputs[i] =
+ ir3_MOV(ir->outputs[i]->block, ir->outputs[i], TYPE_F32);
+ }
+ }
+ }
+
+ if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+ printf("BEFORE GROUPING:\n");
+ ir3_print(ir);
+ }
+
+ ir3_sched_add_deps(ir);
+
+ /* Group left/right neighbors, inserting mov's where needed to
+ * solve conflicts:
+ */
+ ir3_group(ir);
+
+ if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+ printf("AFTER GROUPING:\n");
+ ir3_print(ir);
+ }
+
+ ir3_depth(ir);
+
+ if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+ printf("AFTER DEPTH:\n");
+ ir3_print(ir);
+ }
+
+ ret = ir3_sched(ir);
+ if (ret) {
+ DBG("SCHED failed!");
+ goto out;
+ }
+
+ if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+ printf("AFTER SCHED:\n");
+ ir3_print(ir);
+ }
+
+ ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face);
+ if (ret) {
+ DBG("RA failed!");
+ goto out;
+ }
+
+ if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+ printf("AFTER RA:\n");
+ ir3_print(ir);
+ }
+
+ /* fixup input/outputs: */
+ for (i = 0; i < so->outputs_count; i++) {
+ so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num; /* register number of the output's first component */
+ }
+
+ /* Note that some or all channels of an input may be unused: */
+ actual_in = 0;
+ inloc = 0;
+ for (i = 0; i < so->inputs_count; i++) {
+ unsigned j, reg = regid(63,0), compmask = 0, maxcomp = 0; /* reg stays regid(63,0) if no component is used */
+ so->inputs[i].ncomp = 0;
+ so->inputs[i].inloc = inloc;
+ for (j = 0; j < 4; j++) {
+ struct ir3_instruction *in = inputs[(i*4) + j]; /* uses the pointer saved before fixup_frag_inputs() */
+ if (in && !(in->flags & IR3_INSTR_UNUSED)) {
+ compmask |= (1 << j);
+ reg = in->regs[0]->num - j; /* base register, i.e. where component 0 would be */
+ actual_in++;
+ so->inputs[i].ncomp++;
+ if ((so->type == MESA_SHADER_FRAGMENT) && so->inputs[i].bary) {
+ /* assign inloc: */
+ assert(in->regs[1]->flags & IR3_REG_IMMED);
+ in->regs[1]->iim_val = inloc + j;
+ maxcomp = j + 1;
+ }
+ }
+ }
+ if ((so->type == MESA_SHADER_FRAGMENT) && compmask && so->inputs[i].bary) {
+ so->varying_in++;
+ so->inputs[i].compmask = (1 << maxcomp) - 1; /* contiguous mask up to the highest used component */
+ inloc += maxcomp;
+ } else if (!so->inputs[i].sysval) {
+ so->inputs[i].compmask = compmask;
+ }
+ so->inputs[i].regid = reg;
+ }
+
+ if (ctx->astc_srgb)
+ fixup_astc_srgb(ctx);
+
+ /* We need to do legalize after (for frag shader's) the "bary.f"
+ * offsets (inloc) have been assigned.
+ */
+ ir3_legalize(ir, &so->num_samp, &so->has_ssbo, &max_bary);
+
+ if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+ printf("AFTER LEGALIZE:\n");
+ ir3_print(ir);
+ }
+
+ /* Note that actual_in counts inputs that are not bary.f'd for FS: */
+ if (so->type == MESA_SHADER_VERTEX)
+ so->total_in = actual_in;
+ else
+ so->total_in = max_bary + 1;
+
+out: /* shared exit: on failure the IR is destroyed; the context is freed either way */
+ if (ret) {
+ if (so->ir)
+ ir3_destroy(so->ir);
+ so->ir = NULL;
+ }
+ compile_free(ctx); /* also reached with ctx == NULL when compile_init() failed */
+
+ return ret;
+}
diff --git a/src/freedreno/ir3/ir3_cp.c b/src/freedreno/ir3/ir3_cp.c
new file mode 100644
index 00000000000..e8e8cc311e3
--- /dev/null
+++ b/src/freedreno/ir3/ir3_cp.c
@@ -0,0 +1,653 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include <math.h>
+
+#include "ir3.h"
+#include "ir3_shader.h"
+
+/*
+ * Copy Propagate:
+ */
+
+struct ir3_cp_ctx { /* state shared across one ir3_cp() run */
+ struct ir3 *shader; /* IR being processed; used when cloning registers */
+ struct ir3_shader_variant *so; /* variant whose immediates table lower_immed() fills */
+ unsigned immediate_idx; /* number of scalar immediates stored so far */
+};
+
+/* is it a type preserving mov, with ok flags? */
+static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
+{ /* allow_flags: when false, a src carrying abs/neg/not modifiers makes the mov ineligible */
+ if (is_same_type_mov(instr)) {
+ struct ir3_register *dst = instr->regs[0];
+ struct ir3_register *src = instr->regs[1];
+ struct ir3_instruction *src_instr = ssa(src);
+
+ /* only if mov src is SSA (not const/immed): */
+ if (!src_instr)
+ return false;
+
+ /* no indirect: */
+ if (dst->flags & IR3_REG_RELATIV)
+ return false;
+ if (src->flags & IR3_REG_RELATIV)
+ return false;
+
+ if (src->flags & IR3_REG_ARRAY) /* array sources are never eligible */
+ return false;
+
+ if (!allow_flags)
+ if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG |
+ IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
+ return false;
+
+ /* TODO: remove this hack: */
+ if (src_instr->opc == OPC_META_FO)
+ return false;
+
+ return true;
+ }
+ return false;
+}
+
+static unsigned cp_flags(unsigned flags)
+{ /* Mask 'flags' down to the register flags that copy-propagation looks at. */
+ /* only considering these flags (at least for now): */
+ flags &= (IR3_REG_CONST | IR3_REG_IMMED |
+ IR3_REG_FNEG | IR3_REG_FABS |
+ IR3_REG_SNEG | IR3_REG_SABS |
+ IR3_REG_BNOT | IR3_REG_RELATIV);
+ return flags;
+}
+
+static bool valid_flags(struct ir3_instruction *instr, unsigned n,
+ unsigned flags)
+{ /* Return true if src 'n' of 'instr' (i.e. regs[n + 1]) may carry 'flags', checked per opcode category. */
+ unsigned valid_flags;
+ flags = cp_flags(flags); /* other flag bits are ignored */
+
+ /* If destination is indirect, then source cannot be.. at least
+ * I don't think so..
+ */
+ if ((instr->regs[0]->flags & IR3_REG_RELATIV) &&
+ (flags & IR3_REG_RELATIV))
+ return false;
+
+ /* TODO it seems to *mostly* work to cp RELATIV, except we get some
+ * intermittent piglit variable-indexing fails. Newer blob driver
+ * doesn't seem to cp these. Possibly this is hw workaround? Not
+ * sure, but until that is understood better, lets just switch off
+ * cp for indirect src's:
+ */
+ if (flags & IR3_REG_RELATIV)
+ return false;
+
+ switch (opc_cat(instr->opc)) {
+ case 1:
+ valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV;
+ if (flags & ~valid_flags)
+ return false;
+ break;
+ case 2:
+ valid_flags = ir3_cat2_absneg(instr->opc) |
+ IR3_REG_CONST | IR3_REG_RELATIV;
+
+ if (ir3_cat2_int(instr->opc))
+ valid_flags |= IR3_REG_IMMED;
+
+ if (flags & ~valid_flags)
+ return false;
+
+ if (flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
+ unsigned m = (n ^ 1) + 1; /* regs[] index of the other src of the pair (src0 <-> src1) */
+ /* cannot deal w/ const in both srcs:
+ * (note that some cat2 actually only have a single src)
+ */
+ if (m < instr->regs_count) {
+ struct ir3_register *reg = instr->regs[m];
+ if ((flags & IR3_REG_CONST) && (reg->flags & IR3_REG_CONST))
+ return false;
+ if ((flags & IR3_REG_IMMED) && (reg->flags & IR3_REG_IMMED))
+ return false;
+ }
+ /* cannot be const + ABS|NEG: */
+ if (flags & (IR3_REG_FABS | IR3_REG_FNEG |
+ IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
+ return false;
+ }
+ break;
+ case 3:
+ valid_flags = ir3_cat3_absneg(instr->opc) |
+ IR3_REG_CONST | IR3_REG_RELATIV;
+
+ if (flags & ~valid_flags)
+ return false;
+
+ if (flags & (IR3_REG_CONST | IR3_REG_RELATIV)) {
+ /* cannot deal w/ const/relativ in 2nd src: */
+ if (n == 1)
+ return false;
+ }
+
+ if (flags & IR3_REG_CONST) {
+ /* cannot be const + ABS|NEG: */
+ if (flags & (IR3_REG_FABS | IR3_REG_FNEG |
+ IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
+ return false;
+ }
+ break;
+ case 4:
+ /* seems like blob compiler avoids const as src.. */
+ /* TODO double check if this is still the case on a4xx */
+ if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
+ return false;
+ if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
+ return false;
+ break;
+ case 5:
+ /* no flags allowed */
+ if (flags)
+ return false;
+ break;
+ case 6:
+ valid_flags = IR3_REG_IMMED;
+ if (flags & ~valid_flags)
+ return false;
+
+ if (flags & IR3_REG_IMMED) {
+ /* doesn't seem like we can have immediate src for store
+ * instructions:
+ *
+ * TODO this restriction could also apply to load instructions,
+ * but for load instructions this arg is the address (and not
+ * really sure any good way to test a hard-coded immed addr src)
+ */
+ if (is_store(instr) && (n == 1))
+ return false;
+
+ if ((instr->opc == OPC_LDL) && (n != 1)) /* ldl: immediate accepted only in src 1 */
+ return false;
+
+ if ((instr->opc == OPC_STL) && (n != 2)) /* stl: immediate accepted only in src 2 */
+ return false;
+
+ /* disallow CP into anything but the SSBO slot argument for
+ * atomics:
+ */
+ if (is_atomic(instr->opc) && (n != 0))
+ return false;
+
+ if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G)) /* atomics without the G flag take no immediate */
+ return false;
+ }
+
+ break;
+ }
+
+ return true; /* categories not listed above accept any flags that reach here */
+}
+
+/* propagate register flags from src to dst.. negates need special
+ * handling to cancel each other out.
+ */
+static void combine_flags(unsigned *dstflags, struct ir3_instruction *src)
+{ /* 'src' is the mov being folded; its regs[1] flags are merged into *dstflags in place */
+ unsigned srcflags = src->regs[1]->flags;
+
+ /* if what we are combining into already has (abs) flags,
+ * we can drop (neg) from src:
+ */
+ if (*dstflags & IR3_REG_FABS)
+ srcflags &= ~IR3_REG_FNEG;
+ if (*dstflags & IR3_REG_SABS)
+ srcflags &= ~IR3_REG_SNEG;
+
+ if (srcflags & IR3_REG_FABS)
+ *dstflags |= IR3_REG_FABS;
+ if (srcflags & IR3_REG_SABS)
+ *dstflags |= IR3_REG_SABS;
+ if (srcflags & IR3_REG_FNEG)
+ *dstflags ^= IR3_REG_FNEG; /* xor, so that two negates cancel */
+ if (srcflags & IR3_REG_SNEG)
+ *dstflags ^= IR3_REG_SNEG;
+ if (srcflags & IR3_REG_BNOT)
+ *dstflags ^= IR3_REG_BNOT;
+
+ *dstflags &= ~IR3_REG_SSA; /* SSA-ness is taken from the mov's src, not kept from dst */
+ *dstflags |= srcflags & IR3_REG_SSA;
+ *dstflags |= srcflags & IR3_REG_CONST;
+ *dstflags |= srcflags & IR3_REG_IMMED;
+ *dstflags |= srcflags & IR3_REG_RELATIV;
+ *dstflags |= srcflags & IR3_REG_ARRAY;
+
+ /* if src of the src is boolean we can drop the (abs) since we know
+ * the source value is already a positive integer. This cleans
+ * up the absnegs that get inserted when converting between nir and
+ * native boolean (see ir3_b2n/n2b)
+ */
+ struct ir3_instruction *srcsrc = ssa(src->regs[1]);
+ if (srcsrc && is_bool(srcsrc))
+ *dstflags &= ~IR3_REG_SABS;
+}
+
+static struct ir3_register *
+lower_immed(struct ir3_cp_ctx *ctx, struct ir3_register *reg, unsigned new_flags)
+{ /* Return a clone of 'reg' rewritten as a CONST that refers to a slot in the variant's immediates table. */
+ unsigned swiz, idx, i; /* i: scalar slot; idx = i / 4 selects the entry, swiz = i % 4 its val[] element */
+
+ reg = ir3_reg_clone(ctx->shader, reg); /* the caller's register is left untouched */
+
+ /* in some cases, there are restrictions on (abs)/(neg) plus const..
+ * so just evaluate those and clear the flags:
+ */
+ if (new_flags & IR3_REG_SABS) {
+ reg->iim_val = abs(reg->iim_val);
+ new_flags &= ~IR3_REG_SABS;
+ }
+
+ if (new_flags & IR3_REG_FABS) {
+ reg->fim_val = fabs(reg->fim_val);
+ new_flags &= ~IR3_REG_FABS;
+ }
+
+ if (new_flags & IR3_REG_SNEG) {
+ reg->iim_val = -reg->iim_val;
+ new_flags &= ~IR3_REG_SNEG;
+ }
+
+ if (new_flags & IR3_REG_FNEG) {
+ reg->fim_val = -reg->fim_val;
+ new_flags &= ~IR3_REG_FNEG;
+ }
+
+ /* Reallocate for 4 more elements whenever it's necessary */
+ if (ctx->immediate_idx == ctx->so->immediates_size * 4) { /* table full: grows even if the value turns out to exist already */
+ ctx->so->immediates_size += 4;
+ ctx->so->immediates = realloc (ctx->so->immediates, /* NOTE(review): result unchecked; a NULL return loses the old table */
+ ctx->so->immediates_size * sizeof (ctx->so->immediates[0]));
+ }
+
+ for (i = 0; i < ctx->immediate_idx; i++) { /* look for an existing slot holding the same bit pattern */
+ swiz = i % 4;
+ idx = i / 4;
+
+ if (ctx->so->immediates[idx].val[swiz] == reg->uim_val) {
+ break;
+ }
+ }
+
+ if (i == ctx->immediate_idx) {
+ /* need to generate a new immediate: */
+ swiz = i % 4;
+ idx = i / 4;
+ ctx->so->immediates[idx].val[swiz] = reg->uim_val;
+ ctx->so->immediates_count = idx + 1;
+ ctx->immediate_idx++;
+ }
+
+ new_flags &= ~IR3_REG_IMMED; /* the register now refers to a const slot, not an immediate */
+ new_flags |= IR3_REG_CONST;
+ reg->flags = new_flags;
+ reg->num = i + (4 * ctx->so->constbase.immediate); /* slot i, offset by 4 * constbase.immediate */
+
+ return reg;
+}
+
+static void
+unuse(struct ir3_instruction *instr)
+{ /* Drop one use of 'instr'; when the count reaches zero, clear its barrier state. */
+ debug_assert(instr->use_count > 0);
+
+ if (--instr->use_count == 0) {
+ struct ir3_block *block = instr->block;
+
+ instr->barrier_class = 0;
+ instr->barrier_conflict = 0;
+
+ /* we don't want to remove anything in keeps (which could
+ * be things like array store's)
+ */
+ for (unsigned i = 0; i < block->keeps_count; i++) { /* check only: has no effect unless debug_assert is active */
+ debug_assert(block->keeps[i] != instr);
+ }
+ }
+}
+
+/**
+ * Handle cp for a given src register. This additionally handles
+ * the cases of collapsing immediate/const (which replace the src
+ * register with a non-ssa src) or collapsing mov's from relative
+ * src (which needs to also fixup the address src reference by the
+ * instruction).
+ */
+static void
+reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
+ struct ir3_register *reg, unsigned n)
+{ /* 'reg' is src 'n' of 'instr', i.e. instr->regs[n + 1] */
+ struct ir3_instruction *src = ssa(reg); /* instr_cp() only calls this with an SSA src */
+
+ if (is_eligible_mov(src, true)) {
+ /* simple case, no immed/const/relativ, only mov's w/ ssa src: */
+ struct ir3_register *src_reg = src->regs[1];
+ unsigned new_flags = reg->flags;
+
+ combine_flags(&new_flags, src);
+
+ if (valid_flags(instr, n, new_flags)) {
+ if (new_flags & IR3_REG_ARRAY) {
+ debug_assert(!(reg->flags & IR3_REG_ARRAY));
+ reg->array = src_reg->array;
+ }
+ reg->flags = new_flags;
+ reg->instr = ssa(src_reg); /* skip the mov: point directly at its source */
+
+ instr->barrier_class |= src->barrier_class;
+ instr->barrier_conflict |= src->barrier_conflict;
+
+ unuse(src); /* move one use from the mov to its source */
+ reg->instr->use_count++;
+ }
+
+ } else if (is_same_type_mov(src) &&
+ /* cannot collapse const/immed/etc into meta instrs: */
+ !is_meta(instr)) {
+ /* immed/const/etc cases, which require some special handling: */
+ struct ir3_register *src_reg = src->regs[1];
+ unsigned new_flags = reg->flags;
+
+ combine_flags(&new_flags, src);
+
+ if (!valid_flags(instr, n, new_flags)) {
+ /* See if lowering an immediate to const would help. */
+ if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
+ debug_assert(new_flags & IR3_REG_IMMED);
+ instr->regs[n + 1] = lower_immed(ctx, src_reg, new_flags);
+ return;
+ }
+
+ /* special case for "normal" mad instructions, we can
+ * try swapping the first two args if that fits better.
+ *
+ * the "plain" MAD's (ie. the ones that don't shift first
+ * src prior to multiply) can swap their first two srcs if
+ * src[0] is !CONST and src[1] is CONST:
+ */
+ if ((n == 1) && is_mad(instr->opc) &&
+ !(instr->regs[0 + 1]->flags & (IR3_REG_CONST | IR3_REG_RELATIV)) &&
+ valid_flags(instr, 0, new_flags & ~IR3_REG_IMMED)) {
+ /* swap src[0] and src[1]: */
+ struct ir3_register *tmp;
+ tmp = instr->regs[0 + 1];
+ instr->regs[0 + 1] = instr->regs[1 + 1];
+ instr->regs[1 + 1] = tmp;
+
+ n = 0; /* the register being collapsed now sits in src 0 */
+ } else {
+ return;
+ }
+ }
+
+ /* Here we handle the special case of mov from
+ * CONST and/or RELATIV. These need to be handled
+ * specially, because in the case of move from CONST
+ * there is no src ir3_instruction so we need to
+ * replace the ir3_register. And in the case of
+ * RELATIV we need to handle the address register
+ * dependency.
+ */
+ if (src_reg->flags & IR3_REG_CONST) {
+ /* an instruction cannot reference two different
+ * address registers:
+ */
+ if ((src_reg->flags & IR3_REG_RELATIV) &&
+ conflicts(instr->address, reg->instr->address))
+ return;
+
+ /* This seems to be a hw bug, or something where the timings
+ * just somehow don't work out. This restriction may only
+ * apply if the first src is also CONST.
+ */
+ if ((opc_cat(instr->opc) == 3) && (n == 2) &&
+ (src_reg->flags & IR3_REG_RELATIV) &&
+ (src_reg->array.offset == 0))
+ return;
+
+ src_reg = ir3_reg_clone(instr->block->shader, src_reg); /* clone so the mov's own register is not modified */
+ src_reg->flags = new_flags;
+ instr->regs[n+1] = src_reg;
+
+ if (src_reg->flags & IR3_REG_RELATIV)
+ ir3_instr_set_address(instr, reg->instr->address);
+
+ return;
+ }
+
+ if ((src_reg->flags & IR3_REG_RELATIV) &&
+ !conflicts(instr->address, reg->instr->address)) {
+ src_reg = ir3_reg_clone(instr->block->shader, src_reg);
+ src_reg->flags = new_flags;
+ instr->regs[n+1] = src_reg;
+ ir3_instr_set_address(instr, reg->instr->address);
+
+ return;
+ }
+
+ /* NOTE: seems we can only do immed integers, so don't
+ * need to care about float. But we do need to handle
+ * abs/neg *before* checking that the immediate requires
+ * few enough bits to encode:
+ *
+ * TODO: do we need to do something to avoid accidentally
+ * catching a float immed?
+ */
+ if (src_reg->flags & IR3_REG_IMMED) {
+ int32_t iim_val = src_reg->iim_val;
+
+ debug_assert((opc_cat(instr->opc) == 1) ||
+ (opc_cat(instr->opc) == 6) ||
+ ir3_cat2_int(instr->opc) ||
+ (is_mad(instr->opc) && (n == 0)));
+
+ if (new_flags & IR3_REG_SABS)
+ iim_val = abs(iim_val);
+
+ if (new_flags & IR3_REG_SNEG)
+ iim_val = -iim_val;
+
+ if (new_flags & IR3_REG_BNOT)
+ iim_val = ~iim_val;
+
+ /* other than category 1 (mov) we can only encode up to 10 bits: */
+ if ((instr->opc == OPC_MOV) ||
+ !((iim_val & ~0x3ff) && (-iim_val & ~0x3ff))) {
+ new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT); /* modifiers were already folded into iim_val */
+ src_reg = ir3_reg_clone(instr->block->shader, src_reg);
+ src_reg->flags = new_flags;
+ src_reg->iim_val = iim_val;
+ instr->regs[n+1] = src_reg;
+ } else if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
+ /* See if lowering an immediate to const would help. */
+ instr->regs[n+1] = lower_immed(ctx, src_reg, new_flags);
+ }
+
+ return;
+ }
+ }
+}
+
+/* Handle special case of eliminating output mov, and similar cases where
+ * there isn't a normal "consuming" instruction. In this case we cannot
+ * collapse flags (ie. output mov from const, or w/ abs/neg flags, cannot
+ * be eliminated)
+ */
+static struct ir3_instruction *
+eliminate_output_mov(struct ir3_instruction *instr)
+{ /* Returns the mov's source instruction if the mov can be dropped, otherwise 'instr' itself. */
+ if (is_eligible_mov(instr, false)) {
+ struct ir3_register *reg = instr->regs[1];
+ if (!(reg->flags & IR3_REG_ARRAY)) {
+ struct ir3_instruction *src_instr = ssa(reg);
+ debug_assert(src_instr); /* is_eligible_mov() already rejected non-SSA sources */
+ return src_instr;
+ }
+ }
+ return instr;
+}
+
+/**
+ * Find instruction src's which are mov's that can be collapsed, replacing
+ * the mov dst with the mov src
+ */
+static void
+instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr)
+{
+ struct ir3_register *reg;
+
+ if (instr->regs_count == 0)
+ return;
+
+ if (ir3_instr_check_mark(instr)) /* presumably true if already visited; ir3_cp() clears marks first */
+ return;
+
+ /* walk down the graph from each src: */
+ foreach_src_n(reg, n, instr) {
+ struct ir3_instruction *src = ssa(reg);
+
+ if (!src)
+ continue;
+
+ instr_cp(ctx, src); /* recurse into the producer before considering this src */
+
+ /* TODO non-indirect access we could figure out which register
+ * we actually want and allow cp..
+ */
+ if (reg->flags & IR3_REG_ARRAY)
+ continue;
+
+ /* Don't CP absneg into meta instructions, that won't end well: */
+ if (is_meta(instr) && (src->opc != OPC_MOV))
+ continue;
+
+ reg_cp(ctx, instr, reg, n);
+ }
+
+ if (instr->regs[0]->flags & IR3_REG_ARRAY) { /* an array dst may carry an SSA instruction of its own */
+ struct ir3_instruction *src = ssa(instr->regs[0]);
+ if (src)
+ instr_cp(ctx, src);
+ }
+
+ if (instr->address) {
+ instr_cp(ctx, instr->address);
+ ir3_instr_set_address(instr, eliminate_output_mov(instr->address));
+ }
+
+ /* we can end up with extra cmps.s from frontend, which uses a
+ *
+ * cmps.s p0.x, cond, 0
+ *
+ * as a way to mov into the predicate register. But frequently 'cond'
+ * is itself a cmps.s/cmps.f/cmps.u. So detect this special case and
+ * just re-write the instruction writing predicate register to get rid
+ * of the double cmps.
+ */
+ if ((instr->opc == OPC_CMPS_S) &&
+ (instr->regs[0]->num == regid(REG_P0, 0)) &&
+ ssa(instr->regs[1]) &&
+ (instr->regs[2]->flags & IR3_REG_IMMED) &&
+ (instr->regs[2]->iim_val == 0)) {
+ struct ir3_instruction *cond = ssa(instr->regs[1]);
+ switch (cond->opc) {
+ case OPC_CMPS_S:
+ case OPC_CMPS_F:
+ case OPC_CMPS_U:
+ instr->opc = cond->opc; /* take over cond's opcode, flags, cat2 state, address and both srcs */
+ instr->flags = cond->flags;
+ instr->cat2 = cond->cat2;
+ instr->address = cond->address;
+ instr->regs[1] = cond->regs[1];
+ instr->regs[2] = cond->regs[2];
+ instr->barrier_class |= cond->barrier_class;
+ instr->barrier_conflict |= cond->barrier_conflict;
+ unuse(cond);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+void
+ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so)
+{ /* Run copy propagation over 'ir', starting from outputs, block conditions and keeps. */
+ struct ir3_cp_ctx ctx = {
+ .shader = ir,
+ .so = so,
+ };
+
+ /* This is a bit annoying, and probably wouldn't be necessary if we
+ * tracked a reverse link from producing instruction to consumer.
+ * But we need to know when we've eliminated the last consumer of
+ * a mov, so we need to do a pass to first count consumers of a
+ * mov.
+ */
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ struct ir3_instruction *src;
+
+ /* by the way, we don't account for false-dep's, so the CP
+ * pass should always happen before false-dep's are inserted
+ */
+ debug_assert(instr->deps_count == 0);
+
+ foreach_ssa_src(src, instr) {
+ src->use_count++;
+ }
+ }
+ }
+
+ ir3_clear_mark(ir); /* instr_cp() relies on the mark to process each instruction once */
+
+ for (unsigned i = 0; i < ir->noutputs; i++) {
+ if (ir->outputs[i]) {
+ instr_cp(&ctx, ir->outputs[i]);
+ ir->outputs[i] = eliminate_output_mov(ir->outputs[i]);
+ }
+ }
+
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ if (block->condition) {
+ instr_cp(&ctx, block->condition);
+ block->condition = eliminate_output_mov(block->condition);
+ }
+
+ for (unsigned i = 0; i < block->keeps_count; i++) {
+ instr_cp(&ctx, block->keeps[i]);
+ block->keeps[i] = eliminate_output_mov(block->keeps[i]);
+ }
+ }
+}
diff --git a/src/freedreno/ir3/ir3_depth.c b/src/freedreno/ir3/ir3_depth.c
new file mode 100644
index 00000000000..73bf5e19926
--- /dev/null
+++ b/src/freedreno/ir3/ir3_depth.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+/*
+ * Instruction Depth:
+ *
+ * Calculates the weighted instruction depth, i.e. the number of needed
+ * instructions plus delay slots back to the original input (i.e. INPUT
+ * or CONST). That is to say, an instruction's depth is:
+ *
+ * depth(instr) {
+ * d = 0;
+ * // for each src register:
+ * foreach (src in instr->regs[1..n])
+ * d = max(d, delayslots(src->instr, n) + depth(src->instr));
+ * return d + 1;
+ * }
+ *
+ * After an instruction's depth is calculated, it is inserted into the
+ * block's depth-sorted list, which is used by the scheduling pass.
+ */
+
+/* Decide whether src #n of 'consumer' can be left out of the delay
+ * calculation.
+ *
+ * Only false dependencies are ever ignored, since those can just be
+ * something like a barrier or an SSBO store.  The one false dependency
+ * that still counts is an array write ('assigner') followed by a
+ * consumer that has a src reading the same array.
+ *
+ * Returns true if the dependency should be ignored.
+ */
+static bool
+ignore_dep(struct ir3_instruction *assigner,
+		struct ir3_instruction *consumer, unsigned n)
+{
+	struct ir3_register *written;
+	struct ir3_register *rd;
+
+	/* real dependencies always count: */
+	if (!__is_false_dep(consumer, n))
+		return false;
+
+	/* false dep on something that is not an array write: */
+	if (!(assigner->barrier_class & IR3_BARRIER_ARRAY_W))
+		return true;
+
+	written = assigner->regs[0];
+	debug_assert(written->flags & IR3_REG_ARRAY);
+
+	foreach_src(rd, consumer) {
+		if ((rd->flags & IR3_REG_ARRAY) &&
+				(written->array.id == rd->array.id))
+			return false;
+	}
+
+	return true;
+}
+
+/* Number of delay slots required between 'assigner', which produces a
+ * value, and 'consumer', which reads it as its n'th register.
+ *
+ * Worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
+ * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch are
+ * handled with sync bits.
+ */
+int ir3_delayslots(struct ir3_instruction *assigner,
+		struct ir3_instruction *consumer, unsigned n)
+{
+	if (ignore_dep(assigner, consumer, n))
+		return 0;
+
+	if (is_meta(assigner))
+		return 0;
+
+	if (writes_addr(assigner))
+		return 6;
+
+	/* these are synchronized via sync flags rather than delay slots: */
+	if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
+		return 0;
+
+	/* from here on the assigner must be alu: */
+	if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
+			is_mem(consumer))
+		return 6;
+
+	/* special case, 3rd src to cat3 not required on first cycle: */
+	if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 3))
+		return 1;
+
+	return 3;
+}
+
+/* Move 'instr' to a position in 'list' chosen by its depth: it is
+ * linked next to the first entry whose depth is greater than its own,
+ * or appended at the tail if there is no such entry.
+ */
+void
+ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list)
+{
+	/* unlink from wherever it currently sits: */
+	list_delinit(&instr->node);
+
+	list_for_each_entry (struct ir3_instruction, cur, list, node) {
+		if (cur->depth <= instr->depth)
+			continue;
+
+		/* NOTE(review): assuming list_add() links the item right
+		 * after its second argument, instr lands *behind* the first
+		 * deeper entry -- confirm this is the intended ordering.
+		 */
+		list_add(&instr->node, &cur->node);
+		return;
+	}
+
+	/* nothing deeper found, so it goes last: */
+	list_addtail(&instr->node, list);
+}
+
+/* Recursively compute instr->depth from the depths of its SSA sources,
+ * then re-insert instr into its block's list via ir3_insert_by_depth().
+ *
+ * boost:    extra amount added to every source's contribution.
+ * falsedep: true when instr was reached through a false dependency; in
+ *           that case IR3_INSTR_UNUSED is left untouched.
+ *
+ * The UNUSED flag is cleared before the mark check, so an instruction
+ * that is reached again through a real dependency still gets flagged
+ * as used.  The rest is skipped when ir3_instr_check_mark() returns
+ * true (presumably: instr was already visited).
+ */
+static void
+ir3_instr_depth(struct ir3_instruction *instr, unsigned boost, bool falsedep)
+{
+ struct ir3_instruction *src;
+
+ /* don't mark falsedep's as used, but otherwise process them normally: */
+ if (!falsedep)
+ instr->flags &= ~IR3_INSTR_UNUSED;
+
+ if (ir3_instr_check_mark(instr))
+ return;
+
+ instr->depth = 0;
+
+ foreach_ssa_src_n(src, i, instr) {
+ unsigned sd;
+
+ /* visit child to compute its depth: */
+ ir3_instr_depth(src, boost, __is_false_dep(instr, i));
+
+ /* for array writes, no need to delay on previous write: */
+ if (i == 0)
+ continue;
+
+ sd = ir3_delayslots(src, instr, i) + src->depth;
+ sd += boost;
+
+ instr->depth = MAX2(instr->depth, sd);
+ }
+
+ /* meta instructions do not add a level of their own: */
+ if (!is_meta(instr))
+ instr->depth++;
+
+ ir3_insert_by_depth(instr, &instr->block->instr_list);
+}
+
+/* Unlink every instruction in 'block' that still carries
+ * IR3_INSTR_UNUSED, with the exception of OPC_END which is always
+ * kept.  Returns true if anything was unlinked.
+ */
+static bool
+remove_unused_by_block(struct ir3_block *block)
+{
+	bool removed = false;
+
+	list_for_each_entry_safe (struct ir3_instruction, cur, &block->instr_list, node) {
+		if ((cur->opc != OPC_END) && (cur->flags & IR3_INSTR_UNUSED)) {
+			list_delinit(&cur->node);
+			removed = true;
+		}
+	}
+
+	return removed;
+}
+
+/* One round of depth calculation plus removal of unused instructions.
+ *
+ * Every instruction is first flagged IR3_INSTR_UNUSED.  Walking from
+ * the roots (outputs, keeps, block conditions) with ir3_instr_depth()
+ * clears the flag again on what it reaches.  Whatever is still flagged
+ * afterwards is handed to remove_unused_by_block(), and entries of
+ * ir->indirects[] / ir->inputs[] that point at flagged instructions
+ * are reset to NULL.
+ *
+ * Returns true if at least one instruction was removed.
+ */
+static bool
+compute_depth_and_remove_unused(struct ir3 *ir)
+{
+	bool removed_any = false;
+	unsigned n;
+
+	ir3_clear_mark(ir);
+
+	/* start out with "nothing is used"; the walk below un-flags
+	 * whatever it visits:
+	 */
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node) {
+		list_for_each_entry (struct ir3_instruction, cur, &blk->instr_list, node) {
+			cur->flags |= IR3_INSTR_UNUSED;
+		}
+	}
+
+	for (n = 0; n < ir->noutputs; n++) {
+		if (!ir->outputs[n])
+			continue;
+		ir3_instr_depth(ir->outputs[n], 0, false);
+	}
+
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node) {
+		for (n = 0; n < blk->keeps_count; n++)
+			ir3_instr_depth(blk->keeps[n], 0, false);
+
+		/* the if-condition is a root as well (visited with boost 6): */
+		if (blk->condition)
+			ir3_instr_depth(blk->condition, 6, false);
+	}
+
+	/* drop whatever was never reached: */
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node) {
+		if (remove_unused_by_block(blk))
+			removed_any = true;
+	}
+
+	/* note that we can end up with unused indirects, but we should
+	 * not end up with unused predicates.
+	 */
+	for (n = 0; n < ir->indirects_count; n++) {
+		struct ir3_instruction *ind = ir->indirects[n];
+
+		if (!ind)
+			continue;
+		if (ind->flags & IR3_INSTR_UNUSED)
+			ir->indirects[n] = NULL;
+	}
+
+	/* same cleanup for inputs that turned out to be unused: */
+	for (n = 0; n < ir->ninputs; n++) {
+		struct ir3_instruction *input = ir->inputs[n];
+
+		if (!input)
+			continue;
+		if (input->flags & IR3_INSTR_UNUSED)
+			ir->inputs[n] = NULL;
+	}
+
+	return removed_any;
+}
+
+/* Entry point of the depth pass: keeps re-running
+ * compute_depth_and_remove_unused() until a round reports that it
+ * removed nothing.
+ */
+void
+ir3_depth(struct ir3 *ir)
+{
+ bool progress;
+ do {
+ progress = compute_depth_and_remove_unused(ir);
+ } while (progress);
+}
diff --git a/src/freedreno/ir3/ir3_group.c b/src/freedreno/ir3/ir3_group.c
new file mode 100644
index 00000000000..570055973e8
--- /dev/null
+++ b/src/freedreno/ir3/ir3_group.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include "ir3.h"
+
+/*
+ * Find/group instruction neighbors:
+ */
+
+/* bleh.. we need to do the same group_n() thing for both inputs/outputs
+ * (where we have a simple instr[] array), and fanin nodes (where we have
+ * an extra indirection via reg->instr).
+ */
+struct group_ops {
+ /* return the instruction occupying slot 'idx' of 'arr' (may be NULL): */
+ struct ir3_instruction *(*get)(void *arr, int idx);
+ /* make slot 'idx' of 'arr' refer to a new instruction related to
+  * 'instr' by a mov, so that the slot can be grouped freely:
+  */
+ void (*insert_mov)(void *arr, int idx, struct ir3_instruction *instr);
+};
+
+/* group_ops::get for a plain array of instruction pointers. */
+static struct ir3_instruction *arr_get(void *arr, int idx)
+{
+ return ((struct ir3_instruction **)arr)[idx];
+}
+/* group_ops::insert_mov for the outputs array: slot 'idx' is pointed
+ * at a fresh f32 mov of 'instr', created in instr's block.
+ */
+static void arr_insert_mov_out(void *arr, int idx, struct ir3_instruction *instr)
+{
+	struct ir3_instruction **slots = arr;
+	struct ir3_instruction *mov = ir3_MOV(instr->block, instr, TYPE_F32);
+
+	slots[idx] = mov;
+}
+/* group_ops::insert_mov for the inputs array.
+ *
+ * A mov cannot be placed in front of a meta:in, and downstream
+ * instructions already point at 'instr'.  So 'instr' itself is turned
+ * into an f32 mov whose source is a newly created meta:in, and the new
+ * meta:in takes over slot 'idx'.
+ */
+static void arr_insert_mov_in(void *arr, int idx, struct ir3_instruction *instr)
+{
+	struct ir3_instruction **slots = arr;
+	struct ir3_instruction *new_in;
+	struct ir3_register *mov_src;
+
+	debug_assert(instr->regs_count == 1);
+
+	/* the replacement meta:in, with the same dst register number: */
+	new_in = ir3_instr_create(instr->block, OPC_META_INPUT);
+	new_in->inout.block = instr->block;
+	ir3_reg_create(new_in, instr->regs[0]->num, 0);
+
+	/* give the old meta:in a src and re-label it as a mov: */
+	mov_src = ir3_reg_create(instr, 0, IR3_REG_SSA);
+	mov_src->instr = new_in;
+	instr->opc = OPC_MOV;
+	instr->cat1.src_type = TYPE_F32;
+	instr->cat1.dst_type = TYPE_F32;
+
+	slots[idx] = new_in;
+}
+/* ops for plain instruction arrays; the two tables differ only in how a
+ * mov gets inserted (outputs vs. inputs):
+ */
+static struct group_ops arr_ops_out = { arr_get, arr_insert_mov_out };
+static struct group_ops arr_ops_in = { arr_get, arr_insert_mov_in };
+
+/* group_ops::get where 'arr' is an instruction: slot 'idx' is the SSA
+ * instruction behind regs[idx+1] (regs[0] is skipped).
+ */
+static struct ir3_instruction *instr_get(void *arr, int idx)
+{
+ return ssa(((struct ir3_instruction *)arr)->regs[idx+1]);
+}
+/* group_ops::insert_mov where 'arr' is an instruction: regs[idx+1] is
+ * re-pointed at a fresh f32 mov of 'instr', created in instr's block.
+ */
+static void
+instr_insert_mov(void *arr, int idx, struct ir3_instruction *instr)
+{
+	struct ir3_instruction *parent = arr;
+	struct ir3_instruction *mov = ir3_MOV(instr->block, instr, TYPE_F32);
+
+	parent->regs[idx+1]->instr = mov;
+}
+/* ops used when the group members are the srcs of an instruction: */
+static struct group_ops instr_ops = { instr_get, instr_insert_mov };
+
+/* Returns true if 'cur' is 'instr' itself, or if 'cur' shows up in
+ * instr's neighbor chain (walked from ir3_neighbor_first() along
+ * cp.right) at any position other than 'pos'.  A NULL 'instr' yields
+ * false.
+ */
+static bool
+in_neighbor_list(struct ir3_instruction *instr, struct ir3_instruction *cur, int pos)
+{
+	struct ir3_instruction *walk;
+	int idx;
+
+	if (!instr)
+		return false;
+
+	if (instr == cur)
+		return true;
+
+	idx = 0;
+	for (walk = ir3_neighbor_first(instr); walk; walk = walk->cp.right, idx++) {
+		if ((idx != pos) && (walk == cur))
+			return true;
+	}
+
+	return false;
+}
+
+/* Link the n slots of 'arr' (accessed through 'ops') into a left/right
+ * neighbor chain via instr->cp.left / instr->cp.right.
+ *
+ * Pass one inserts a mov (ops->insert_mov) for a slot that cannot be
+ * grouped as-is and then starts over from slot 0; pass two sets the
+ * neighbor pointers and bumps the left_cnt/right_cnt counters.  Empty
+ * (NULL) slots are skipped in both passes.
+ */
+static void
+group_n(struct group_ops *ops, void *arr, unsigned n)
+{
+ unsigned i, j;
+
+ /* first pass, figure out what has conflicts and needs a mov
+ * inserted. Do this up front, before starting to setup
+ * left/right neighbor pointers. Trying to do it in a single
+ * pass could result in a situation where we can't even setup
+ * the mov's right neighbor ptr if the next instr also needs
+ * a mov.
+ */
+restart:
+ for (i = 0; i < n; i++) {
+ struct ir3_instruction *instr = ops->get(arr, i);
+ if (instr) {
+ struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL;
+ struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL;
+ bool conflict;
+
+ /* check for left/right neighbor conflicts: */
+ conflict = conflicts(instr->cp.left, left) ||
+ conflicts(instr->cp.right, right);
+
+ /* Mixing array elements and higher register classes
+ * (ie. groups) doesn't really work out in RA. See:
+ *
+ * https://trello.com/c/DqeDkeVf/156-bug-with-stk-70frag
+ */
+ if (instr->regs[0]->flags & IR3_REG_ARRAY)
+ conflict = true;
+
+ /* we also can't have an instr twice in the group: */
+ for (j = i + 1; (j < n) && !conflict; j++)
+ if (in_neighbor_list(ops->get(arr, j), instr, i))
+ conflict = true;
+
+ if (conflict) {
+ ops->insert_mov(arr, i, instr);
+ /* inserting the mov may have caused a conflict
+ * against the previous:
+ */
+ goto restart;
+ }
+ }
+ }
+
+ /* second pass, now that we've inserted mov's, fixup left/right
+ * neighbors. This is guaranteed to succeed, since by definition
+ * the newly inserted mov's cannot conflict with anything.
+ */
+ for (i = 0; i < n; i++) {
+ struct ir3_instruction *instr = ops->get(arr, i);
+ if (instr) {
+ struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL;
+ struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL;
+
+ debug_assert(!conflicts(instr->cp.left, left));
+ if (left) {
+ instr->cp.left_cnt++;
+ instr->cp.left = left;
+ }
+
+ debug_assert(!conflicts(instr->cp.right, right));
+ if (right) {
+ instr->cp.right_cnt++;
+ instr->cp.right = right;
+ }
+ }
+ }
+}
+
+/* Depth-first walk over the SSA sources of 'instr'.  Each OPC_META_FI
+ * instruction found has its sources grouped with group_n().  Nothing
+ * is done when ir3_instr_check_mark() returns true for 'instr'.
+ */
+static void
+instr_find_neighbors(struct ir3_instruction *instr)
+{
+	struct ir3_instruction *child;
+
+	if (ir3_instr_check_mark(instr))
+		return;
+
+	if (instr->opc == OPC_META_FI) {
+		/* everything but regs[0] is a member of the group: */
+		group_n(&instr_ops, instr, instr->regs_count - 1);
+	}
+
+	foreach_ssa_src(child, instr) {
+		instr_find_neighbors(child);
+	}
+}
+
+/* From the register-assignment point of view inputs may not contain
+ * "holes": they still have to be grouped together.  So empty slots
+ * that sit below a populated slot are temporarily filled with a dummy
+ * nop, the group is formed, and the dummies are taken out of the
+ * array again before returning.
+ */
+static void
+pad_and_group_input(struct ir3_instruction **input, unsigned n)
+{
+	struct ir3_block *block = NULL;
+	int padded = 0;    /* bitmask of the slots that received a dummy */
+	int i;
+
+	/* walk downwards so that 'block' is known by the time a hole
+	 * underneath a real input is reached:
+	 */
+	for (i = n - 1; i >= 0; i--) {
+		struct ir3_instruction *cur = input[i];
+
+		if (cur) {
+			block = cur->block;
+			continue;
+		}
+
+		if (!block)
+			continue;
+
+		cur = ir3_NOP(block);
+		ir3_reg_create(cur, 0, IR3_REG_SSA);  /* dummy dst */
+		input[i] = cur;
+		padded |= (1 << i);
+	}
+
+	group_n(&arr_ops_in, input, n);
+
+	/* and hide the padding again: */
+	for (unsigned slot = 0; slot < n; slot++) {
+		if (padded & (1 << slot))
+			input[slot] = NULL;
+	}
+}
+
+/* Group shader inputs and outputs in chunks of four, then walk from
+ * every root (outputs, block keeps, block conditions) to group the
+ * sources of the fanin instructions found along the way.
+ */
+static void
+find_neighbors(struct ir3 *ir)
+{
+	unsigned base, n;
+
+	/* shader inputs/outputs themselves must be contiguous as well:
+	 *
+	 * NOTE: inputs are grouped first, since a mov is only ever
+	 * inserted *before* the conflicted instr (and that would go
+	 * badly for inputs).  With inputs done first there should
+	 * never be a conflict on them; any conflict is pushed to the
+	 * outputs instead, for stuff like:
+	 *
+	 *     MOV OUT[n], IN[m].wzyx
+	 *
+	 * NOTE: this assumes inputs/outputs are grouped in vec4.  The
+	 * logic won't quite cut it if smaller groups are not aligned
+	 * on vec4 boundaries.
+	 */
+	for (base = 0; base < ir->ninputs; base += 4)
+		pad_and_group_input(&ir->inputs[base], 4);
+	for (base = 0; base < ir->noutputs; base += 4)
+		group_n(&arr_ops_out, &ir->outputs[base], 4);
+
+	for (n = 0; n < ir->noutputs; n++) {
+		struct ir3_instruction *out = ir->outputs[n];
+
+		if (out)
+			instr_find_neighbors(out);
+	}
+
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node) {
+		for (n = 0; n < blk->keeps_count; n++)
+			instr_find_neighbors(blk->keeps[n]);
+
+		/* the if-condition needs to be accounted for too: */
+		if (blk->condition)
+			instr_find_neighbors(blk->condition);
+	}
+}
+
+/* Entry point of the grouping pass: clears the instruction marks and
+ * then runs find_neighbors() over the whole shader.
+ */
+void
+ir3_group(struct ir3 *ir)
+{
+ ir3_clear_mark(ir);
+ find_neighbors(ir);
+}
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
new file mode 100644
index 00000000000..ff4c644eab5
--- /dev/null
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -0,0 +1,496 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include "util/ralloc.h"
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+/*
+ * Legalize:
+ *
+ * We currently require that scheduling ensures that we have enough nop's
+ * in all the right places. The legalize step mostly handles fixing up
+ * instruction flags ((ss)/(sy)/(ei)), and collapses sequences of nop's
+ * into fewer nop's w/ rpt flag.
+ */
+
+/* Results gathered across all blocks while legalizing; copied to the
+ * caller's out-params at the end of ir3_legalize().
+ */
+struct ir3_legalize_ctx {
+ int num_samp;    /* max of (cat5.samp + 1) over the tex instructions seen */
+ bool has_ssbo;   /* set once an ssbo op or an atomic with IR3_INSTR_G is seen */
+ int max_bary;    /* max immediate in regs[1] of the input instructions seen; starts at -1 */
+};
+
+/* Register masks tracking which registers still require a sync flag on
+ * the next instruction that touches them.
+ */
+struct ir3_legalize_state {
+ /* dst of sfu, resinfo, ldlv/ldl and atomics without IR3_INSTR_G: */
+ regmask_t needs_ss;
+ /* gpr srcs of tex/sfu/mem instructions: */
+ regmask_t needs_ss_war; /* write after read */
+ /* dst of tex, remaining loads and atomics with IR3_INSTR_G: */
+ regmask_t needs_sy;
+};
+
+/* Per-block bookkeeping, hung off block->data for the duration of
+ * ir3_legalize().
+ */
+struct ir3_legalize_block_data {
+ /* set once the block has been legalized; cleared again when the
+  * state of a predecessor changes:
+  */
+ bool valid;
+ struct ir3_legalize_state state;
+};
+
+/* We want to evaluate each block from the position of any other
+ * predecessor block, in order that the flags set are the union of
+ * all possible program paths.
+ *
+ * To do this, we need to know the output state (needs_ss/ss_war/sy)
+ * of all predecessor blocks. The tricky thing is loops, which mean
+ * that we can't simply recursively process each predecessor block
+ * before legalizing the current block.
+ *
+ * How we handle that is by looping over all the blocks until the
+ * results converge. If the output state of a given block changes
+ * in a given pass, this means that all successor blocks are not
+ * yet fully legalized.
+ */
+
+/* Legalize one block: recompute the (ss)/(sy) flags of its instructions
+ * starting from the sync state OR'd in from the predecessors, rebuild
+ * the block's instruction list (meta instructions are dropped, runs of
+ * nops are folded using the repeat count, extra nops are added where a
+ * flag needs a carrier), and set IR3_REG_EI on the last input
+ * instruction.
+ *
+ * Returns false without doing anything if the block's data is already
+ * marked valid, true otherwise.  If the block's state at the end differs
+ * from its state on entry, the successors are marked invalid so that
+ * the caller's loop visits them again.
+ */
+static bool
+legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
+{
+ struct ir3_legalize_block_data *bd = block->data;
+
+ if (bd->valid)
+ return false;
+
+ struct ir3_instruction *last_input = NULL;
+ struct ir3_instruction *last_rel = NULL;
+ struct ir3_instruction *last_n = NULL;
+ struct list_head instr_list;
+ /* snapshot, compared at the end to detect a change of state: */
+ struct ir3_legalize_state prev_state = bd->state;
+ struct ir3_legalize_state *state = &bd->state;
+
+ /* our input state is the OR of all predecessor blocks' state: */
+ for (unsigned i = 0; i < block->predecessors_count; i++) {
+ struct ir3_legalize_block_data *pbd = block->predecessors[i]->data;
+ struct ir3_legalize_state *pstate = &pbd->state;
+
+ /* Our input (ss)/(sy) state is based on OR'ing the output
+ * state of all our predecessor blocks
+ */
+ regmask_or(&state->needs_ss,
+ &state->needs_ss, &pstate->needs_ss);
+ regmask_or(&state->needs_ss_war,
+ &state->needs_ss_war, &pstate->needs_ss_war);
+ regmask_or(&state->needs_sy,
+ &state->needs_sy, &pstate->needs_sy);
+ }
+
+ /* remove all the instructions from the list, we'll be adding
+ * them back in as we go
+ */
+ list_replace(&block->instr_list, &instr_list);
+ list_inithead(&block->instr_list);
+
+ list_for_each_entry_safe (struct ir3_instruction, n, &instr_list, node) {
+ struct ir3_register *reg;
+ unsigned i;
+
+ n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);
+
+ /* meta instructions are not added back to block->instr_list: */
+ if (is_meta(n))
+ continue;
+
+ if (is_input(n)) {
+ struct ir3_register *inloc = n->regs[1];
+ assert(inloc->flags & IR3_REG_IMMED);
+ ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
+ }
+
+ /* the instruction following a barrier gets both sync flags: */
+ if (last_n && is_barrier(last_n))
+ n->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+
+ /* NOTE: consider dst register too.. it could happen that
+ * texture sample instruction (for example) writes some
+ * components which are unused. A subsequent instruction
+ * that writes the same register can race w/ the sam instr
+ * resulting in undefined results:
+ */
+ for (i = 0; i < n->regs_count; i++) {
+ reg = n->regs[i];
+
+ if (reg_gpr(reg)) {
+
+ /* TODO: we probably only need (ss) for alu
+ * instr consuming sfu result.. need to make
+ * some tests for both this and (sy)..
+ */
+ if (regmask_get(&state->needs_ss, reg)) {
+ n->flags |= IR3_INSTR_SS;
+ regmask_init(&state->needs_ss_war);
+ regmask_init(&state->needs_ss);
+ }
+
+ if (regmask_get(&state->needs_sy, reg)) {
+ n->flags |= IR3_INSTR_SY;
+ regmask_init(&state->needs_sy);
+ }
+ }
+
+ /* TODO: is it valid to have address reg loaded from a
+ * relative src (ie. mova a0, c<a0.x+4>)? If so, the
+ * last_rel check below should be moved ahead of this:
+ */
+ if (reg->flags & IR3_REG_RELATIV)
+ last_rel = n;
+ }
+
+ if (n->regs_count > 0) {
+ reg = n->regs[0];
+ if (regmask_get(&state->needs_ss_war, reg)) {
+ n->flags |= IR3_INSTR_SS;
+ regmask_init(&state->needs_ss_war);
+ regmask_init(&state->needs_ss);
+ }
+
+ /* a write to a0.x: flag the last relative access before it with (ul) */
+ if (last_rel && (reg->num == regid(REG_A0, 0))) {
+ last_rel->flags |= IR3_INSTR_UL;
+ last_rel = NULL;
+ }
+ }
+
+ /* cat5+ does not have an (ss) bit, if needed we need to
+ * insert a nop to carry the sync flag. Would be kinda
+ * clever if we were aware of this during scheduling, but
+ * this should be a pretty rare case:
+ */
+ if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
+ struct ir3_instruction *nop;
+ nop = ir3_NOP(block);
+ nop->flags |= IR3_INSTR_SS;
+ n->flags &= ~IR3_INSTR_SS;
+ }
+
+ /* need to be able to set (ss) on first instruction: */
+ if (list_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
+ ir3_NOP(block);
+
+ /* fold this nop into the previous one if that is a nop whose
+ * repeat count is still below 5; n is then not re-added:
+ */
+ if (is_nop(n) && !list_empty(&block->instr_list)) {
+ struct ir3_instruction *last = list_last_entry(&block->instr_list,
+ struct ir3_instruction, node);
+ if (is_nop(last) && (last->repeat < 5)) {
+ last->repeat++;
+ last->flags |= n->flags;
+ continue;
+ }
+ }
+
+ list_addtail(&n->node, &block->instr_list);
+
+ if (is_sfu(n))
+ regmask_set(&state->needs_ss, n->regs[0]);
+
+ if (is_tex(n)) {
+ /* this ends up being the # of samp instructions.. but that
+ * is ok, everything else only cares whether it is zero or
+ * not. We do this here, rather than when we encounter a
+ * SAMP decl, because (especially in binning pass shader)
+ * the samp instruction(s) could get eliminated if the
+ * result is not used.
+ */
+ ctx->num_samp = MAX2(ctx->num_samp, n->cat5.samp + 1);
+ regmask_set(&state->needs_sy, n->regs[0]);
+ } else if (n->opc == OPC_RESINFO) {
+ regmask_set(&state->needs_ss, n->regs[0]);
+ ir3_NOP(block)->flags |= IR3_INSTR_SS;
+ } else if (is_load(n)) {
+ /* seems like ldlv needs (ss) bit instead?? which is odd but
+ * makes a bunch of flat-varying tests start working on a4xx.
+ */
+ if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL))
+ regmask_set(&state->needs_ss, n->regs[0]);
+ else
+ regmask_set(&state->needs_sy, n->regs[0]);
+ } else if (is_atomic(n->opc)) {
+ if (n->flags & IR3_INSTR_G)
+ regmask_set(&state->needs_sy, n->regs[0]);
+ else
+ regmask_set(&state->needs_ss, n->regs[0]);
+ }
+
+ if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
+ ctx->has_ssbo = true;
+
+ /* both tex/sfu appear to not always immediately consume
+ * their src register(s):
+ */
+ if (is_tex(n) || is_sfu(n) || is_mem(n)) {
+ foreach_src(reg, n) {
+ if (reg_gpr(reg))
+ regmask_set(&state->needs_ss_war, reg);
+ }
+ }
+
+ if (is_input(n))
+ last_input = n;
+
+ last_n = n;
+ }
+
+ if (last_input) {
+ /* special hack.. if using ldlv to bypass interpolation,
+ * we need to insert a dummy bary.f on which we can set
+ * the (ei) flag:
+ */
+ if (is_mem(last_input) && (last_input->opc == OPC_LDLV)) {
+ struct ir3_instruction *baryf;
+
+ /* (ss)bary.f (ei)r63.x, 0, r0.x */
+ baryf = ir3_instr_create(block, OPC_BARY_F);
+ baryf->flags |= IR3_INSTR_SS;
+ ir3_reg_create(baryf, regid(63, 0), 0);
+ ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
+ ir3_reg_create(baryf, regid(0, 0), 0);
+
+ /* insert the dummy bary.f after last_input: */
+ list_delinit(&baryf->node);
+ list_add(&baryf->node, &last_input->node);
+
+ last_input = baryf;
+ }
+ last_input->regs[0]->flags |= IR3_REG_EI;
+ }
+
+ /* a relative access with no later a0.x write in this block: */
+ if (last_rel)
+ last_rel->flags |= IR3_INSTR_UL;
+
+ bd->valid = true;
+
+ if (memcmp(&prev_state, state, sizeof(*state))) {
+ /* our output state changed, this invalidates all of our
+ * successors:
+ */
+ for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
+ if (!block->successors[i])
+ break;
+ struct ir3_legalize_block_data *pbd = block->successors[i]->data;
+ pbd->valid = false;
+ }
+ }
+
+ return true;
+}
+
+/* NOTE: branch instructions are always the last instruction(s)
+ * in the block. We take advantage of this as we resolve the
+ * branches, since "if (foo) break;" constructs turn into
+ * something like:
+ *
+ * block3 {
+ * ...
+ * 0029:021: mov.s32s32 r62.x, r1.y
+ * 0082:022: br !p0.x, target=block5
+ * 0083:023: br p0.x, target=block4
+ * // succs: if _[0029:021: mov.s32s32] block4; else block5;
+ * }
+ * block4 {
+ * 0084:024: jump, target=block6
+ * // succs: block6;
+ * }
+ * block5 {
+ * 0085:025: jump, target=block7
+ * // succs: block7;
+ * }
+ *
+ * ie. only instruction in block4/block5 is a jump, so when
+ * resolving branches we can easily detect this by checking
+ * that the first instruction in the target block is itself
+ * a jump, and setup the br directly to the jump's target
+ * (and strip back out the now unreached jump)
+ *
+ * TODO sometimes we end up with things like:
+ *
+ * br !p0.x, #2
+ * br p0.x, #12
+ * add.u r0.y, r0.y, 1
+ *
+ * If we swapped the order of the branches, we could drop one.
+ */
+/* Work out where a branch to 'block' should really land.
+ *
+ * If 'block' has exactly one successor and is either empty or holds
+ * nothing but a single jump, that successor is returned; in every
+ * other case 'block' itself is returned.
+ */
+static struct ir3_block *
+resolve_dest_block(struct ir3_block *block)
+{
+	struct ir3_instruction *only;
+
+	/* special case for the last block, which has no successor: */
+	if (!block->successors[0])
+		return block;
+
+	/* NOTE that the jump may or may not have been inserted in the
+	 * target block yet, so the dest is resolved to the block's
+	 * successor only when:
+	 *
+	 *   (1) successor[1] == NULL &&
+	 *   (2) (block-is-empty || only-instr-is-jump)
+	 */
+	if (block->successors[1] != NULL)
+		return block;
+
+	if (list_empty(&block->instr_list))
+		return block->successors[0];
+
+	if (list_length(&block->instr_list) != 1)
+		return block;
+
+	only = list_first_entry(&block->instr_list,
+			struct ir3_instruction, node);
+
+	return (only->opc == OPC_JUMP) ? block->successors[0] : block;
+}
+
+/* Resolve the target of one flow-control instruction.
+ *
+ * Returns true if something was removed: either the target block was
+ * unlinked because the instruction could be redirected past it, or the
+ * instruction itself was dropped.  Otherwise the relative offset in
+ * cat0.immed is updated from the two ip values and false is returned.
+ */
+static bool
+resolve_jump(struct ir3_instruction *instr)
+{
+ struct ir3_block *tblock =
+ resolve_dest_block(instr->cat0.target);
+ struct ir3_instruction *target;
+
+ /* the old target can be skipped: unlink that block and retarget */
+ if (tblock != instr->cat0.target) {
+ list_delinit(&instr->cat0.target->node);
+ instr->cat0.target = tblock;
+ return true;
+ }
+
+ target = list_first_entry(&tblock->instr_list,
+ struct ir3_instruction, node);
+
+ /* TODO maybe a less fragile way to do this. But we are expecting
+ * a pattern from sched_block() that looks like:
+ *
+ * br !p0.x, #else-block
+ * br p0.x, #if-block
+ *
+ * if the first branch target is +2, or if 2nd branch target is +1
+ * then we can just drop the jump.
+ */
+ unsigned next_block;
+ if (instr->cat0.inv == true)
+ next_block = 2;
+ else
+ next_block = 1;
+
+ /* NOTE(review): if list_first_entry() is a plain container_of on the
+ * list head (as in Mesa's util/list.h), target is never NULL here and
+ * an empty tblock would not be caught by the !target test -- confirm.
+ */
+ if ((!target) || (target->ip == (instr->ip + next_block))) {
+ list_delinit(&instr->node);
+ return true;
+ } else {
+ instr->cat0.immed =
+ (int)target->ip - (int)instr->ip;
+ }
+ return false;
+}
+
+/* Resolve jumps, dropping the jumps/branches to the immediately
+ * following instruction that earlier stages leave behind.  Removing an
+ * instruction can invalidate the branch offsets of earlier
+ * instructions, so this stops at the first change and returns true;
+ * the caller is expected to call again until false comes back.
+ */
+static bool
+resolve_jumps(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node) {
+		list_for_each_entry (struct ir3_instruction, cur, &blk->instr_list, node) {
+			if (!is_flow(cur) || !cur->cat0.target)
+				continue;
+
+			if (resolve_jump(cur))
+				return true;
+		}
+	}
+
+	return false;
+}
+
+/* we want to mark points where divergent flow control re-converges
+ * with (jp) flags. For now, since we don't do any optimization for
+ * things that start out as a 'do {} while()', re-convergence points
+ * will always be a branch or jump target. Note that this is overly
+ * conservative, since unconditional jump targets are not convergence
+ * points, we are just assuming that the other path to reach the jump
+ * target was divergent. If we were clever enough to optimize the
+ * jump at end of a loop back to a conditional branch into a single
+ * conditional branch, ie. like:
+ *
+ * add.f r1.w, r0.x, (neg)(r)c2.x <= loop start
+ * mul.f r1.z, r1.z, r0.x
+ * mul.f r1.y, r1.y, r0.x
+ * mul.f r0.z, r1.x, r0.x
+ * mul.f r0.w, r0.y, r0.x
+ * cmps.f.ge r0.x, (r)c2.y, (r)r1.w
+ * add.s r0.x, (r)r0.x, (r)-1
+ * sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x
+ * cmps.f.eq p0.x, r0.x, c3.y
+ * mov.f32f32 r0.x, r1.w
+ * mov.f32f32 r0.y, r0.w
+ * mov.f32f32 r1.x, r0.z
+ * (rpt2)nop
+ * br !p0.x, #-13
+ * (jp)mul.f r0.x, c263.y, r1.y
+ *
+ * Then we'd have to be more clever, as the convergence point is no
+ * longer a branch or jump target.
+ */
+/* Set IR3_INSTR_JP on the first instruction of every block that is the
+ * target of a flow-control instruction.
+ */
+static void
+mark_convergence_points(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+			struct ir3_block *tblock;
+			struct ir3_instruction *target;
+
+			if (!is_flow(instr) || !instr->cat0.target)
+				continue;
+
+			tblock = instr->cat0.target;
+
+			/* list_first_entry() on an empty list does not give
+			 * NULL but a bogus pointer derived from the list head,
+			 * and writing flags through it would corrupt memory.
+			 * Branches into empty blocks are expected to have been
+			 * retargeted by resolve_jumps() already, so make that
+			 * invariant explicit:
+			 */
+			assert(!list_empty(&tblock->instr_list));
+
+			target = list_first_entry(&tblock->instr_list,
+					struct ir3_instruction, node);
+			target->flags |= IR3_INSTR_JP;
+		}
+	}
+}
+
+/* Entry point of the legalize pass.
+ *
+ * Legalizes all blocks until the per-block state has converged, hands
+ * the collected num_samp / has_ssbo / max_bary values back through the
+ * out-params, resolves jumps and finally marks the convergence points.
+ */
+void
+ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary)
+{
+	struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
+	bool changed;
+
+	ctx->max_bary = -1;
+
+	/* per-block data is allocated as a child of ctx: */
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node) {
+		blk->data = rzalloc(ctx, struct ir3_legalize_block_data);
+	}
+
+	/* keep sweeping over the blocks until none of them reports work: */
+	for (;;) {
+		changed = false;
+
+		list_for_each_entry (struct ir3_block, blk, &ir->block_list, node) {
+			if (legalize_block(ctx, blk))
+				changed = true;
+		}
+
+		if (!changed)
+			break;
+	}
+
+	*num_samp = ctx->num_samp;
+	*has_ssbo = ctx->has_ssbo;
+	*max_bary = ctx->max_bary;
+
+	/* instructions are re-counted before every resolve attempt: */
+	for (;;) {
+		ir3_count_instructions(ir);
+		if (!resolve_jumps(ir))
+			break;
+	}
+
+	mark_convergence_points(ir);
+
+	ralloc_free(ctx);
+}
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
new file mode 100644
index 00000000000..70c01ee0593
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) 2015 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+
+#include "util/debug.h"
+
+#include "ir3_nir.h"
+#include "ir3_compiler.h"
+#include "ir3_shader.h"
+
+/* The single set of NIR compiler options used by ir3; this is what
+ * ir3_get_compiler_options() hands out.
+ */
+static const nir_shader_compiler_options options = {
+ .lower_fpow = true,
+ .lower_scmp = true,
+ .lower_flrp32 = true,
+ .lower_flrp64 = true,
+ .lower_ffract = true,
+ .lower_fmod32 = true,
+ .lower_fmod64 = true,
+ .lower_fdiv = true,
+ .lower_ldexp = true,
+ .fuse_ffma = true,
+ .native_integers = true,
+ .vertex_id_zero_based = true,
+ .lower_extract_byte = true,
+ .lower_extract_word = true,
+ .lower_all_io_to_temps = true,
+ .lower_helper_invocation = true,
+};
+
+/* Return the NIR compiler options for ir3.  The 'compiler' argument is
+ * currently not consulted: the same static table is returned for every
+ * compiler instance.
+ */
+const nir_shader_compiler_options *
+ir3_get_compiler_options(struct ir3_compiler *compiler)
+{
+ return &options;
+}
+
+/* Does the given shader key enable anything that is lowered at the NIR
+ * level?  True if any of the texcoord-saturate, user-clip-plane,
+ * two-sided-color or color-clamp fields of the key is non-zero.
+ */
+bool
+ir3_key_lowers_nir(const struct ir3_shader_key *key)
+{
+	const bool tex_saturate =
+		key->fsaturate_s || key->fsaturate_t || key->fsaturate_r ||
+		key->vsaturate_s || key->vsaturate_t || key->vsaturate_r;
+	const bool clip_or_two_side =
+		key->ucp_enables || key->color_two_side;
+	const bool clamp_color =
+		key->fclamp_color || key->vclamp_color;
+
+	return tex_saturate || clip_or_two_side || clamp_color;
+}
+
+/* Run 'pass' on 'nir' through NIR_PASS; the whole expression evaluates to
+ * true if the pass reported progress.  Uses a statement expression.
+ */
+#define OPT(nir, pass, ...) ({ \
+ bool this_progress = false; \
+ NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
+ this_progress; \
+})
+
+/* Run 'pass' on 'nir' through NIR_PASS_V, for callers that do not track
+ * whether the pass made progress.
+ */
+#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
+
+/* Run the main sequence of NIR optimization passes over the shader, and
+ * keep repeating the whole sequence until one full iteration completes
+ * without any of the tracked passes reporting progress.
+ */
+static void
+ir3_optimize_loop(nir_shader *s)
+{
+ bool progress;
+ do {
+ progress = false;
+
+ /* run through OPT_V, so this one does not feed into 'progress': */
+ OPT_V(s, nir_lower_vars_to_ssa);
+ progress |= OPT(s, nir_opt_copy_prop_vars);
+ progress |= OPT(s, nir_opt_dead_write_vars);
+ progress |= OPT(s, nir_lower_alu_to_scalar);
+ progress |= OPT(s, nir_lower_phis_to_scalar);
+
+ progress |= OPT(s, nir_copy_prop);
+ progress |= OPT(s, nir_opt_dce);
+ progress |= OPT(s, nir_opt_cse);
+ /* nir_opt_gcm is opt-in: the "GCM" environment variable is read once
+ * (the value is cached in the static below).  1 runs the pass with
+ * 'true' as its extra argument, 2 runs it with 'false', and any other
+ * value (including the default of 0) skips it.
+ */
+ static int gcm = -1;
+ if (gcm == -1)
+ gcm = env_var_as_unsigned("GCM", 0);
+ if (gcm == 1)
+ progress |= OPT(s, nir_opt_gcm, true);
+ else if (gcm == 2)
+ progress |= OPT(s, nir_opt_gcm, false);
+ progress |= OPT(s, nir_opt_peephole_select, 16);
+ progress |= OPT(s, nir_opt_intrinsics);
+ progress |= OPT(s, nir_opt_algebraic);
+ progress |= OPT(s, nir_opt_constant_folding);
+ progress |= OPT(s, nir_opt_dead_cf);
+ if (OPT(s, nir_opt_trivial_continues)) {
+ progress |= true;
+ /* If nir_opt_trivial_continues makes progress, then we need to clean
+ * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
+ * to make progress.
+ */
+ /* results deliberately ignored: 'progress' is already set above */
+ OPT(s, nir_copy_prop);
+ OPT(s, nir_opt_dce);
+ }
+ progress |= OPT(s, nir_opt_if);
+ progress |= OPT(s, nir_opt_remove_phis);
+ progress |= OPT(s, nir_opt_undef);
+
+ } while (progress);
+}
+
+/* Lower and optimize a NIR shader for ir3.
+ *
+ * With key == NULL only the key-independent work is done, including the
+ * trig workarounds.  With a non-NULL key the key-dependent lowerings are
+ * applied instead: texcoord saturate, user clip planes, color clamping
+ * and two-sided color.
+ *
+ * The shader is modified in place and 's' itself is returned.
+ */
+struct nir_shader *
+ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+ const struct ir3_shader_key *key)
+{
+ struct nir_lower_tex_options tex_options = {
+ .lower_rect = 0,
+ };
+
+ /* the texcoord saturate masks come from the per-stage fields of the key: */
+ if (key) {
+ switch (shader->type) {
+ case MESA_SHADER_FRAGMENT:
+ tex_options.saturate_s = key->fsaturate_s;
+ tex_options.saturate_t = key->fsaturate_t;
+ tex_options.saturate_r = key->fsaturate_r;
+ break;
+ case MESA_SHADER_VERTEX:
+ tex_options.saturate_s = key->vsaturate_s;
+ tex_options.saturate_t = key->vsaturate_t;
+ tex_options.saturate_r = key->vsaturate_r;
+ break;
+ default:
+ /* TODO */
+ break;
+ }
+ }
+
+ if (shader->compiler->gpu_id >= 400) {
+ /* a4xx seems to have *no* sam.p */
+ tex_options.lower_txp = ~0; /* lower all txp */
+ } else {
+ /* a3xx just needs to avoid sam.p for 3d tex */
+ tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
+ }
+
+ /* dump the incoming shader when disasm debugging is enabled: */
+ if (ir3_shader_debug & IR3_DBG_DISASM) {
+ debug_printf("----------------------\n");
+ nir_print_shader(s, stdout);
+ debug_printf("----------------------\n");
+ }
+
+ OPT_V(s, nir_opt_global_to_local);
+ OPT_V(s, nir_lower_regs_to_ssa);
+
+ if (key) {
+ /* note: the stage is taken from the nir shader here, while the
+ * switch above uses shader->type
+ */
+ if (s->info.stage == MESA_SHADER_VERTEX) {
+ OPT_V(s, nir_lower_clip_vs, key->ucp_enables, false);
+ if (key->vclamp_color)
+ OPT_V(s, nir_lower_clamp_color_outputs);
+ } else if (s->info.stage == MESA_SHADER_FRAGMENT) {
+ OPT_V(s, nir_lower_clip_fs, key->ucp_enables);
+ if (key->fclamp_color)
+ OPT_V(s, nir_lower_clamp_color_outputs);
+ }
+ if (key->color_two_side) {
+ OPT_V(s, nir_lower_two_sided_color);
+ }
+ } else {
+ /* only want to do this the first time (when key is null)
+ * and not again on any potential 2nd variant lowering pass:
+ */
+ OPT_V(s, ir3_nir_apply_trig_workarounds);
+ }
+
+ OPT_V(s, nir_lower_tex, &tex_options);
+ OPT_V(s, nir_lower_load_const_to_scalar);
+ /* the tg4 -> tex lowering is applied to everything before a5xx: */
+ if (shader->compiler->gpu_id < 500)
+ OPT_V(s, ir3_nir_lower_tg4_to_tex);
+
+ ir3_optimize_loop(s);
+
+ /* do idiv lowering after first opt loop to give a chance for
+ * divide by immed power-of-two to be caught first:
+ */
+ if (OPT(s, nir_lower_idiv))
+ ir3_optimize_loop(s);
+
+ OPT_V(s, nir_remove_dead_variables, nir_var_local);
+
+ OPT_V(s, nir_move_load_const);
+
+ /* dump the resulting shader when disasm debugging is enabled: */
+ if (ir3_shader_debug & IR3_DBG_DISASM) {
+ debug_printf("----------------------\n");
+ nir_print_shader(s, stdout);
+ debug_printf("----------------------\n");
+ }
+
+ nir_sweep(s);
+
+ return s;
+}
+
+/* Scan every intrinsic in the shader and reserve driver-const space in
+ * 'layout' for the ones that need it:
+ *
+ *  - get_buffer_size: one const per SSBO index (layout->ssbo_size)
+ *  - image atomics, image store and image size: three consts per image
+ *    (layout->image_dims)
+ *
+ * Each index is only accounted for once, which is tracked in the 'mask'
+ * bitfields.  off[idx] records the value of 'count' at the time the slot
+ * for that index was reserved.
+ *
+ * NOTE(review): the get_buffer_size source is assumed to be a constant;
+ * the result of nir_src_as_const_value() is dereferenced unchecked.
+ */
+void
+ir3_nir_scan_driver_consts(nir_shader *shader,
+		struct ir3_driver_const_layout *layout)
+{
+	nir_foreach_function(function, shader) {
+		if (!function->impl)
+			continue;
+
+		nir_foreach_block(block, function->impl) {
+			nir_foreach_instr(instr, block) {
+				if (instr->type != nir_instr_type_intrinsic)
+					continue;
+
+				nir_intrinsic_instr *intr =
+					nir_instr_as_intrinsic(instr);
+				unsigned idx;
+
+				/* The masks are tested/set with an unsigned shift
+				 * (1u << idx): with a plain int '1', idx == 31 would
+				 * shift into the sign bit, which is undefined.
+				 */
+				switch (intr->intrinsic) {
+				case nir_intrinsic_get_buffer_size:
+					idx = nir_src_as_const_value(intr->src[0])->u32[0];
+					if (layout->ssbo_size.mask & (1u << idx))
+						break;
+					layout->ssbo_size.mask |= (1u << idx);
+					layout->ssbo_size.off[idx] =
+						layout->ssbo_size.count;
+					layout->ssbo_size.count += 1; /* one const per */
+					break;
+				case nir_intrinsic_image_deref_atomic_add:
+				case nir_intrinsic_image_deref_atomic_min:
+				case nir_intrinsic_image_deref_atomic_max:
+				case nir_intrinsic_image_deref_atomic_and:
+				case nir_intrinsic_image_deref_atomic_or:
+				case nir_intrinsic_image_deref_atomic_xor:
+				case nir_intrinsic_image_deref_atomic_exchange:
+				case nir_intrinsic_image_deref_atomic_comp_swap:
+				case nir_intrinsic_image_deref_store:
+				case nir_intrinsic_image_deref_size:
+					idx = nir_intrinsic_get_var(intr, 0)->data.driver_location;
+					if (layout->image_dims.mask & (1u << idx))
+						break;
+					layout->image_dims.mask |= (1u << idx);
+					layout->image_dims.off[idx] =
+						layout->image_dims.count;
+					layout->image_dims.count += 3; /* three const per */
+					break;
+				default:
+					break;
+				}
+			}
+		}
+	}
+}
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
new file mode 100644
index 00000000000..74201d34160
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2015 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#ifndef IR3_NIR_H_
+#define IR3_NIR_H_
+
+#include "compiler/nir/nir.h"
+#include "compiler/shader_enums.h"
+
+#include "ir3_shader.h"
+
+/* Scan the shader for get_buffer_size and image intrinsics and reserve the
+ * driver-const slots they need in 'layout'.
+ */
+void ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_driver_const_layout *layout);
+
+/* Algebraic pass whose source is generated by ir3_nir_trig.py. */
+bool ir3_nir_apply_trig_workarounds(nir_shader *shader);
+/* Replace tg4 texture instructions with four txl fetches; returns true if
+ * anything was lowered.
+ */
+bool ir3_nir_lower_tg4_to_tex(nir_shader *shader);
+
+/* NIR compiler options to use for ir3. */
+const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
+/* True if the given shader key enables any lowering that is done in NIR. */
+bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
+/* Lower and optimize 's' for ir3; 'key' may be NULL.  Returns 's'. */
+struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+ const struct ir3_shader_key *key);
+
+#endif /* IR3_NIR_H_ */
diff --git a/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c b/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
new file mode 100644
index 00000000000..37a3dcb26f8
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright © 2017 Ilia Mirkin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "ir3_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+/* A4XX has a broken GATHER4 operation. It performs the texture swizzle on the
+ * gather results, rather than before. As a result, it must be emulated with
+ * direct texture calls.
+ */
+
+/* Replace every tg4 instruction in 'block' with four txl fetches at LOD 0:
+ * three with the texel offsets from offsets[] added, and a fourth with no
+ * additional offset.  The tg4->component channel of each fetch becomes one
+ * channel of the vec4 that replaces the tg4 result.
+ *
+ * Returns true if at least one instruction was replaced.  'mem_ctx' is
+ * not used by this function.
+ */
+static bool
+lower_tg4(nir_block *block, nir_builder *b, void *mem_ctx)
+{
+ bool progress = false;
+
+ /* offsets for fetches 0..2; fetch 3 gets no additional offset: */
+ static const int offsets[3][2] = { {0, 1}, {1, 1}, {1, 0} };
+
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_tex)
+ continue;
+
+ nir_tex_instr *tg4 = (nir_tex_instr *)instr;
+
+ if (tg4->op != nir_texop_tg4)
+ continue;
+
+ b->cursor = nir_before_instr(&tg4->instr);
+
+ nir_ssa_def *results[4];
+ int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset);
+ for (int i = 0; i < 4; i++) {
+ /* all of the original sources plus the lod source; one more slot
+ * is needed for an offset source if the tg4 did not have one and
+ * this fetch adds an offset:
+ */
+ int num_srcs = tg4->num_srcs + 1 /* lod */;
+ if (offset_index < 0 && i < 3)
+ num_srcs++;
+
+ nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
+ tex->op = nir_texop_txl;
+ tex->sampler_dim = tg4->sampler_dim;
+ tex->coord_components = tg4->coord_components;
+ tex->is_array = tg4->is_array;
+ tex->is_shadow = tg4->is_shadow;
+ tex->is_new_style_shadow = tg4->is_new_style_shadow;
+ tex->texture_index = tg4->texture_index;
+ tex->sampler_index = tg4->sampler_index;
+ tex->dest_type = tg4->dest_type;
+
+ /* the original sources keep their positions in the new instruction: */
+ for (int j = 0; j < tg4->num_srcs; j++) {
+ nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex);
+ tex->src[j].src_type = tg4->src[j].src_type;
+ }
+ if (i != 3) {
+ nir_ssa_def *offset =
+ nir_vec2(b, nir_imm_int(b, offsets[i][0]),
+ nir_imm_int(b, offsets[i][1]));
+ if (offset_index < 0) {
+ /* no offset source yet: add one in the slot after the copies */
+ tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset);
+ tex->src[tg4->num_srcs].src_type = nir_tex_src_offset;
+ } else {
+ /* add our offset on top of the existing offset source */
+ assert(nir_tex_instr_src_size(tex, offset_index) == 2);
+ nir_ssa_def *orig = nir_ssa_for_src(
+ b, tex->src[offset_index].src, 2);
+ tex->src[offset_index].src =
+ nir_src_for_ssa(nir_iadd(b, orig, offset));
+ }
+ }
+ /* the last source slot always holds the explicit lod of 0: */
+ tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0));
+ tex->src[num_srcs - 1].src_type = nir_tex_src_lod;
+
+ nir_ssa_dest_init(&tex->instr, &tex->dest,
+ nir_tex_instr_dest_size(tex), 32, NULL);
+ nir_builder_instr_insert(b, &tex->instr);
+
+ results[i] = nir_channel(b, &tex->dest.ssa, tg4->component);
+ }
+
+ nir_ssa_def *result = nir_vec4(b, results[0], results[1], results[2], results[3]);
+ nir_ssa_def_rewrite_uses(&tg4->dest.ssa, nir_src_for_ssa(result));
+
+ nir_instr_remove(&tg4->instr);
+
+ progress = true;
+ }
+
+ return progress;
+}
+
+/* Apply lower_tg4() to every block of the function.  When anything was
+ * lowered, the block-index and dominance metadata are marked as preserved.
+ * Returns true if anything was lowered.
+ */
+static bool
+lower_tg4_func(nir_function_impl *impl)
+{
+	void *mem_ctx = ralloc_parent(impl);
+	bool any_lowered = false;
+	nir_builder builder;
+
+	nir_builder_init(&builder, impl);
+
+	nir_foreach_block_safe(blk, impl) {
+		if (lower_tg4(blk, &builder, mem_ctx))
+			any_lowered = true;
+	}
+
+	if (!any_lowered)
+		return false;
+
+	nir_metadata_preserve(impl,
+			nir_metadata_block_index | nir_metadata_dominance);
+	return true;
+}
+
+/* Shader-level entry point: lower tg4 in every function that has an
+ * implementation.  Returns true if any function was changed.
+ */
+bool
+ir3_nir_lower_tg4_to_tex(nir_shader *shader)
+{
+	bool lowered = false;
+
+	nir_foreach_function(func, shader) {
+		if (!func->impl)
+			continue;
+
+		if (lower_tg4_func(func->impl))
+			lowered = true;
+	}
+
+	return lowered;
+}
diff --git a/src/freedreno/ir3/ir3_nir_trig.py b/src/freedreno/ir3/ir3_nir_trig.py
new file mode 100644
index 00000000000..3968aea543c
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir_trig.py
@@ -0,0 +1,51 @@
+#
+# Copyright (C) 2016 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+from __future__ import print_function
+
+import argparse
+import sys
+
+# (search, replace) pairs for nir_algebraic.  Both rewrites leave the
+# operation itself alone and only wrap its argument:
+#
+#   x' = 6.283185 * ffract(0.159155 * x + 0.5) - 3.141593
+#
+# With 6.283185 ~= 2*pi, 0.159155 ~= 1/(2*pi) and 3.141593 ~= pi, this
+# reduces the argument to the range [-pi, pi) before fsin/fcos is applied.
+trig_workarounds = [
+ (('fsin', 'x'), ('fsin', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))),
+ (('fcos', 'x'), ('fcos', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))),
+]
+
+
+def main():
+    """Parse the command line, extend the import path and emit the pass."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument('-p', '--import-path', required=True)
+    opts = cli.parse_args()
+
+    # Put the given directory first on sys.path so that run() can import
+    # nir_algebraic from it.
+    sys.path.insert(0, opts.import_path)
+    run()
+
+
+def run():
+    """Print the source of the trig-workaround algebraic pass to stdout."""
+    # Imported here rather than at module level: main() adjusts sys.path
+    # before calling us.
+    import nir_algebraic  # pylint: disable=import-error
+
+    print('#include "ir3_nir.h"')
+
+    trig_pass = nir_algebraic.AlgebraicPass("ir3_nir_apply_trig_workarounds",
+                                            trig_workarounds)
+    print(trig_pass.render())
+
+
+if __name__ == '__main__':
+ main()
diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c
new file mode 100644
index 00000000000..b6ef6e4b5a7
--- /dev/null
+++ b/src/freedreno/ir3/ir3_print.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#include "ir3.h"
+
+#define PTRID(x) ((unsigned long)(x))
+
+/* Print the identifying part of an instruction: the name:ip:depth prefix
+ * (preceded by the serial number in DEBUG builds), the (sy)/(ss) flags and
+ * the opcode mnemonic with its suffixes.  A NULL instr prints nothing.
+ */
+static void print_instr_name(struct ir3_instruction *instr)
+{
+	/* suffixes for regular (non-meta, non-mov) opcodes, in print order: */
+	static const struct {
+		unsigned flag;
+		const char *suffix;
+	} opc_suffixes[] = {
+		{ IR3_INSTR_3D,   ".3d"   },
+		{ IR3_INSTR_A,    ".a"    },
+		{ IR3_INSTR_O,    ".o"    },
+		{ IR3_INSTR_P,    ".p"    },
+		{ IR3_INSTR_S,    ".s"    },
+		{ IR3_INSTR_S2EN, ".s2en" },
+	};
+
+	if (!instr)
+		return;
+
+#ifdef DEBUG
+	printf("%04u:", instr->serialno);
+#endif
+	printf("%04u:", instr->name);
+	printf("%04u:", instr->ip);
+	printf("%03u: ", instr->depth);
+
+	if (instr->flags & IR3_INSTR_SY)
+		printf("(sy)");
+	if (instr->flags & IR3_INSTR_SS)
+		printf("(ss)");
+
+	if (is_meta(instr)) {
+		switch (instr->opc) {
+		case OPC_META_INPUT:
+			printf("_meta:in");
+			break;
+		case OPC_META_FO:
+			printf("_meta:fo");
+			break;
+		case OPC_META_FI:
+			printf("_meta:fi");
+			break;
+		default:
+			/* shouldn't hit here.. just for debugging: */
+			printf("_meta:%d", instr->opc);
+			break;
+		}
+		return;
+	}
+
+	if (instr->opc == OPC_MOV) {
+		static const char *type[] = {
+			[TYPE_F16] = "f16",
+			[TYPE_F32] = "f32",
+			[TYPE_U16] = "u16",
+			[TYPE_U32] = "u32",
+			[TYPE_S16] = "s16",
+			[TYPE_S32] = "s32",
+			[TYPE_U8]  = "u8",
+			[TYPE_S8]  = "s8",
+		};
+		/* same src and dst type is a plain move, otherwise a conversion: */
+		const char *mnemonic =
+			(instr->cat1.src_type == instr->cat1.dst_type) ? "mov" : "cov";
+
+		printf("%s", mnemonic);
+		printf(".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]);
+		return;
+	}
+
+	printf("%s", ir3_instr_name(instr));
+	for (unsigned n = 0; n < sizeof(opc_suffixes) / sizeof(opc_suffixes[0]); n++) {
+		if (instr->flags & opc_suffixes[n].flag)
+			printf("%s", opc_suffixes[n].suffix);
+	}
+}
+
+/* Print a register operand: an optional (absneg)/(neg)/(abs) modifier
+ * followed by the immediate, array, SSA, relative or plain register form.
+ */
+static void print_reg_name(struct ir3_register *reg)
+{
+	const bool has_abs =
+		!!(reg->flags & (IR3_REG_FABS | IR3_REG_SABS));
+	const bool has_neg =
+		!!(reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+
+	if (has_abs && has_neg)
+		printf("(absneg)");
+	else if (has_neg)
+		printf("(neg)");
+	else if (has_abs)
+		printf("(abs)");
+
+	if (reg->flags & IR3_REG_IMMED) {
+		printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
+		return;
+	}
+
+	if (reg->flags & IR3_REG_ARRAY) {
+		printf("arr[id=%u, offset=%d, size=%u", reg->array.id,
+				reg->array.offset, reg->size);
+		/* for ARRAY we could have null src, for example first write
+		 * instruction..
+		 */
+		if (reg->instr) {
+			printf(", _[");
+			print_instr_name(reg->instr);
+			printf("]");
+		}
+		printf("]");
+		return;
+	}
+
+	if (reg->flags & IR3_REG_SSA) {
+		printf("_[");
+		print_instr_name(reg->instr);
+		printf("]");
+		return;
+	}
+
+	/* the remaining (relative and plain) forms share the half prefix: */
+	if (reg->flags & IR3_REG_HALF)
+		printf("h");
+
+	const bool is_const = !!(reg->flags & IR3_REG_CONST);
+
+	if (reg->flags & IR3_REG_RELATIV) {
+		if (is_const)
+			printf("c<a0.x + %d>", reg->array.offset);
+		else
+			printf("\x1b[0;31mr<a0.x + %d>\x1b[0m (%u)", reg->array.offset, reg->size);
+	} else {
+		if (is_const)
+			printf("c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]);
+		else
+			printf("\x1b[0;31mr%u.%c\x1b[0m", reg_num(reg), "xyzw"[reg_comp(reg)]);
+	}
+}
+
+/* Emit 'lvl' tab characters (nothing for lvl <= 0). */
+static void
+tab(int lvl)
+{
+	while (lvl-- > 0)
+		printf("\t");
+}
+
+/* Print 'label' followed by the bracketed name of 'ref'. */
+static void
+print_instr_ref(const char *label, struct ir3_instruction *ref)
+{
+	printf("%s", label);
+	printf("[");
+	print_instr_name(ref);
+	printf("]");
+}
+
+/* Print one instruction on a single line, indented by 'lvl' tabs: its
+ * name, its registers, and then whichever of the optional annotations
+ * apply (address, cp left/right, fanout offset, branch target,
+ * false dependencies).
+ */
+static void
+print_instr(struct ir3_instruction *instr, int lvl)
+{
+	tab(lvl);
+
+	print_instr_name(instr);
+	for (unsigned r = 0; r < instr->regs_count; r++) {
+		/* a space before the first register, commas between the rest: */
+		printf("%s", (r == 0) ? " " : ", ");
+		print_reg_name(instr->regs[r]);
+	}
+
+	if (instr->address)
+		print_instr_ref(", address=_", instr->address);
+
+	if (instr->cp.left)
+		print_instr_ref(", left=_", instr->cp.left);
+
+	if (instr->cp.right)
+		print_instr_ref(", right=_", instr->cp.right);
+
+	if (instr->opc == OPC_META_FO)
+		printf(", off=%d", instr->fo.off);
+
+	if (is_flow(instr) && instr->cat0.target) {
+		/* the predicate register src is implied: */
+		if (instr->opc == OPC_BR)
+			printf(" %sp0.x", instr->cat0.inv ? "!" : "");
+		printf(", target=block%u", block_id(instr->cat0.target));
+	}
+
+	if (instr->deps_count) {
+		printf(", false-deps:");
+		for (unsigned d = 0; d < instr->deps_count; d++) {
+			if (d != 0)
+				printf(", ");
+			printf("_[");
+			print_instr_name(instr->deps[d]);
+			printf("]");
+		}
+	}
+
+	printf("\n");
+}
+
+/* Print a single instruction, without any indentation. */
+void ir3_print_instr(struct ir3_instruction *instr)
+{
+ print_instr(instr, 0);
+}
+
+/* Print one block at indentation 'lvl': its predecessors, every
+ * instruction, the "keeps" list (wrapped in a comment) and finally its
+ * successors.
+ */
+static void
+print_block(struct ir3_block *block, int lvl)
+{
+	const int inner = lvl + 1;
+
+	tab(lvl);
+	printf("block%u {\n", block_id(block));
+
+	if (block->predecessors_count > 0) {
+		tab(inner);
+		printf("pred: ");
+		for (unsigned p = 0; p < block->predecessors_count; p++) {
+			if (p != 0)
+				printf(", ");
+			printf("block%u", block_id(block->predecessors[p]));
+		}
+		printf("\n");
+	}
+
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
+		print_instr(instr, inner);
+
+	tab(inner);
+	printf("/* keeps:\n");
+	for (unsigned k = 0; k < block->keeps_count; k++)
+		print_instr(block->keeps[k], inner + 1);
+	tab(inner);
+	printf(" */\n");
+
+	if (block->successors[1]) {
+		/* leading into if/else: */
+		tab(inner);
+		printf("/* succs: if _[");
+		print_instr_name(block->condition);
+		printf("] block%u; else block%u; */\n",
+				block_id(block->successors[0]),
+				block_id(block->successors[1]));
+	} else if (block->successors[0]) {
+		tab(inner);
+		printf("/* succs: block%u; */\n",
+				block_id(block->successors[0]));
+	}
+
+	tab(lvl);
+	printf("}\n");
+}
+
+/* Dump the whole shader: every block in list order, followed by one
+ * "outN:" line for each non-NULL entry of ir->outputs.
+ */
+void
+ir3_print(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
+		print_block(block, 0);
+
+	for (unsigned i = 0; i < ir->noutputs; i++) {
+		if (!ir->outputs[i])
+			continue;
+		/* 'i' is unsigned, so it is printed with %u rather than %d: */
+		printf("out%u: ", i);
+		print_instr(ir->outputs[i], 0);
+	}
+}
diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c
new file mode 100644
index 00000000000..ad09c4018d3
--- /dev/null
+++ b/src/freedreno/ir3/ir3_ra.c
@@ -0,0 +1,1124 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include "util/u_math.h"
+#include "util/register_allocate.h"
+#include "util/ralloc.h"
+#include "util/bitset.h"
+
+#include "ir3.h"
+#include "ir3_compiler.h"
+
+/*
+ * Register Assignment:
+ *
+ * Uses the register_allocate util, which implements graph coloring
+ * algo with interference classes. To handle the cases where we need
+ * consecutive registers (for example, texture sample instructions),
+ * we model these as larger (double/quad/etc) registers which conflict
+ * with the corresponding registers in other classes.
+ *
+ * Additionally we create additional classes for half-regs, which
+ * do not conflict with the full-reg classes. We do need at least
+ * sizes 1-4 (to deal w/ texture sample instructions output to half-
+ * reg). At the moment we don't create the higher order half-reg
+ * classes as half-reg frequently does not have enough precision
+ * for texture coords at higher resolutions.
+ *
+ * There are some additional cases that we need to handle specially,
+ * as the graph coloring algo doesn't understand "partial writes".
+ * For example, a sequence like:
+ *
+ * add r0.z, ...
+ * sam (f32)(xy)r0.x, ...
+ * ...
+ * sam (f32)(xyzw)r0.w, r0.x, ... ; 3d texture, so r0.xyz are coord
+ *
+ * In this scenario, we treat r0.xyz as class size 3, which is written
+ * (from a use/def perspective) at the 'add' instruction and ignore the
+ * subsequent partial writes to r0.xy. So the 'add r0.z, ...' is the
+ * defining instruction, as it is the first to partially write r0.xyz.
+ *
+ * Note i965 has a similar scenario, which they solve with a virtual
+ * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
+ * register assignment. But for us that is horrible from a scheduling
+ * standpoint. Instead what we do is use the idea of a 'definer' instruction.
+ * Ie. the first instruction (lowest ip) to write to the variable is the
+ * one we consider from use/def perspective when building interference
+ * graph. (Other instructions which write other variable components
+ * just define the variable some more.)
+ *
+ * Arrays of arbitrary size are handled via pre-coloring a consecutive
+ * sequence of registers. Additional scalar (single component) reg
+ * names are allocated starting at ctx->class_base[total_class_count]
+ * (see arr->base), which are pre-colored. In the use/def graph direct
+ * access is treated as a single element use/def, and indirect access
+ * is treated as use or def of all array elements. (Only the first
+ * def is tracked, in case of multiple indirect writes, etc.)
+ *
+ * TODO arrays that fit in one of the pre-defined class sizes should
+ * not need to be pre-colored, but instead could be given a normal
+ * vreg name. (Ignoring this for now since it is a good way to work
+ * out the kinks with arbitrary sized arrays.)
+ *
+ * TODO might be easier for debugging to split this into two passes,
+ * the first assigning vreg names in a way that we could ir3_print()
+ * the result.
+ */
+
+/* Vector sizes (in scalar registers) of the full-precision register
+ * classes. size_to_class() picks the first entry that is >= the
+ * requested size, so the table must stay sorted in ascending order.
+ */
+static const unsigned class_sizes[] = {
+ 1, 2, 3, 4,
+ 4 + 4, /* txd + 1d/2d */
+ 4 + 6, /* txd + 3d */
+};
+#define class_count ARRAY_SIZE(class_sizes)
+
+/* Vector sizes of the half-precision register classes: */
+static const unsigned half_class_sizes[] = {
+ 1, 2, 3, 4,
+};
+#define half_class_count ARRAY_SIZE(half_class_sizes)
+
+/* seems to just be used for compute shaders? Seems like vec1 and vec3
+ * are sufficient (for now?)
+ */
+static const unsigned high_class_sizes[] = {
+ 1, 3,
+};
+#define high_class_count ARRAY_SIZE(high_class_sizes)
+
+/* Number of real classes. The index total_class_count itself is used
+ * as an extra pseudo-class for arrays (see ir3_ra_ctx).
+ */
+#define total_class_count (class_count + half_class_count + high_class_count)
+
+/* Below a0.x are normal regs. RA doesn't need to assign a0.x/p0.x. */
+#define NUM_REGS (4 * 48) /* r0 to r47 */
+#define NUM_HIGH_REGS (4 * 8) /* r48 to r55 */
+#define FIRST_HIGH_REG (4 * 48)
+/* Number of virtual regs in a given class: */
+#define CLASS_REGS(i) (NUM_REGS - (class_sizes[i] - 1))
+#define HALF_CLASS_REGS(i) (NUM_REGS - (half_class_sizes[i] - 1))
+#define HIGH_CLASS_REGS(i) (NUM_HIGH_REGS - (high_class_sizes[i] - 1))
+
+/* Index of the first half / high class in the combined class numbering
+ * (full classes first, then half, then high). This is the numbering
+ * returned by size_to_class() and used to index gpr_to_ra_reg[].
+ */
+#define HALF_OFFSET (class_count)
+#define HIGH_OFFSET (class_count + half_class_count)
+
+/* register-set, created one time, used for all shaders: */
+struct ir3_ra_reg_set {
+ /* register set handle returned by ra_alloc_reg_set(): */
+ struct ra_regs *regs;
+ /* class handles returned by ra_alloc_reg_class(), indexed by position
+ * in class_sizes[] / half_class_sizes[] / high_class_sizes[]:
+ */
+ unsigned int classes[class_count];
+ unsigned int half_classes[half_class_count];
+ unsigned int high_classes[high_class_count];
+ /* maps flat virtual register space to base gpr: */
+ uint16_t *ra_reg_to_gpr;
+ /* maps cls,gpr to flat virtual register space: */
+ uint16_t **gpr_to_ra_reg;
+};
+
+/* Fill in the q-value rows for one family of register classes (full,
+ * half or high).
+ *
+ *   q_values - table of total_class_count row pointers; the rows
+ *              [off, off + count) are allocated here, zero-filled,
+ *              with the table itself as ralloc context
+ *   off      - index of the family's first class in the combined class
+ *              numbering
+ *   sizes    - vector size of each class of the family
+ *   count    - number of classes in the family
+ *
+ * Only entries pairing two classes of the same family are written; all
+ * other entries of a row keep their initial zero.
+ */
+static void
+build_q_values(unsigned int **q_values, unsigned off,
+		const unsigned *sizes, unsigned count)
+{
+	for (unsigned b = 0; b < count; b++) {
+		unsigned *row = rzalloc_array(q_values, unsigned, total_class_count);
+
+		q_values[off + b] = row;
+
+		/* From register_allocate.c:
+		 *
+		 * q(B,C) (indexed by C, B is this register class) in
+		 * Runeson/Nyström paper.  This is "how many registers of B could
+		 * the worst choice register from C conflict with".
+		 *
+		 * If we just let the register allocation algorithm compute these
+		 * values, it is extremely expensive.  However, since all of our
+		 * registers are laid out, we can very easily compute them
+		 * ourselves.  View the register from C as fixed starting at GRF n
+		 * somewhere in the middle, and the register from B as sliding back
+		 * and forth.  Then the first register to conflict from B is the
+		 * one starting at n - class_size[B] + 1 and the last register to
+		 * conflict will start at n + class_size[B] - 1.  Therefore, the
+		 * number of conflicts from B is class_size[B] + class_size[C] - 1.
+		 *
+		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+		 * B | | | | | |n| --> | | | | | | |
+		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+		 *             +-+-+-+-+-+
+		 * C           |n| | | | |
+		 *             +-+-+-+-+-+
+		 *
+		 * (Idea copied from brw_fs_reg_allocate.cpp)
+		 */
+		for (unsigned c = 0; c < count; c++)
+			row[off + c] = sizes[b] + sizes[c] - 1;
+	}
+}
+
+/* One-time setup of RA register-set, which describes all the possible
+ * "virtual" registers and their interferences. Ie. double register
+ * occupies (and conflicts with) two single registers, and so forth.
+ * Since registers do not need to be aligned to their class size, they
+ * can conflict with other registers in the same class too. Ie:
+ *
+ * Single (base) | Double
+ * --------------+---------------
+ * R0 | D0
+ * R1 | D0 D1
+ * R2 | D1 D2
+ * R3 | D2
+ * .. and so on..
+ *
+ * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
+ * really just four scalar registers. Don't let that confuse you.)
+ *
+ * The returned set is allocated with 'compiler' as its ralloc context.
+ * compiler->gpu_id decides whether half and full precision registers
+ * are made to conflict with each other (gpu_id >= 600).
+ */
+struct ir3_ra_reg_set *
+ir3_ra_alloc_reg_set(struct ir3_compiler *compiler)
+{
+ struct ir3_ra_reg_set *set = rzalloc(compiler, struct ir3_ra_reg_set);
+ unsigned ra_reg_count, reg, first_half_reg, first_high_reg, base;
+ unsigned int **q_values;
+
+ /* calculate # of regs across all classes: */
+ ra_reg_count = 0;
+ for (unsigned i = 0; i < class_count; i++)
+ ra_reg_count += CLASS_REGS(i);
+ for (unsigned i = 0; i < half_class_count; i++)
+ ra_reg_count += HALF_CLASS_REGS(i);
+ for (unsigned i = 0; i < high_class_count; i++)
+ ra_reg_count += HIGH_CLASS_REGS(i);
+
+ /* allocate and populate q_values: */
+ q_values = ralloc_array(set, unsigned *, total_class_count);
+
+ build_q_values(q_values, 0, class_sizes, class_count);
+ build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count);
+ build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count);
+
+ /* allocate the reg-set.. */
+ set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
+ set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
+ set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
+
+ /* .. and classes */
+ /* 'reg' is the running index into the flat virtual register space.
+ * The full-precision classes are numbered first, so class 0 (size 1)
+ * occupies flat regs 0..NUM_REGS-1 and a scalar gpr index 'br' can be
+ * used directly as the ra reg of that scalar in the conflict setup:
+ */
+ reg = 0;
+ for (unsigned i = 0; i < class_count; i++) {
+ set->classes[i] = ra_alloc_reg_class(set->regs);
+
+ set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));
+
+ for (unsigned j = 0; j < CLASS_REGS(i); j++) {
+ ra_class_add_reg(set->regs, set->classes[i], reg);
+
+ set->ra_reg_to_gpr[reg] = j;
+ set->gpr_to_ra_reg[i][j] = reg;
+
+ /* conflict with each scalar reg covered by this vector reg: */
+ for (unsigned br = j; br < j + class_sizes[i]; br++)
+ ra_add_transitive_reg_conflict(set->regs, br, reg);
+
+ reg++;
+ }
+ }
+
+ /* half-precision classes follow; their scalar (size 1) regs start at
+ * first_half_reg, hence the 'br + first_half_reg' below:
+ */
+ first_half_reg = reg;
+ base = HALF_OFFSET;
+
+ for (unsigned i = 0; i < half_class_count; i++) {
+ set->half_classes[i] = ra_alloc_reg_class(set->regs);
+
+ set->gpr_to_ra_reg[base + i] =
+ ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));
+
+ for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
+ ra_class_add_reg(set->regs, set->half_classes[i], reg);
+
+ set->ra_reg_to_gpr[reg] = j;
+ set->gpr_to_ra_reg[base + i][j] = reg;
+
+ for (unsigned br = j; br < j + half_class_sizes[i]; br++)
+ ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);
+
+ reg++;
+ }
+ }
+
+ /* high classes come last. Note that ra_reg_to_gpr[] stores the index
+ * relative to the start of the high range; reg_assign() adds
+ * FIRST_HIGH_REG when it writes the final register number:
+ */
+ first_high_reg = reg;
+ base = HIGH_OFFSET;
+
+ for (unsigned i = 0; i < high_class_count; i++) {
+ set->high_classes[i] = ra_alloc_reg_class(set->regs);
+
+ set->gpr_to_ra_reg[base + i] =
+ ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i));
+
+ for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) {
+ ra_class_add_reg(set->regs, set->high_classes[i], reg);
+
+ set->ra_reg_to_gpr[reg] = j;
+ set->gpr_to_ra_reg[base + i][j] = reg;
+
+ for (unsigned br = j; br < j + high_class_sizes[i]; br++)
+ ra_add_transitive_reg_conflict(set->regs, br + first_high_reg, reg);
+
+ reg++;
+ }
+ }
+
+ /* starting a6xx, half precision regs conflict w/ full precision regs: */
+ if (compiler->gpu_id >= 600) {
+ /* because of transitivity, we can get away with just setting up
+ * conflicts between the first class of full and half regs:
+ */
+ /* each scalar full reg j is paired with scalar half regs 2j and 2j+1;
+ * only the first CLASS_REGS(0) / 2 full regs are covered by this loop:
+ */
+ for (unsigned j = 0; j < CLASS_REGS(0) / 2; j++) {
+ unsigned freg = set->gpr_to_ra_reg[0][j];
+ unsigned hreg0 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 0];
+ unsigned hreg1 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 1];
+
+ ra_add_transitive_reg_conflict(set->regs, freg, hreg0);
+ ra_add_transitive_reg_conflict(set->regs, freg, hreg1);
+ }
+
+ // TODO also need to update q_values, but for now:
+ /* NOTE(review): the precomputed q_values do not account for the
+ * full/half conflicts added above, so NULL is passed instead;
+ * presumably ra_set_finalize() then computes them itself -- confirm
+ * against util/register_allocate.
+ */
+ ra_set_finalize(set->regs, NULL);
+ } else {
+ ra_set_finalize(set->regs, q_values);
+ }
+
+ /* q_values (and its rows, which are its ralloc children) are no
+ * longer referenced by this function after finalizing:
+ */
+ ralloc_free(q_values);
+
+ return set;
+}
+
+/* additional block-data (per-block), hung off block->data by
+ * ra_block_compute_live_ranges(). Each bitset has one bit per RA name
+ * (ctx->alloc_count bits in total).
+ */
+struct ir3_ra_block_data {
+ BITSET_WORD *def; /* variables defined before used in block */
+ BITSET_WORD *use; /* variables used before defined in block */
+ BITSET_WORD *livein; /* which defs reach entry point of block */
+ BITSET_WORD *liveout; /* which defs reach exit point of block */
+};
+
+/* additional instruction-data (per-instruction), stored in
+ * ctx->instrd[] and indexed by instr->ip
+ */
+struct ir3_ra_instr_data {
+ /* cached instruction 'definer' info: */
+ struct ir3_instruction *defn;
+ /* off: offset of this instruction's value within the definer's
+ * register group, sz: size of that group (both as computed by
+ * get_definer()).
+ * cls: combined class index from size_to_class(); -1 when
+ * writes_addr()/writes_pred() is true for the instruction, and
+ * total_class_count when its dst is an array.
+ */
+ int off, sz, cls;
+};
+
+/* register-assign context, per-shader */
+struct ir3_ra_ctx {
+ struct ir3 *ir;
+ /* copied from the ir3_ra() arguments: */
+ gl_shader_stage type;
+ bool frag_face;
+
+ /* shared register set (ir->compiler->set) and this shader's
+ * interference graph:
+ */
+ struct ir3_ra_reg_set *set;
+ struct ra_graph *g;
+ /* total number of RA names, ie. nodes in the interference graph: */
+ unsigned alloc_count;
+ /* one per class, plus one slot for arrays: */
+ unsigned class_alloc_count[total_class_count + 1];
+ /* first RA name of each class; a name is class_base[cls] + instr->name: */
+ unsigned class_base[total_class_count + 1];
+ /* number of instructions visited by ra_block_name_instructions(): */
+ unsigned instr_cnt;
+ /* indexed by RA name: ip of first def / ip of last use */
+ unsigned *def, *use; /* def/use table */
+ /* per-instruction data, indexed by instr->ip: */
+ struct ir3_ra_instr_data *instrd;
+};
+
+/* Do two ranges conflict?  True exactly when each range starts strictly
+ * before the other one ends, so ranges that merely touch (one ends at
+ * the ip/reg where the other starts) do not conflict.
+ */
+static inline bool
+intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
+{
+	return (a_start < b_end) && (b_start < a_end);
+}
+
+/* Is the instruction's dst register (regs[0]) flagged half precision? */
+static bool
+is_half(struct ir3_instruction *instr)
+{
+	const struct ir3_register *dst = instr->regs[0];
+
+	return (dst->flags & IR3_REG_HALF) != 0;
+}
+
+/* Is the instruction's dst register (regs[0]) flagged as a high reg? */
+static bool
+is_high(struct ir3_instruction *instr)
+{
+	const struct ir3_register *dst = instr->regs[0];
+
+	return (dst->flags & IR3_REG_HIGH) != 0;
+}
+
+/* Map a register group size to an index in the combined class
+ * numbering: the first class of the matching family (high takes
+ * precedence over half, otherwise full precision) whose size is >= sz.
+ * Asserts (debug builds) and returns -1 if no class is big enough.
+ */
+static int
+size_to_class(unsigned sz, bool half, bool high)
+{
+	const unsigned *sizes;
+	unsigned count, first;
+
+	if (high) {
+		sizes = high_class_sizes;
+		count = high_class_count;
+		first = HIGH_OFFSET;
+	} else if (half) {
+		sizes = half_class_sizes;
+		count = half_class_count;
+		first = HALF_OFFSET;
+	} else {
+		sizes = class_sizes;
+		count = class_count;
+		first = 0;
+	}
+
+	for (unsigned c = 0; c < count; c++) {
+		if (sizes[c] >= sz)
+			return c + first;
+	}
+
+	debug_assert(0);
+	return -1;
+}
+
+/* Does the instruction write a normal temp register?  False for
+ * stores, for const/immediate dst registers, and when the dst is
+ * regid(REG_A0, 0) or regid(REG_P0, 0).
+ */
+static bool
+writes_gpr(struct ir3_instruction *instr)
+{
+	const struct ir3_register *dst;
+
+	/* checked before regs[0] is looked at: */
+	if (is_store(instr))
+		return false;
+
+	dst = instr->regs[0];
+
+	if (dst->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+		return false;
+
+	return (dst->num != regid(REG_A0, 0)) &&
+			(dst->num != regid(REG_P0, 0));
+}
+
+/* Does 'a' come before 'b' (lower ip)?  An instruction flagged
+ * IR3_INSTR_UNUSED is never considered to come before anything.
+ */
+static bool
+instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
+{
+	return !(a->flags & IR3_INSTR_UNUSED) && (a->ip < b->ip);
+}
+
+/* Find the 'definer' of instr: the instruction which stands for the
+ * whole register group that instr's value belongs to when use/def
+ * information is built (see the 'definer' discussion in the comment at
+ * the top of this file).
+ *
+ * On return *sz holds the size of the group and *off the offset of
+ * instr's value within it. The result is cached in
+ * ctx->instrd[instr->ip] and served from there on later calls.
+ */
+static struct ir3_instruction *
+get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
+ int *sz, int *off)
+{
+ struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+ struct ir3_instruction *d = NULL;
+
+ /* already computed for this instruction: */
+ if (id->defn) {
+ *sz = id->sz;
+ *off = id->off;
+ return id->defn;
+ }
+
+ if (instr->opc == OPC_META_FI) {
+ /* What about the case where collect is subset of array, we
+ * need to find the distance between where actual array starts
+ * and fanin.. that probably doesn't happen currently.
+ */
+ struct ir3_register *src;
+ int dsz, doff;
+
+ /* note: don't use foreach_ssa_src as this gets called once
+ * while assigning regs (which clears SSA flag)
+ */
+ foreach_src_n(src, n, instr) {
+ struct ir3_instruction *dd;
+ if (!src->instr)
+ continue;
+
+ dd = get_definer(ctx, src->instr, &dsz, &doff);
+
+ /* keep the earliest definer among the sources; the offset is
+ * taken relative to source position n:
+ */
+ if ((!d) || instr_before(dd, d)) {
+ d = dd;
+ *sz = dsz;
+ *off = doff - n;
+ }
+ }
+ /* NOTE(review): if no source has a ->instr, d is still NULL here
+ * and is dereferenced by the 'd->opc' check further down.
+ */
+
+ } else if (instr->cp.right || instr->cp.left) {
+ /* covers also the meta:fo case, which ends up w/ single
+ * scalar instructions for each component:
+ */
+ struct ir3_instruction *f = ir3_neighbor_first(instr);
+
+ /* by definition, the entire sequence forms one linked list
+ * of single scalar register nodes (even if some of them may
+ * be fanouts from a texture sample (for example) instr. We
+ * just need to walk the list finding the first element of
+ * the group defined (lowest ip)
+ */
+ int cnt = 0;
+
+ /* need to skip over unused in the group: */
+ while (f && (f->flags & IR3_INSTR_UNUSED)) {
+ f = f->cp.right;
+ cnt++;
+ }
+
+ /* walk the rest of the neighbor list: d ends up as the earliest
+ * member, *off as instr's position and cnt as the list length:
+ */
+ while (f) {
+ if ((!d) || instr_before(f, d))
+ d = f;
+ if (f == instr)
+ *off = cnt;
+ f = f->cp.right;
+ cnt++;
+ }
+
+ *sz = cnt;
+
+ } else {
+ /* second case is looking directly at the instruction which
+ * produces multiple values (eg, texture sample), rather
+ * than the fanout nodes that point back to that instruction.
+ * This isn't quite right, because it may be part of a larger
+ * group, such as:
+ *
+ * sam (f32)(xyzw)r0.x, ...
+ * add r1.x, ...
+ * add r1.y, ...
+ * sam (f32)(xyzw)r2.x, r0.w <-- (r0.w, r1.x, r1.y)
+ *
+ * need to come up with a better way to handle that case.
+ */
+ if (instr->address) {
+ *sz = instr->regs[0]->size;
+ } else {
+ *sz = util_last_bit(instr->regs[0]->wrmask);
+ }
+ *off = 0;
+ d = instr;
+ }
+
+ /* if the definer found so far is itself a fanout, chase through to
+ * the definer of the fanout's source:
+ */
+ if (d->opc == OPC_META_FO) {
+ struct ir3_instruction *dd;
+ int dsz, doff;
+
+ dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);
+
+ /* by definition, should come before: */
+ debug_assert(instr_before(dd, d));
+
+ *sz = MAX2(*sz, dsz);
+
+ debug_assert(instr->opc == OPC_META_FO);
+ *off = MAX2(*off, instr->fo.off);
+
+ d = dd;
+ }
+
+ /* cache the result for subsequent lookups: */
+ id->defn = d;
+ id->sz = *sz;
+ id->off = *off;
+
+ return d;
+}
+
+/* First pass over a block: fill in ctx->instrd[] (definer, size, offset
+ * and class) for every instruction that has at least one register.
+ */
+static void
+ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		struct ir3_ra_instr_data *data;
+
+		if (instr->regs_count == 0)
+			continue;
+
+		data = &ctx->instrd[instr->ip];
+
+		/* special case: address/predicate register writers get no class */
+		if (writes_addr(instr) || writes_pred(instr)) {
+			data->cls = -1;
+			continue;
+		}
+
+		/* special case: array dsts use the extra slot after the real classes */
+		if (instr->regs[0]->flags & IR3_REG_ARRAY) {
+			data->cls = total_class_count;
+			continue;
+		}
+
+		data->defn = get_definer(ctx, instr, &data->sz, &data->off);
+		data->cls = size_to_class(data->sz,
+				is_half(data->defn), is_high(data->defn));
+	}
+}
+
+/* Second pass over a block: count the instructions (ctx->instr_cnt) and
+ * hand out a per-class name to every instruction that is its own
+ * definer and writes a gpr, counting up the # of names of each class
+ * and the total (ctx->alloc_count).
+ */
+static void
+ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		const struct ir3_ra_instr_data *data = &ctx->instrd[instr->ip];
+
+#ifdef DEBUG
+		instr->name = ~0;
+#endif
+
+		ctx->instr_cnt++;
+
+		/* only instructions that write a gpr and are their own definer
+		 * get a name:
+		 */
+		if ((instr->regs_count == 0) || !writes_gpr(instr) ||
+				(data->defn != instr))
+			continue;
+
+		/* arrays which don't fit in one of the pre-defined class
+		 * sizes are pre-colored, and cls == -1 is not allocated at
+		 * all, so neither gets a name here:
+		 */
+		if ((data->cls < 0) || !(data->cls < total_class_count))
+			continue;
+
+		instr->name = ctx->class_alloc_count[data->cls]++;
+		ctx->alloc_count++;
+	}
+}
+
+/* Set up the per-shader RA state: per-instruction data, RA names for
+ * instructions and array elements, the interference graph and the
+ * def/use tables. ctx->ir and ctx->set must already be filled in.
+ */
+static void
+ra_init(struct ir3_ra_ctx *ctx)
+{
+ unsigned n, base;
+
+ ir3_clear_mark(ctx->ir);
+ n = ir3_count_instructions(ctx->ir);
+
+ /* allocated without a ralloc parent for now, since the graph does not
+ * exist yet; it is re-parented to ctx->g further down:
+ */
+ ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);
+
+ /* all definers must be known before names are handed out, hence two
+ * separate passes over the blocks:
+ */
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ ra_block_find_definers(ctx, block);
+ }
+
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ ra_block_name_instructions(ctx, block);
+ }
+
+ /* figure out the base register name for each class. The
+ * actual ra name is class_base[cls] + instr->name;
+ */
+ ctx->class_base[0] = 0;
+ for (unsigned i = 1; i <= total_class_count; i++) {
+ ctx->class_base[i] = ctx->class_base[i-1] +
+ ctx->class_alloc_count[i-1];
+ }
+
+ /* and vreg names for array elements: */
+ /* (one name per array element, placed after all class names) */
+ base = ctx->class_base[total_class_count];
+ list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+ arr->base = base;
+ ctx->class_alloc_count[total_class_count] += arr->length;
+ base += arr->length;
+ }
+ ctx->alloc_count += ctx->class_alloc_count[total_class_count];
+
+ ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
+ /* tie instrd and the def/use tables to the graph, so the single
+ * ralloc_free(ctx->g) in ra_destroy() releases all of them:
+ */
+ ralloc_steal(ctx->g, ctx->instrd);
+ ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
+ ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
+}
+
+/* Compute the RA name (interference graph node) of a definer: the base
+ * name of its class plus the per-class name given to the definer.
+ * 'cls' must be a real class index; arrays are not handled here.
+ */
+static unsigned
+__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
+{
+	debug_assert(cls >= 0);
+	debug_assert(cls < total_class_count);  /* we shouldn't get arrays here.. */
+
+	const unsigned node = ctx->class_base[cls] + defn->name;
+
+	debug_assert(node < ctx->alloc_count);
+
+	return node;
+}
+
+/* RA name for the definer/class recorded in a per-instruction data
+ * entry.
+ */
+static int
+ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
+{
+	struct ir3_instruction *definer = id->defn;
+	const int cls = id->cls;
+
+	/* TODO handle name mapping for arrays */
+	return __ra_name(ctx, cls, definer);
+}
+
+/* Tear down the per-shader RA state by freeing the interference graph.
+ * ra_init() parents ctx->instrd, ctx->def and ctx->use to the graph.
+ */
+static void
+ra_destroy(struct ir3_ra_ctx *ctx)
+{
+	struct ra_graph *graph = ctx->g;
+
+	ralloc_free(graph);
+}
+
+/* Per-block pass which:
+ * - allocates the block's ir3_ra_block_data and stores it in block->data
+ * - records first-def / last-use ip per RA name in ctx->def[] / ctx->use[]
+ * - fills the block-local def/use bitsets consumed by
+ * ra_compute_livein_liveout()
+ * - sets the node class of every name defined in the block
+ * - updates start_ip/end_ip of arrays accessed in the block
+ */
+static void
+ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+ struct ir3_ra_block_data *bd;
+ unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+
+/* Record a write of 'name' by 'instr'. Both helper macros rely on the
+ * locals 'ctx' and 'bd' being in scope.
+ * NOTE(review): a def ip of 0 is indistinguishable from "not defined
+ * yet" in the !ctx->def[name] test; the macros also carry a trailing
+ * ';' after while(0) and are not #undef'd within this function.
+ */
+#define def(name, instr) \
+ do { \
+ /* defined on first write: */ \
+ if (!ctx->def[name]) \
+ ctx->def[name] = instr->ip; \
+ ctx->use[name] = instr->ip; \
+ BITSET_SET(bd->def, name); \
+ } while(0);
+
+/* Record a read of 'name' by 'instr'; the block-level use bit is only
+ * set when the name has not already been defined in this block.
+ */
+#define use(name, instr) \
+ do { \
+ ctx->use[name] = MAX2(ctx->use[name], instr->ip); \
+ if (!BITSET_TEST(bd->def, name)) \
+ BITSET_SET(bd->use, name); \
+ } while(0);
+
+ bd = rzalloc(ctx->g, struct ir3_ra_block_data);
+
+ bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words);
+ bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words);
+ bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words);
+ bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
+
+ block->data = bd;
+
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ struct ir3_instruction *src;
+ struct ir3_register *reg;
+
+ if (instr->regs_count == 0)
+ continue;
+
+ /* There are a couple special cases to deal with here:
+ *
+ * fanout: used to split values from a higher class to a lower
+ * class, for example split the results of a texture fetch
+ * into individual scalar values; We skip over these from
+ * a 'def' perspective, and for a 'use' we walk the chain
+ * up to the defining instruction.
+ *
+ * fanin: used to collect values from lower class and assemble
+ * them together into a higher class, for example arguments
+ * to texture sample instructions; We consider these to be
+ * defined at the earliest fanin source.
+ *
+ * Most of this is handled in the get_definer() helper.
+ *
+ * In either case, we trace the instruction back to the original
+ * definer and consider that as the def/use ip.
+ */
+
+ /* destination side: */
+ if (writes_gpr(instr)) {
+ struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+ struct ir3_register *dst = instr->regs[0];
+
+ if (dst->flags & IR3_REG_ARRAY) {
+ struct ir3_array *arr =
+ ir3_lookup_array(ctx->ir, dst->array.id);
+ unsigned i;
+
+ arr->start_ip = MIN2(arr->start_ip, instr->ip);
+ arr->end_ip = MAX2(arr->end_ip, instr->ip);
+
+ /* set the node class now.. in case we don't encounter
+ * this array dst again. From register_alloc algo's
+ * perspective, these are all single/scalar regs:
+ */
+ for (i = 0; i < arr->length; i++) {
+ unsigned name = arr->base + i;
+ ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
+ }
+
+ /* indirect write is treated like a write to all array
+ * elements, since we don't know which one is actually
+ * written:
+ */
+ if (dst->flags & IR3_REG_RELATIV) {
+ for (i = 0; i < arr->length; i++) {
+ unsigned name = arr->base + i;
+ def(name, instr);
+ }
+ } else {
+ unsigned name = arr->base + dst->array.offset;
+ def(name, instr);
+ }
+
+ } else if (id->defn == instr) {
+ /* only the definer itself creates the def; other members of
+ * the same register group are skipped here:
+ */
+ unsigned name = ra_name(ctx, id);
+
+ /* since we are in SSA at this point: */
+ debug_assert(!BITSET_TEST(bd->use, name));
+
+ def(name, id->defn);
+
+ /* id->cls is an index in the combined class numbering, so
+ * rebase it into the per-family class handle arrays:
+ */
+ if (is_high(id->defn)) {
+ ra_set_node_class(ctx->g, name,
+ ctx->set->high_classes[id->cls - HIGH_OFFSET]);
+ } else if (is_half(id->defn)) {
+ ra_set_node_class(ctx->g, name,
+ ctx->set->half_classes[id->cls - HALF_OFFSET]);
+ } else {
+ ra_set_node_class(ctx->g, name,
+ ctx->set->classes[id->cls]);
+ }
+ }
+ }
+
+ /* source side: */
+ foreach_src(reg, instr) {
+ if (reg->flags & IR3_REG_ARRAY) {
+ struct ir3_array *arr =
+ ir3_lookup_array(ctx->ir, reg->array.id);
+ arr->start_ip = MIN2(arr->start_ip, instr->ip);
+ arr->end_ip = MAX2(arr->end_ip, instr->ip);
+
+ /* indirect read is treated like a read from all array
+ * elements, since we don't know which one is actually
+ * read:
+ */
+ if (reg->flags & IR3_REG_RELATIV) {
+ unsigned i;
+ for (i = 0; i < arr->length; i++) {
+ unsigned name = arr->base + i;
+ use(name, instr);
+ }
+ } else {
+ unsigned name = arr->base + reg->array.offset;
+ use(name, instr);
+ /* NOTE: arrays are not SSA so unconditionally
+ * set use bit:
+ */
+ BITSET_SET(bd->use, name);
+ debug_assert(reg->array.offset < arr->length);
+ }
+ } else if ((src = ssa(reg)) && writes_gpr(src)) {
+ /* SSA source: the use is accounted to the name of the
+ * source instruction's definer:
+ */
+ unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
+ use(name, instr);
+ }
+ }
+ }
+}
+
+/* One iteration of the livein/liveout dataflow over all blocks (in
+ * block-list order):
+ *
+ *   livein(b)  |= use(b) | (liveout(b) & ~def(b))
+ *   liveout(b) |= livein(s)   for each successor s of b
+ *
+ * Returns true if any bit was added, ie. the caller should iterate
+ * again until a fixed point is reached.
+ */
+static bool
+ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
+{
+	const unsigned nwords = BITSET_WORDS(ctx->alloc_count);
+	bool changed = false;
+
+	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+		struct ir3_ra_block_data *bd = block->data;
+
+		/* update livein: */
+		for (unsigned w = 0; w < nwords; w++) {
+			const BITSET_WORD upward_exposed = bd->use[w];
+			const BITSET_WORD passed_through = bd->liveout[w] & ~bd->def[w];
+			const BITSET_WORD in = upward_exposed | passed_through;
+
+			if (in & ~bd->livein[w]) {
+				bd->livein[w] |= in;
+				changed = true;
+			}
+		}
+
+		/* update liveout: */
+		for (unsigned s = 0; s < ARRAY_SIZE(block->successors); s++) {
+			struct ir3_block *succ = block->successors[s];
+
+			if (succ) {
+				struct ir3_ra_block_data *succ_bd = succ->data;
+
+				for (unsigned w = 0; w < nwords; w++) {
+					/* bits live into the successor that we don't have yet: */
+					const BITSET_WORD added =
+						succ_bd->livein[w] & ~bd->liveout[w];
+
+					if (added) {
+						bd->liveout[w] |= added;
+						changed = true;
+					}
+				}
+			}
+		}
+	}
+
+	return changed;
+}
+
+/* Debug helper: print 'name' followed by the comma separated indices
+ * (zero padded to four digits) of all bits set among the first 'cnt'
+ * bits of 'bs', via debug_printf().
+ */
+static void
+print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
+{
+	unsigned printed = 0;
+
+	debug_printf(" %s:", name);
+	for (unsigned bit = 0; bit < cnt; bit++) {
+		if (!BITSET_TEST(bs, bit))
+			continue;
+		/* separator goes before every entry but the first: */
+		if (printed > 0)
+			debug_printf(",");
+		debug_printf(" %04u", bit);
+		printed++;
+	}
+	debug_printf("\n");
+}
+
+/* Build the interference graph:
+ *
+ *  1) compute per-name def/use ip's and per-block def/use bitsets
+ *  2) iterate livein/liveout to a fixed point
+ *  3) stretch the def/use (and array start/end) ranges over the blocks
+ *     a value is live into / out of
+ *  4) keep shader outputs live until the end of the program
+ *  5) mark every pair of names whose [def, use) ranges intersect as
+ *     interfering
+ */
+static void
+ra_add_interference(struct ir3_ra_ctx *ctx)
+{
+	struct ir3 *ir = ctx->ir;
+
+	/* initialize array live ranges: */
+	list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
+		arr->start_ip = ~0;
+		arr->end_ip = 0;
+	}
+
+	/* compute live ranges (use/def) on a block level, also updating
+	 * block's def/use bitmasks (used below to calculate per-block
+	 * livein/liveout):
+	 */
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		ra_block_compute_live_ranges(ctx, block);
+	}
+
+	/* update per-block livein/liveout: */
+	while (ra_compute_livein_liveout(ctx)) {}
+
+	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+		debug_printf("AFTER LIVEIN/OUT:\n");
+		ir3_print(ir);
+		list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+			struct ir3_ra_block_data *bd = block->data;
+			debug_printf("block%u:\n", block_id(block));
+			print_bitset(" def", bd->def, ctx->alloc_count);
+			print_bitset(" use", bd->use, ctx->alloc_count);
+			print_bitset(" l/i", bd->livein, ctx->alloc_count);
+			print_bitset(" l/o", bd->liveout, ctx->alloc_count);
+		}
+		list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
+			debug_printf("array%u:\n", arr->id);
+			debug_printf(" length: %u\n", arr->length);
+			debug_printf(" start_ip: %u\n", arr->start_ip);
+			debug_printf(" end_ip: %u\n", arr->end_ip);
+		}
+	}
+
+	/* extend start/end ranges based on livein/liveout info from cfg: */
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		struct ir3_ra_block_data *bd = block->data;
+
+		for (unsigned i = 0; i < ctx->alloc_count; i++) {
+			if (BITSET_TEST(bd->livein, i)) {
+				ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
+				ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
+			}
+
+			if (BITSET_TEST(bd->liveout, i)) {
+				ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
+				ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
+			}
+		}
+
+		list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
+			for (unsigned i = 0; i < arr->length; i++) {
+				const unsigned name = arr->base + i;
+
+				/* live into the block: range starts no later than the
+				 * block does
+				 */
+				if (BITSET_TEST(bd->livein, name)) {
+					arr->start_ip = MIN2(arr->start_ip, block->start_ip);
+				}
+				/* live out of the block: range ends no earlier than the
+				 * block does.  (This must test liveout, mirroring the
+				 * scalar case above, rather than livein a second time.)
+				 */
+				if (BITSET_TEST(bd->liveout, name)) {
+					arr->end_ip = MAX2(arr->end_ip, block->end_ip);
+				}
+			}
+		}
+	}
+
+	/* need to fix things up to keep outputs live: */
+	for (unsigned i = 0; i < ir->noutputs; i++) {
+		struct ir3_instruction *instr = ir->outputs[i];
+		unsigned name;
+
+		/* output slots may be NULL (ir3_print() skips them too): */
+		if (!instr)
+			continue;
+
+		name = ra_name(ctx, &ctx->instrd[instr->ip]);
+		ctx->use[name] = ctx->instr_cnt;
+	}
+
+	/* every pair of names with overlapping ranges interferes: */
+	for (unsigned i = 0; i < ctx->alloc_count; i++) {
+		for (unsigned j = 0; j < ctx->alloc_count; j++) {
+			if (intersects(ctx->def[i], ctx->use[i],
+					ctx->def[j], ctx->use[j])) {
+				ra_add_node_interference(ctx->g, i, j);
+			}
+		}
+	}
+}
+
+/* Some instructions need patching once their dst register turns out to
+ * be half precision:
+ *
+ *   cat1 (moves) - dst_type is replaced by half_type(dst_type)
+ *   cat3         - 32-bit mad/sel/sad opcodes become their 16-bit
+ *                  forms; any other cat3 opcode trips an assert
+ *   cat5         - type is replaced by half_type(type)
+ *
+ * Instructions of all other categories are left untouched.
+ */
+static void fixup_half_instr_dst(struct ir3_instruction *instr)
+{
+	switch (opc_cat(instr->opc)) {
+	case 5:
+		instr->cat5.type = half_type(instr->cat5.type);
+		return;
+	case 1: /* move instructions */
+		instr->cat1.dst_type = half_type(instr->cat1.dst_type);
+		return;
+	case 3:
+		/* handled by opcode below */
+		break;
+	default:
+		return;
+	}
+
+	switch (instr->opc) {
+	/* instructions may already be fixed up: */
+	case OPC_MAD_F16:
+	case OPC_SEL_B16:
+	case OPC_SEL_S16:
+	case OPC_SEL_F16:
+	case OPC_SAD_S16:
+		return;
+	case OPC_MAD_F32:
+		instr->opc = OPC_MAD_F16;
+		return;
+	case OPC_SEL_B32:
+		instr->opc = OPC_SEL_B16;
+		return;
+	case OPC_SEL_S32:
+		instr->opc = OPC_SEL_S16;
+		return;
+	case OPC_SEL_F32:
+		instr->opc = OPC_SEL_F16;
+		return;
+	case OPC_SAD_S32:
+		instr->opc = OPC_SAD_S16;
+		return;
+	default:
+		assert(0);
+		return;
+	}
+}
+/* Some instructions need patching if a src register is half precision.
+ * Currently only OPC_MOV is affected: its cat1 src_type is replaced by
+ * half_type(src_type).
+ */
+static void fixup_half_instr_src(struct ir3_instruction *instr)
+{
+	if (instr->opc != OPC_MOV)
+		return;
+
+	instr->cat1.src_type = half_type(instr->cat1.src_type);
+}
+
+/* Rewrite one register operand with the physical register chosen by
+ * the allocator. 'reg' is the operand to patch and 'instr' the
+ * instruction that produces its value (the instruction itself for a
+ * dst, the source instruction for a src).
+ *
+ * NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first
+ * array access(es) which do not have any previous access to depend
+ * on from scheduling point of view
+ */
+static void
+reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
+ struct ir3_instruction *instr)
+{
+ struct ir3_ra_instr_data *id;
+
+ if (reg->flags & IR3_REG_ARRAY) {
+ /* array element names were pre-colored in ra_alloc(); instr is not
+ * touched on this path, which is what makes a NULL instr safe:
+ */
+ struct ir3_array *arr =
+ ir3_lookup_array(ctx->ir, reg->array.id);
+ unsigned name = arr->base + reg->array.offset;
+ unsigned r = ra_get_node_reg(ctx->g, name);
+ unsigned num = ctx->set->ra_reg_to_gpr[r];
+
+ /* for relative access the resolved register goes into array.offset
+ * rather than num:
+ */
+ if (reg->flags & IR3_REG_RELATIV) {
+ reg->array.offset = num;
+ } else {
+ reg->num = num;
+ reg->flags &= ~IR3_REG_SSA;
+ }
+
+ reg->flags &= ~IR3_REG_ARRAY;
+ } else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
+ /* register of the definer's group plus this value's offset in it: */
+ unsigned name = ra_name(ctx, id);
+ unsigned r = ra_get_node_reg(ctx->g, name);
+ unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
+
+ debug_assert(!(reg->flags & IR3_REG_RELATIV));
+
+ /* ra_reg_to_gpr[] is relative to the start of the high range for
+ * high classes:
+ */
+ if (is_high(id->defn))
+ num += FIRST_HIGH_REG;
+
+ reg->num = num;
+ reg->flags &= ~IR3_REG_SSA;
+
+ if (is_half(id->defn))
+ reg->flags |= IR3_REG_HALF;
+ }
+}
+
+/* Final pass over a block, after ra_allocate() succeeded: patch the
+ * dst and src registers of each instruction with their assigned
+ * registers, applying the half-precision instruction fix-ups where a
+ * register ended up flagged IR3_REG_HALF.
+ */
+static void
+ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		struct ir3_register *reg;
+
+		if (instr->regs_count == 0)
+			continue;
+
+		if (writes_gpr(instr)) {
+			struct ir3_register *dst = instr->regs[0];
+
+			reg_assign(ctx, dst, instr);
+			if (instr->regs[0]->flags & IR3_REG_HALF)
+				fixup_half_instr_dst(instr);
+		}
+
+		foreach_src_n(reg, n, instr) {
+			struct ir3_instruction *producer = reg->instr;
+
+			/* Note: reg->instr could be null for IR3_REG_ARRAY, so
+			 * only skip sources that have neither:
+			 */
+			if (!producer && !(reg->flags & IR3_REG_ARRAY))
+				continue;
+
+			reg_assign(ctx, instr->regs[n+1], producer);
+			if (instr->regs[n+1]->flags & IR3_REG_HALF)
+				fixup_half_instr_src(instr);
+		}
+	}
+}
+
+/* Run the allocation: pre-color the array elements, let the graph
+ * coloring allocator assign everything else, then write the results
+ * back into the instructions.
+ *
+ * Returns 0 on success, -1 if ra_allocate() fails.
+ */
+static int
+ra_alloc(struct ir3_ra_ctx *ctx)
+{
+ /* pre-assign array elements:
+ */
+ list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+ unsigned base = 0;
+
+ /* end_ip is still 0 if no access to the array was recorded: */
+ if (arr->end_ip == 0)
+ continue;
+
+ /* figure out what else we conflict with which has already
+ * been assigned:
+ */
+ /* Only arrays earlier in the list (those already placed) are
+ * considered. On a conflict, base is bumped past the conflicting
+ * array and the scan restarts from the first array.
+ */
+retry:
+ list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
+ if (arr2 == arr)
+ break;
+ if (arr2->end_ip == 0)
+ continue;
+ /* if it intersects with liverange AND register range.. */
+ if (intersects(arr->start_ip, arr->end_ip,
+ arr2->start_ip, arr2->end_ip) &&
+ intersects(base, base + arr->length,
+ arr2->reg, arr2->reg + arr2->length)) {
+ base = MAX2(base, arr2->reg + arr2->length);
+ goto retry;
+ }
+ }
+
+ arr->reg = base;
+
+ /* pin each element's name to consecutive scalar full-precision
+ * regs (class 0) starting at base.
+ * NOTE(review): base is not checked against CLASS_REGS(0) before
+ * indexing gpr_to_ra_reg[0][].
+ */
+ for (unsigned i = 0; i < arr->length; i++) {
+ unsigned name, reg;
+
+ name = arr->base + i;
+ reg = ctx->set->gpr_to_ra_reg[0][base++];
+
+ ra_set_node_reg(ctx->g, name, reg);
+ }
+ }
+
+ if (!ra_allocate(ctx->g))
+ return -1;
+
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ ra_block_alloc(ctx, block);
+ }
+
+ return 0;
+}
+
+/* Register allocation entry point for one shader.
+ *
+ * Sets up the RA context, builds the interference graph, assigns
+ * registers and releases the per-shader RA state again.  'frag_coord'
+ * is accepted but not used by this function.
+ *
+ * Returns the result of ra_alloc(): 0 on success, -1 on failure.
+ */
+int
+ir3_ra(struct ir3 *ir, gl_shader_stage type,
+		bool frag_coord, bool frag_face)
+{
+	struct ir3_ra_ctx ctx = {
+		.ir = ir,
+		.type = type,
+		.frag_face = frag_face,
+		.set = ir->compiler->set,
+	};
+
+	ra_init(&ctx);
+	ra_add_interference(&ctx);
+
+	const int result = ra_alloc(&ctx);
+
+	/* the context is torn down whether or not allocation succeeded: */
+	ra_destroy(&ctx);
+
+	return result;
+}
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
new file mode 100644
index 00000000000..6552980d90c
--- /dev/null
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -0,0 +1,818 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+/*
+ * Instruction Scheduling:
+ *
+ * A recursive depth based scheduling algo. Recursively find an eligible
+ * instruction to schedule from the deepest instruction (recursing through
+ * its unscheduled src instructions). Normally this would result in a
+ * lot of re-traversal of the same instructions, so we cache results in
+ * instr->data (and clear cached results that would be no longer valid
+ * after scheduling an instruction).
+ *
+ * There are a few special cases that need to be handled, since sched
+ * is currently independent of register allocation. Usages of address
+ * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
+ * if you have two pairs of instructions that write the same special
+ * register and then read it, then those pairs cannot be interleaved.
+ * To solve this, when we are in such a scheduling "critical section",
+ * and we encounter a conflicting write to a special register, we try
+ * to schedule any remaining instructions that use that value first.
+ */
+
+/* Scheduler state; one instance is created in ir3_sched() and reused for
+ * every block.
+ */
+struct ir3_sched_ctx {
+ struct ir3_block *block; /* the block currently being scheduled */
+ struct list_head depth_list; /* depth sorted unscheduled instrs of that block */
+ struct ir3_instruction *scheduled; /* last instr passed to schedule() XXX remove*/
+ struct ir3_instruction *addr; /* instr that wrote a0.x (set in schedule()), NULL if none */
+ struct ir3_instruction *pred; /* instr that wrote p0.x (set in schedule()), NULL if none */
+ bool error; /* set by sched_block() when nothing can be scheduled */
+};
+
+/* True when is_sfu() or is_mem() holds for the instruction. */
+static bool is_sfu_or_mem(struct ir3_instruction *instr)
+{
+	if (is_sfu(instr))
+		return true;
+	return is_mem(instr);
+}
+
+/* Sentinel stored in instr->data by find_instr_recursive() to cache a
+ * negative result; distinct from NULL, which means "nothing cached".
+ */
+#define NULL_INSTR ((void *)~0)
+
+/* Invalidate cached search results on the unscheduled (depth) list.
+ *
+ * With instr == NULL every entry is cleared.  Otherwise only entries whose
+ * cached candidate is 'instr', or which cached a negative result
+ * (NULL_INSTR), are cleared.
+ */
+static void
+clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+	list_for_each_entry (struct ir3_instruction, cur, &ctx->depth_list, node) {
+		bool stale = !instr ||
+			(cur->data == instr) ||
+			(cur->data == NULL_INSTR);
+
+		if (stale)
+			cur->data = NULL;
+	}
+}
+
+/* Commit 'instr' as the next instruction of the current block: take it off
+ * the depth list, record it as the a0.x / p0.x writer if applicable, mark
+ * it scheduled, append it to the block and invalidate cached search
+ * results.
+ */
+static void
+schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+	struct ir3_instruction *last = ctx->scheduled;
+	const bool sets_addr = writes_addr(instr);
+	const bool sets_pred = writes_pred(instr);
+
+	debug_assert(ctx->block == instr->block);
+
+	/* Two back-to-back sfu/mem instructions get a nop stuffed between
+	 * them.  Maybe there is a better way to handle this.. ideally we'd
+	 * know about this constraint in the scheduling and depth calculation.
+	 */
+	if (last && is_sfu_or_mem(last) && is_sfu_or_mem(instr))
+		ir3_NOP(ctx->block);
+
+	/* unlink from the depth list: */
+	list_delinit(&instr->node);
+
+	if (sets_addr) {
+		debug_assert(ctx->addr == NULL);
+		ctx->addr = instr;
+	}
+
+	if (sets_pred) {
+		debug_assert(ctx->pred == NULL);
+		ctx->pred = instr;
+	}
+
+	instr->flags |= IR3_INSTR_MARK;
+
+	list_addtail(&instr->node, &instr->block->instr_list);
+	ctx->scheduled = instr;
+
+	/* addr/pred writes and inputs flush the whole cache; anything else
+	 * only invalidates the entries that depend on this instruction:
+	 */
+	if (sets_addr || sets_pred || is_input(instr))
+		clear_cache(ctx, NULL);
+	else
+		clear_cache(ctx, instr);
+}
+
+/* Pick the non-NULL entry of srcs[] with the greatest ->depth (the first
+ * such entry wins ties), clear its slot in the array and return it.
+ * Returns NULL when every slot is NULL (or nsrcs is 0).
+ */
+static struct ir3_instruction *
+deepest(struct ir3_instruction **srcs, unsigned nsrcs)
+{
+	struct ir3_instruction *best = NULL;
+	unsigned best_idx = 0;
+
+	for (unsigned i = 0; i < nsrcs; i++) {
+		struct ir3_instruction *cand = srcs[i];
+
+		if (!cand)
+			continue;
+
+		if (!best || (cand->depth > best->depth)) {
+			best = cand;
+			best_idx = i;
+		}
+	}
+
+	if (!best)
+		return NULL;
+
+	/* consume the slot so the next call yields the next-deepest: */
+	srcs[best_idx] = NULL;
+
+	return best;
+}
+
+/**
+ * Count the instruction slots between the end of a block and 'instr'.
+ *
+ * @block: the block to search in, starting from the end; in the first
+ *    pass this is the block the instruction would be inserted into
+ *    (but has not been yet, ie. it only contains already scheduled
+ *    instructions).  In the second pass (sched_intra_block()) this
+ *    is one of the predecessor blocks.
+ * @instr: the instruction to search for
+ * @maxd:  max distance; bail after searching this number of instruction
+ *    slots, since it means the instruction we are looking for is
+ *    far enough away
+ * @pred:  if true, recursively search into predecessor blocks to
+ *    find the worst case (shortest) distance (only possible after
+ *    individual blocks are all scheduled)
+ */
+static unsigned
+distance(struct ir3_block *block, struct ir3_instruction *instr,
+		unsigned maxd, bool pred)
+{
+	unsigned count = 0;
+	unsigned shortest;
+
+	list_for_each_entry_rev (struct ir3_instruction, cur, &block->instr_list, node) {
+		bool takes_slot;
+
+		if (cur == instr)
+			return count;
+		if (count >= maxd)
+			return count;
+
+		/* NOTE: don't count branch/jump since we don't know yet if they
+		 * will be eliminated later in resolve_jumps().. really should do
+		 * that earlier so we don't have this constraint.
+		 */
+		takes_slot = is_alu(cur);
+		if (!takes_slot && is_flow(cur))
+			takes_slot = (cur->opc != OPC_JUMP) && (cur->opc != OPC_BR);
+
+		if (takes_slot)
+			count++;
+	}
+
+	/* not found in this block, and not allowed to look further: assume
+	 * it is assigned far enough away.. we'll fix up later.
+	 */
+	if (!pred)
+		return maxd;
+
+	/* block->data == block means we are already inside this block
+	 * further up the recursion:
+	 */
+	if (block->data == block)
+		return count;
+
+	/* Search into predecessor blocks, finding the one with the
+	 * shortest distance, since that will be the worst case
+	 */
+	shortest = maxd - count;
+
+	/* (ab)use block->data to prevent recursion: */
+	block->data = block;
+
+	for (unsigned i = 0; i < block->predecessors_count; i++) {
+		unsigned d = distance(block->predecessors[i], instr, shortest, pred);
+
+		shortest = MIN2(shortest, d);
+	}
+
+	block->data = NULL;
+
+	return count + shortest;
+}
+
+/* Remaining delay before 'consumer' may read source #srcn produced by
+ * 'assigner'.
+ *
+ * A meta assigner is looked through: the result is the maximum over its
+ * own ssa sources.  Otherwise the required slot count (a fixed 4 for sfu
+ * producers in 'soft' mode, ir3_delayslots() in every other case) is
+ * reduced by the distance() already separating the two.
+ */
+static unsigned
+delay_calc_srcn(struct ir3_block *block,
+		struct ir3_instruction *assigner,
+		struct ir3_instruction *consumer,
+		unsigned srcn, bool soft, bool pred)
+{
+	unsigned needed;
+
+	if (is_meta(assigner)) {
+		struct ir3_instruction *src;
+		unsigned worst = 0;
+
+		foreach_ssa_src(src, assigner) {
+			unsigned d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
+
+			worst = MAX2(worst, d);
+		}
+
+		return worst;
+	}
+
+	if (soft && is_sfu(assigner))
+		needed = 4;
+	else
+		needed = ir3_delayslots(assigner, consumer, srcn);
+
+	return needed - distance(block, assigner, needed, pred);
+}
+
+/* Delay required before 'instr' can issue: the largest per-source delay
+ * reported by delay_calc_srcn() over all of its ssa sources (0 if it has
+ * none).
+ */
+static unsigned
+delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+		bool soft, bool pred)
+{
+	struct ir3_instruction *src;
+	unsigned worst = 0;
+
+	foreach_ssa_src_n(src, i, instr) {
+		unsigned d = delay_calc_srcn(block, src, instr, i, soft, pred);
+
+		if (d > worst)
+			worst = d;
+	}
+
+	return worst;
+}
+
+struct ir3_sched_notes {
+ /* set by check_instr() when a kill was rejected because at least
+ * one bary.f is still unscheduled (not read back by sched_block()):
+ */
+ bool blocked_kill;
+ /* set by check_instr() when an instruction was rejected only because
+ * the address/predicate register is still in use; sched_block() uses
+ * these to decide between split_addr() and split_pred():
+ */
+ bool addr_conflict, pred_conflict;
+};
+
+/* An instruction counts as scheduled once IR3_INSTR_MARK is set on it. */
+static bool is_scheduled(struct ir3_instruction *instr)
+{
+	return (instr->flags & IR3_INSTR_MARK) != 0;
+}
+
+/* Would 'instr' have all of its ssa sources scheduled if 'src' were
+ * scheduled?  Ie. is every ssa source other than 'src' already scheduled?
+ */
+static bool
+could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
+{
+	struct ir3_instruction *dep;
+
+	foreach_ssa_src(dep, instr) {
+		if (dep == src)
+			continue;
+
+		/* some other dependency is still pending: */
+		if (!is_scheduled(dep))
+			return false;
+	}
+
+	return true;
+}
+
+/* Decide whether 'instr' may be scheduled right now, ie. that it is not
+ * blocked by use of the addr/predicate register, etc.
+ *
+ * Returns false when:
+ *  - instr writes a0.x but no indirect user of it would be ready once it
+ *    is scheduled (nothing is recorded in 'notes' for this case),
+ *  - instr writes a0.x while ctx->addr is set  -> notes->addr_conflict,
+ *  - instr writes p0.x while ctx->pred is set  -> notes->pred_conflict,
+ *  - instr is a kill and some bary.f not flagged IR3_INSTR_UNUSED is
+ *    still unscheduled                          -> notes->blocked_kill.
+ */
+static bool
+check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+		struct ir3_instruction *instr)
+{
+	const bool sets_addr = writes_addr(instr);
+
+	if (sets_addr) {
+		/* For instructions that write address register we need to
+		 * make sure there is at least one instruction that uses the
+		 * addr value which is otherwise ready.
+		 *
+		 * TODO if any instructions use pred register and have other
+		 * src args, we would need to do the same for writes_pred()..
+		 */
+		struct ir3 *ir = instr->block->shader;
+		bool user_ready = false;
+
+		for (unsigned i = 0; i < ir->indirects_count; i++) {
+			struct ir3_instruction *user = ir->indirects[i];
+
+			if (!user || (user->address != instr))
+				continue;
+
+			if (could_sched(user, instr)) {
+				user_ready = true;
+				break;
+			}
+		}
+
+		/* nothing could be scheduled, so keep looking: */
+		if (!user_ready)
+			return false;
+
+		/* the address register is currently in use, so we need to
+		 * defer until it is free:
+		 */
+		if (ctx->addr) {
+			debug_assert(ctx->addr != instr);
+			notes->addr_conflict = true;
+			return false;
+		}
+	}
+
+	/* same deferral for the predicate register: */
+	if (writes_pred(instr) && ctx->pred) {
+		debug_assert(ctx->pred != instr);
+		notes->pred_conflict = true;
+		return false;
+	}
+
+	if (!is_kill(instr))
+		return true;
+
+	/* if the instruction is a kill, we need to ensure *every*
+	 * bary.f is scheduled.  The hw seems unhappy if the thread
+	 * gets killed before the end-input (ei) flag is hit.
+	 *
+	 * We could do this by adding each bary.f instruction as
+	 * virtual ssa src for the kill instruction.  But we have
+	 * fixed length instr->regs[].
+	 *
+	 * TODO this wouldn't be quite right if we had multiple
+	 * basic blocks, if any block was conditional.  We'd need
+	 * to schedule the bary.f's outside of any block which
+	 * was conditional that contained a kill.. I think..
+	 */
+	struct ir3 *shader = instr->block->shader;
+
+	for (unsigned i = 0; i < shader->baryfs_count; i++) {
+		struct ir3_instruction *baryf = shader->baryfs[i];
+
+		if (baryf->flags & IR3_INSTR_UNUSED)
+			continue;
+
+		if (!is_scheduled(baryf)) {
+			notes->blocked_kill = true;
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/* Find the best instruction to schedule from specified instruction or
+ * recursively its ssa sources.
+ *
+ * Returns the candidate, or NULL if 'instr' is already scheduled or
+ * nothing at or below it passes check_instr().  Results are cached in
+ * instr->data (NULL_INSTR caches a negative result); the one exception
+ * is an instruction whose srcs are all scheduled but which fails
+ * check_instr(): that result is returned without being cached.
+ * 'notes' is filled in by check_instr() as a side effect.
+ */
+static struct ir3_instruction *
+find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+ struct ir3_instruction *instr)
+{
+ /* NOTE(review): variable-length array sized by __ssa_src_cnt(); confirm
+ * that count can never be zero here.
+ */
+ struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
+ struct ir3_instruction *src;
+ unsigned nsrcs = 0;
+
+ if (is_scheduled(instr))
+ return NULL;
+
+ /* use instr->data to cache the results of recursing up the
+ * instr src's. Otherwise the recursive algo can scale quite
+ * badly w/ shader size. But this takes some care to clear
+ * the cache appropriately when instructions are scheduled.
+ */
+ if (instr->data) {
+ if (instr->data == NULL_INSTR)
+ return NULL;
+ return instr->data;
+ }
+
+ /* find unscheduled srcs: */
+ foreach_ssa_src(src, instr) {
+ if (!is_scheduled(src)) {
+ debug_assert(nsrcs < ARRAY_SIZE(srcs));
+ srcs[nsrcs++] = src;
+ }
+ }
+
+ /* if all our src's are already scheduled: */
+ if (nsrcs == 0) {
+ if (check_instr(ctx, notes, instr)) {
+ instr->data = instr;
+ return instr;
+ }
+ return NULL;
+ }
+
+ /* try the unscheduled srcs deepest-first; deepest() consumes one
+ * entry of srcs[] per call and returns NULL once all are used:
+ */
+ while ((src = deepest(srcs, nsrcs))) {
+ struct ir3_instruction *candidate;
+
+ candidate = find_instr_recursive(ctx, notes, src);
+ if (!candidate)
+ continue;
+
+ if (check_instr(ctx, notes, candidate)) {
+ instr->data = candidate;
+ return candidate;
+ }
+ }
+
+ /* no src yielded a usable candidate; cache the negative result: */
+ instr->data = NULL_INSTR;
+ return NULL;
+}
+
+/* Find the instruction to schedule next: walk the depth list from its
+ * tail, resolve each entry to a candidate via find_instr_recursive(), and
+ * keep the candidate with the smallest delay_calc() result.  The walk
+ * stops early as soon as a zero-delay candidate is found.  Returns NULL if
+ * no entry yields a candidate.
+ */
+static struct ir3_instruction *
+find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+		bool soft)
+{
+	struct ir3_instruction *chosen = NULL;
+	unsigned least = ~0;
+
+	/* TODO we'd really rather use the list/array of block outputs.  But we
+	 * don't have such a thing.  Recursing *every* instruction in the list
+	 * will result in a lot of repeated traversal, since instructions will
+	 * get traversed both when they appear as ssa src to a later instruction
+	 * as well as where they appear in the depth_list.
+	 */
+	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+		struct ir3_instruction *cand = find_instr_recursive(ctx, notes, instr);
+		unsigned d;
+
+		if (!cand)
+			continue;
+
+		d = delay_calc(ctx->block, cand, soft, false);
+		if (d >= least)
+			continue;
+
+		chosen = cand;
+		least = d;
+
+		/* cannot do better than no delay at all: */
+		if (least == 0)
+			break;
+	}
+
+	return chosen;
+}
+
+/* "Spill" the address register: every unscheduled instruction that still
+ * depends on the current address-register writer is re-pointed at a
+ * single clone of that writer.  ctx->addr is cleared afterwards.
+ *
+ * Returns the (unscheduled) clone, or NULL if no instruction needed
+ * remapping.
+ */
+static struct ir3_instruction *
+split_addr(struct ir3_sched_ctx *ctx)
+{
+	struct ir3_instruction *clone = NULL;
+	struct ir3 *ir;
+
+	debug_assert(ctx->addr);
+
+	ir = ctx->addr->block->shader;
+
+	/* indirects_count / indirects[] are read afresh on every iteration,
+	 * not cached in locals: the helpers called in the body are opaque
+	 * from here and may touch them -- TODO confirm.
+	 */
+	for (unsigned i = 0; i < ir->indirects_count; i++) {
+		struct ir3_instruction *user = ir->indirects[i];
+
+		/* skip empty slots and instructions already scheduled: */
+		if (!user || is_scheduled(user))
+			continue;
+
+		/* only users of the current addr need remapping: */
+		if (user->address != ctx->addr)
+			continue;
+
+		if (!clone) {
+			clone = ir3_instr_clone(ctx->addr);
+			/* original addr is scheduled, but new one isn't: */
+			clone->flags &= ~IR3_INSTR_MARK;
+		}
+
+		ir3_instr_set_address(user, clone);
+	}
+
+	/* all remaining indirects remapped to new addr: */
+	ctx->addr = NULL;
+
+	return clone;
+}
+
+/* "Spill" the predicate register: every unscheduled instruction that
+ * still depends on the current predicate-register writer is re-pointed at
+ * a single clone of that writer.  ctx->pred is cleared afterwards.
+ *
+ * Returns the (unscheduled) clone, or NULL if no instruction needed
+ * remapping.
+ */
+static struct ir3_instruction *
+split_pred(struct ir3_sched_ctx *ctx)
+{
+	struct ir3_instruction *clone = NULL;
+	struct ir3 *ir;
+
+	debug_assert(ctx->pred);
+
+	ir = ctx->pred->block->shader;
+
+	for (unsigned i = 0; i < ir->predicates_count; i++) {
+		struct ir3_instruction *user = ir->predicates[i];
+
+		/* skip instructions already scheduled: */
+		if (is_scheduled(user))
+			continue;
+
+		/* only users of the current pred need remapping.
+		 *
+		 * TODO is there ever a case when pred isn't first
+		 * (and only) src?
+		 */
+		if (ssa(user->regs[1]) != ctx->pred)
+			continue;
+
+		if (!clone) {
+			clone = ir3_instr_clone(ctx->pred);
+			/* original pred is scheduled, but new one isn't: */
+			clone->flags &= ~IR3_INSTR_MARK;
+		}
+
+		user->regs[1]->instr = clone;
+	}
+
+	/* all remaining predicated remapped to new pred: */
+	ctx->pred = NULL;
+
+	return clone;
+}
+
+/* Schedule the instructions of one block.
+ *
+ * The block's instruction list is emptied and rebuilt: meta:input
+ * instructions are scheduled first, everything else goes onto
+ * ctx->depth_list and is picked one at a time via find_eligible_instr()
+ * (first with soft delays, then without), padding with nop's for any
+ * remaining delay. If nothing is eligible because of an a0.x/p0.x
+ * conflict, the conflicting writer is cloned via split_addr() or
+ * split_pred(); otherwise ctx->error is set and the function returns
+ * early. Finally the branch/jump instructions to the successor block(s)
+ * are appended.
+ */
+static void
+sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+ struct list_head unscheduled_list;
+
+ ctx->block = block;
+
+ /* addr/pred writes are per-block: */
+ ctx->addr = NULL;
+ ctx->pred = NULL;
+
+ /* move all instructions to the unscheduled list, and
+ * empty the block's instruction list (to which we will
+ * be inserting).
+ */
+ list_replace(&block->instr_list, &unscheduled_list);
+ list_inithead(&block->instr_list);
+ list_inithead(&ctx->depth_list);
+
+ /* first a pre-pass to schedule all meta:input instructions
+ * (which need to appear first so that RA knows the register is
+ * occupied), and move remaining to depth sorted list:
+ */
+ list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
+ if (instr->opc == OPC_META_INPUT) {
+ schedule(ctx, instr);
+ } else {
+ ir3_insert_by_depth(instr, &ctx->depth_list);
+ }
+ }
+
+ while (!list_empty(&ctx->depth_list)) {
+ struct ir3_sched_notes notes = {0};
+ struct ir3_instruction *instr;
+
+ /* prefer a candidate under the "soft" delay rules, and only
+ * fall back to the hard rules if that finds nothing:
+ */
+ instr = find_eligible_instr(ctx, &notes, true);
+ if (!instr)
+ instr = find_eligible_instr(ctx, &notes, false);
+
+ if (instr) {
+ unsigned delay = delay_calc(ctx->block, instr, false, false);
+
+ /* and if we run out of instructions that can be scheduled,
+ * then it is time for nop's:
+ */
+ debug_assert(delay <= 6);
+ while (delay > 0) {
+ ir3_NOP(block);
+ delay--;
+ }
+
+ schedule(ctx, instr);
+ } else {
+ struct ir3_instruction *new_instr = NULL;
+
+ /* nothing available to schedule.. if we are blocked on
+ * address/predicate register conflict, then break the
+ * deadlock by cloning the instruction that wrote that
+ * reg:
+ */
+ if (notes.addr_conflict) {
+ new_instr = split_addr(ctx);
+ } else if (notes.pred_conflict) {
+ new_instr = split_pred(ctx);
+ } else {
+ debug_assert(0);
+ ctx->error = true;
+ return;
+ }
+
+ if (new_instr) {
+ /* clearing current addr/pred can change what is
+ * available to schedule, so clear cache..
+ */
+ clear_cache(ctx, NULL);
+
+ ir3_insert_by_depth(new_instr, &ctx->depth_list);
+ /* the original instr that wrote addr/pred may have
+ * originated from a different block:
+ */
+ new_instr->block = block;
+ }
+ }
+ }
+
+ /* And lastly, insert branch/jump instructions to take us to
+ * the next block. Later we'll strip back out the branches
+ * that simply jump to next instruction.
+ */
+ if (block->successors[1]) {
+ /* if/else, conditional branches to "then" or "else": */
+ struct ir3_instruction *br;
+ unsigned delay = 6;
+
+ debug_assert(ctx->pred);
+ debug_assert(block->condition);
+
+ /* pad with nop's so the branch sits 6 slots after the
+ * instruction that wrote the predicate:
+ */
+ delay -= distance(ctx->block, ctx->pred, delay, false);
+
+ while (delay > 0) {
+ ir3_NOP(block);
+ delay--;
+ }
+
+ /* create "else" branch first (since "then" block should
+ * frequently/always end up being a fall-thru):
+ */
+ br = ir3_BR(block);
+ br->cat0.inv = true;
+ br->cat0.target = block->successors[1];
+
+ /* NOTE: we have to hard code delay of 6 above, since
+ * we want to insert the nop's before constructing the
+ * branch. Throw in an assert so we notice if this
+ * ever breaks on future generation:
+ */
+ debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
+
+ br = ir3_BR(block);
+ br->cat0.target = block->successors[0];
+
+ } else if (block->successors[0]) {
+ /* otherwise unconditional jump to next block: */
+ struct ir3_instruction *jmp;
+
+ jmp = ir3_JUMP(block);
+ jmp->cat0.target = block->successors[0];
+ }
+
+ /* NOTE: if we kept track of the predecessors, we could do a better
+ * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
+ * Note that as we eliminate blocks which contain only an unconditional
+ * jump we probably need to propagate (jp) flag..
+ */
+}
+
+/* After scheduling individual blocks, we could still have cases where,
+ * on one (or more) paths into a block, a value produced by a previous
+ * block has too few delay slots to be legal.  We can't deal with this in
+ * the first pass because of loops (ie. we can't ensure all predecessor
+ * blocks are already scheduled in the first pass).  All we can really do
+ * at this point is stuff in extra nop's until things are legal.
+ */
+static void
+sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+	/* number of instruction slots emitted so far in this block: */
+	unsigned slots = 0;
+
+	ctx->block = block;
+
+	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
+		unsigned needed = 0;
+
+		/* worst case delay over every path into the block: */
+		for (unsigned i = 0; i < block->predecessors_count; i++) {
+			unsigned d = delay_calc(block->predecessors[i], instr, false, true);
+
+			if (d > needed)
+				needed = d;
+		}
+
+		for (; needed > slots; slots++) {
+			struct ir3_instruction *nop = ir3_NOP(block);
+
+			/* ir3_NOP() appended it; move to before instr: */
+			list_delinit(&nop->node);
+			list_addtail(&nop->node, &instr->node);
+		}
+
+		/* account for instr itself; we can bail once we hit worst
+		 * case delay:
+		 */
+		slots++;
+		if (slots > 6)
+			break;
+	}
+}
+
+/* Schedule every block of the shader.
+ *
+ * First pass: sched_block() on each block in list order.  Second pass:
+ * sched_intra_block() on each block, to pad for delays across block
+ * boundaries.
+ *
+ * Returns 0 on success, -1 if a block could not be scheduled.  On failure
+ * we stop at the offending block: its unscheduled instructions are still
+ * linked on ctx.depth_list, which the next sched_block() call would
+ * re-initialise, and running the second pass over a half-scheduled
+ * shader serves no purpose.
+ */
+int ir3_sched(struct ir3 *ir)
+{
+	struct ir3_sched_ctx ctx = {0};
+
+	ir3_clear_mark(ir);
+
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		sched_block(&ctx, block);
+		if (ctx.error)
+			return -1;
+	}
+
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		sched_intra_block(&ctx, block);
+	}
+
+	return 0;
+}
+
+/* Does instruction 'prior' need to be scheduled before 'instr'?
+ *
+ * True when either side carries IR3_BARRIER_EVERYTHING and the other has
+ * any barrier class at all, or when instr's barrier class intersects
+ * prior's barrier_conflict mask.
+ */
+static bool
+depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
+{
+	/* TODO for dependencies that are related to a specific object, ie
+	 * a specific SSBO/image/array, we could relax this constraint to
+	 * make accesses to unrelated objects not depend on each other (at
+	 * least as long as not declared coherent)
+	 */
+	if ((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class)
+		return true;
+
+	if ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class)
+		return true;
+
+	return (instr->barrier_class & prior->barrier_conflict) != 0;
+}
+
+/* Add the false dependencies needed to keep 'instr' ordered against its
+ * neighbours in the block.  Meta instructions are skipped.  In each
+ * direction the walk stops (after adding a dependency) at the first
+ * instruction with exactly the same barrier_class.
+ */
+static void
+add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
+{
+	struct list_head *head = &block->instr_list;
+	struct list_head *bwd = instr->node.prev;
+	struct list_head *fwd = instr->node.next;
+	struct list_head *step;
+
+	/* backwards: instructions that must be scheduled prior to the
+	 * current instruction
+	 */
+	for (; bwd != head; bwd = step) {
+		struct ir3_instruction *pi =
+			LIST_ENTRY(struct ir3_instruction, bwd, node);
+
+		step = bwd->prev;
+
+		if (is_meta(pi))
+			continue;
+
+		if (instr->barrier_class == pi->barrier_class) {
+			ir3_instr_add_dep(instr, pi);
+			break;
+		}
+
+		if (depends_on(instr, pi))
+			ir3_instr_add_dep(instr, pi);
+	}
+
+	/* forwards: following instructions that must be scheduled after
+	 * the current instruction
+	 */
+	for (; fwd != head; fwd = step) {
+		struct ir3_instruction *ni =
+			LIST_ENTRY(struct ir3_instruction, fwd, node);
+
+		step = fwd->next;
+
+		if (is_meta(ni))
+			continue;
+
+		if (instr->barrier_class == ni->barrier_class) {
+			ir3_instr_add_dep(ni, instr);
+			break;
+		}
+
+		if (depends_on(ni, instr))
+			ir3_instr_add_dep(ni, instr);
+	}
+}
+
+/* Before scheduling a block, we need to add any necessary
+ * false-dependencies to ensure that:
+ *
+ *  (1) barriers are scheduled in the right order wrt instructions related
+ *      to the barrier
+ *
+ *  (2) reads that come before a write actually get scheduled before the
+ *      write
+ *
+ * Only instructions with a non-zero barrier_class are processed.
+ */
+static void
+calculate_deps(struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		if (!instr->barrier_class)
+			continue;
+
+		add_barrier_deps(block, instr);
+	}
+}
+
+/* Run calculate_deps() over every block of the shader. */
+void
+ir3_sched_add_deps(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node)
+		calculate_deps(blk);
+}
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
new file mode 100644
index 00000000000..8b18e950cca
--- /dev/null
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -0,0 +1,436 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_format.h"
+
+#include "drm/freedreno_drmif.h"
+
+#include "ir3_shader.h"
+#include "ir3_compiler.h"
+#include "ir3_nir.h"
+
+/* Size of a GLSL type as reported by glsl_count_attribute_slots(), with
+ * that function's second argument fixed to false.
+ */
+int
+ir3_glsl_type_size(const struct glsl_type *type)
+{
+	int slots = glsl_count_attribute_slots(type, false);
+
+	return slots;
+}
+
+/* Free a variant and everything it owns: its binning-pass variant (if
+ * one was created), its ir (if still present), its buffer object and its
+ * immediates.
+ *
+ * The binning variant is created by ir3_shader_get_variant() and is only
+ * reachable through v->binning (it is never linked into the shader's
+ * variant list), so it has to be released here or it would leak.
+ */
+static void
+delete_variant(struct ir3_shader_variant *v)
+{
+	if (v->binning)
+		delete_variant(v->binning);
+	if (v->ir)
+		ir3_destroy(v->ir);
+	if (v->bo)
+		fd_bo_del(v->bo);
+	/* free(NULL) is a no-op, so no guard is needed: */
+	free(v->immediates);
+	free(v);
+}
+
+/* For vertex shader, the inputs are loaded into registers before the
+ * shader is executed, so max_regs from the shader instructions might not
+ * properly reflect the # of registers actually used, especially in case
+ * of passthrough varyings.
+ *
+ * Likewise, for fragment shader, we can have some regs which are passed
+ * input values but never touched by the resulting shader (ie. as result
+ * of dead code elimination or simply because we don't know how to turn
+ * the reg off).
+ *
+ * So raise v->info.max_reg to cover every input and output register.
+ */
+static void
+fixup_regfootprint(struct ir3_shader_variant *v)
+{
+	unsigned idx;
+
+	for (idx = 0; idx < v->inputs_count; idx++) {
+		unsigned last_comp;
+		int32_t top;
+
+		/* skip frag inputs fetch via bary.f since their reg's are
+		 * not written by gpu before shader starts (and in fact the
+		 * regid's might not even be valid)
+		 */
+		if (v->inputs[idx].bary)
+			continue;
+
+		/* ignore high regs that are global to all threads in a warp
+		 * (they exist by default) (a5xx+)
+		 */
+		if (v->inputs[idx].regid >= regid(48,0))
+			continue;
+
+		/* nothing to account for without any component enabled: */
+		if (!v->inputs[idx].compmask)
+			continue;
+
+		last_comp = util_last_bit(v->inputs[idx].compmask) - 1;
+		top = (v->inputs[idx].regid + last_comp) >> 2;
+		v->info.max_reg = MAX2(v->info.max_reg, top);
+	}
+
+	for (idx = 0; idx < v->outputs_count; idx++) {
+		/* outputs are accounted as if all four components are used: */
+		int32_t top = (v->outputs[idx].regid + 3) >> 2;
+
+		v->info.max_reg = MAX2(v->info.max_reg, top);
+	}
+}
+
+/* Wrapper for ir3_assemble() which does some info fixup based on shader
+ * state: it derives v->instrlen from the assembled size, clamps
+ * v->constlen, and widens the register footprint.  Non-static since used
+ * by ir3_cmdline too.
+ *
+ * Returns the buffer produced by ir3_assemble(), or NULL if that failed.
+ */
+void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id)
+{
+	/* sizedwords is divided by 2*16 on gpu_id >= 400, by 2*4 otherwise: */
+	const int divisor = (gpu_id >= 400) ? (2 * 16) : (2 * 4);
+	void *bin = ir3_assemble(v->ir, &v->info, gpu_id);
+
+	if (!bin)
+		return NULL;
+
+	v->instrlen = v->info.sizedwords / divisor;
+
+	/* NOTE: if relative addressing is used, we set constlen in
+	 * the compiler (to worst-case value) since we don't know in
+	 * the assembler what the max addr reg value can be:
+	 */
+	v->constlen = MIN2(255, MAX2(v->constlen, v->info.max_const + 1));
+
+	fixup_regfootprint(v);
+
+	return bin;
+}
+
+/* Assemble the variant's ir, upload the binary into a freshly allocated
+ * buffer object (v->bo), optionally print a disassembly, and release the
+ * ir.
+ *
+ * On any failure (assembler, bo allocation or bo mapping) the function
+ * returns with v->bo == NULL, which is what create_variant() checks to
+ * detect a failed assemble.  v->ir is left in place on those paths and is
+ * released by delete_variant().
+ */
+static void
+assemble_variant(struct ir3_shader_variant *v)
+{
+	struct ir3_compiler *compiler = v->shader->compiler;
+	uint32_t gpu_id = compiler->gpu_id;
+	uint32_t sz, *bin;
+	void *map;
+
+	bin = ir3_shader_assemble(v, gpu_id);
+	if (!bin)
+		return;
+
+	sz = v->info.sizedwords * 4;
+
+	v->bo = fd_bo_new(compiler->dev, sz,
+			DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
+			DRM_FREEDRENO_GEM_TYPE_KMEM);
+	if (!v->bo)
+		goto out;
+
+	map = fd_bo_map(v->bo);
+	if (!map) {
+		/* an unmappable bo is of no use; report it as a failure: */
+		fd_bo_del(v->bo);
+		v->bo = NULL;
+		goto out;
+	}
+
+	memcpy(map, bin, sz);
+
+	if (ir3_shader_debug & IR3_DBG_DISASM) {
+		struct ir3_shader_key key = v->key;
+		printf("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
+			v->binning_pass, key.color_two_side, key.half_precision);
+		ir3_shader_disasm(v, bin, stdout);
+	}
+
+	if (shader_debug_enabled(v->shader->type)) {
+		fprintf(stderr, "Native code for unnamed %s shader %s:\n",
+			_mesa_shader_stage_to_string(v->shader->type),
+			v->shader->nir->info.name);
+		if (v->shader->type == MESA_SHADER_FRAGMENT)
+			fprintf(stderr, "SIMD0\n");
+		ir3_shader_disasm(v, bin, stderr);
+	}
+
+	/* no need to keep the ir around beyond this point: */
+	ir3_destroy(v->ir);
+	v->ir = NULL;
+
+out:
+	free(bin);
+}
+
+/* Allocate, compile and assemble a new variant of 'shader' for the given
+ * key.  The variant is not linked into the shader's variant list here.
+ *
+ * Returns the new variant, or NULL if allocation, compilation or
+ * assembly failed (the partially built variant is deleted in that case).
+ */
+static struct ir3_shader_variant *
+create_variant(struct ir3_shader *shader, struct ir3_shader_key *key,
+		bool binning_pass)
+{
+	struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant);
+
+	if (!v)
+		return NULL;
+
+	v->id = ++shader->variant_count;
+	v->shader = shader;
+	v->binning_pass = binning_pass;
+	v->key = *key;
+	v->type = shader->type;
+
+	if (ir3_compile_shader_nir(shader->compiler, v)) {
+		debug_error("compile failed!");
+		delete_variant(v);
+		return NULL;
+	}
+
+	/* assemble_variant() reports failure by leaving v->bo unset: */
+	assemble_variant(v);
+	if (!v->bo) {
+		debug_error("assemble failed!");
+		delete_variant(v);
+		return NULL;
+	}
+
+	return v;
+}
+
+/* Look up the variant matching 'key' in the shader's variant list,
+ * compiling and prepending a new (non-binning) one if none exists yet.
+ *
+ * *created is set to true only when a new variant was compiled
+ * successfully.  Returns NULL if a new variant was needed but could not
+ * be created.
+ */
+static inline struct ir3_shader_variant *
+shader_variant(struct ir3_shader *shader, struct ir3_shader_key *key,
+		bool *created)
+{
+	struct ir3_shader_variant *v = shader->variants;
+
+	*created = false;
+
+	while (v) {
+		if (ir3_shader_key_equal(key, &v->key))
+			return v;
+		v = v->next;
+	}
+
+	/* compile new variant if it doesn't exist already: */
+	v = create_variant(shader, key, false);
+	if (!v)
+		return NULL;
+
+	v->next = shader->variants;
+	shader->variants = v;
+	*created = true;
+
+	return v;
+}
+
+/* Get (compiling on demand) the variant of 'shader' for 'key'.
+ *
+ * With binning_pass set, the binning-pass variant attached to the draw
+ * pass variant is returned instead; it is created on first use.
+ *
+ * *created reports whether the draw pass variant was newly compiled by
+ * this call.  Returns NULL if the required variant could not be
+ * compiled.
+ */
+struct ir3_shader_variant *
+ir3_shader_get_variant(struct ir3_shader *shader, struct ir3_shader_key *key,
+		bool binning_pass, bool *created)
+{
+	struct ir3_shader_variant *v =
+			shader_variant(shader, key, created);
+
+	/* shader_variant() returns NULL when compilation fails, so v must
+	 * be checked before reaching for its binning variant:
+	 */
+	if (v && binning_pass) {
+		if (!v->binning)
+			v->binning = create_variant(shader, key, true);
+		return v->binning;
+	}
+
+	return v;
+}
+
+/* Destroy a shader: delete every variant on its list, release its nir
+ * with ralloc_free(), then free the shader itself.
+ */
+void
+ir3_shader_destroy(struct ir3_shader *shader)
+{
+	struct ir3_shader_variant *v = shader->variants;
+
+	while (v) {
+		/* grab the link before the node goes away: */
+		struct ir3_shader_variant *next = v->next;
+
+		delete_variant(v);
+		v = next;
+	}
+
+	ralloc_free(shader->nir);
+	free(shader);
+}
+
+/* Create an ir3_shader from a nir shader.
+ *
+ * Lowers io on 'nir', runs the key-independent optimization pass and
+ * stores the result in shader->nir.  No variant is compiled here.
+ *
+ * Returns the new shader, or NULL if it could not be allocated; in that
+ * case 'nir' has not been touched.
+ */
+struct ir3_shader *
+ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir)
+{
+	struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
+
+	/* same allocation check as create_variant(): */
+	if (!shader)
+		return NULL;
+
+	shader->compiler = compiler;
+	shader->id = ++shader->compiler->shader_count;
+	shader->type = nir->info.stage;
+
+	NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size,
+			(nir_lower_io_options)0);
+
+	/* do first pass optimization, ignoring the key: */
+	shader->nir = ir3_optimize_nir(shader, nir, NULL);
+	if (ir3_shader_debug & IR3_DBG_DISASM) {
+		printf("dump nir%d: type=%d", shader->id, shader->type);
+		nir_print_shader(shader->nir, stdout);
+	}
+
+	return shader;
+}
+
+/* Print "; <name>: r<N>.<c>" for register id 'r'; prints nothing when r
+ * equals regid(63,0).
+ */
+static void dump_reg(FILE *out, const char *name, uint32_t r)
+{
+	if (r == regid(63,0))
+		return;
+
+	fprintf(out, "; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]);
+}
+
+/* Look up the register of output 'slot' via ir3_find_output_regid() and
+ * print it with dump_reg().
+ */
+static void dump_output(FILE *out, struct ir3_shader_variant *so,
+		unsigned slot, const char *name)
+{
+	dump_reg(out, name, ir3_find_output_regid(so, slot));
+}
+
+/* Write a human-readable dump of a compiled variant to 'out': the
+ * @in/@out register assignments taken from so->ir, the @const immediates,
+ * the disasm_a3xx() listing of 'bin', and a trailing block of "; ..."
+ * summary lines (inputs/outputs, instruction/register/const counts,
+ * (ss)/(sy) counts and stage specific registers).
+ *
+ * so->ir is dereferenced unconditionally, so this has to be called while
+ * the variant still has its ir (assemble_variant() calls it before
+ * destroying the ir).
+ */
+void
+ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
+{
+ struct ir3 *ir = so->ir;
+ struct ir3_register *reg;
+ const char *type = ir3_shader_stage(so->shader);
+ uint8_t regid; /* NOTE: shadowed by the per-loop 'regid' locals in the switch below */
+ unsigned i;
+
+ /* ir-level inputs, one "@in" line each: */
+ for (i = 0; i < ir->ninputs; i++) {
+ if (!ir->inputs[i]) {
+ fprintf(out, "; in%d unused\n", i);
+ continue;
+ }
+ reg = ir->inputs[i]->regs[0];
+ regid = reg->num;
+ fprintf(out, "@in(%sr%d.%c)\tin%d\n",
+ (reg->flags & IR3_REG_HALF) ? "h" : "",
+ (regid >> 2), "xyzw"[regid & 0x3], i);
+ }
+
+ /* ir-level outputs, one "@out" line each: */
+ for (i = 0; i < ir->noutputs; i++) {
+ if (!ir->outputs[i]) {
+ fprintf(out, "; out%d unused\n", i);
+ continue;
+ }
+ /* kill shows up as a virtual output.. skip it! */
+ if (is_kill(ir->outputs[i]))
+ continue;
+ reg = ir->outputs[i]->regs[0];
+ regid = reg->num;
+ fprintf(out, "@out(%sr%d.%c)\tout%d\n",
+ (reg->flags & IR3_REG_HALF) ? "h" : "",
+ (regid >> 2), "xyzw"[regid & 0x3], i);
+ }
+
+ /* immediates, four values per const register: */
+ for (i = 0; i < so->immediates_count; i++) {
+ fprintf(out, "@const(c%d.x)\t", so->constbase.immediate + i);
+ fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
+ so->immediates[i].val[0],
+ so->immediates[i].val[1],
+ so->immediates[i].val[2],
+ so->immediates[i].val[3]);
+ }
+
+ /* the instruction listing itself: */
+ disasm_a3xx(bin, so->info.sizedwords, 0, out);
+
+ /* variant-level inputs/outputs; only VS and FS are handled: */
+ switch (so->type) {
+ case MESA_SHADER_VERTEX:
+ fprintf(out, "; %s: outputs:", type);
+ for (i = 0; i < so->outputs_count; i++) {
+ uint8_t regid = so->outputs[i].regid;
+ fprintf(out, " r%d.%c (%s)",
+ (regid >> 2), "xyzw"[regid & 0x3],
+ gl_varying_slot_name(so->outputs[i].slot));
+ }
+ fprintf(out, "\n");
+ fprintf(out, "; %s: inputs:", type);
+ for (i = 0; i < so->inputs_count; i++) {
+ uint8_t regid = so->inputs[i].regid;
+ fprintf(out, " r%d.%c (cm=%x,il=%u,b=%u)",
+ (regid >> 2), "xyzw"[regid & 0x3],
+ so->inputs[i].compmask,
+ so->inputs[i].inloc,
+ so->inputs[i].bary);
+ }
+ fprintf(out, "\n");
+ break;
+ case MESA_SHADER_FRAGMENT:
+ fprintf(out, "; %s: outputs:", type);
+ for (i = 0; i < so->outputs_count; i++) {
+ uint8_t regid = so->outputs[i].regid;
+ fprintf(out, " r%d.%c (%s)",
+ (regid >> 2), "xyzw"[regid & 0x3],
+ gl_frag_result_name(so->outputs[i].slot));
+ }
+ fprintf(out, "\n");
+ fprintf(out, "; %s: inputs:", type);
+ for (i = 0; i < so->inputs_count; i++) {
+ uint8_t regid = so->inputs[i].regid;
+ fprintf(out, " r%d.%c (%s,cm=%x,il=%u,b=%u)",
+ (regid >> 2), "xyzw"[regid & 0x3],
+ gl_varying_slot_name(so->inputs[i].slot),
+ so->inputs[i].compmask,
+ so->inputs[i].inloc,
+ so->inputs[i].bary);
+ }
+ fprintf(out, "\n");
+ break;
+ default:
+ /* TODO */
+ break;
+ }
+
+ /* print generic shader info: */
+ fprintf(out, "; %s prog %d/%d: %u instructions, %d half, %d full\n",
+ type, so->shader->id, so->id,
+ so->info.instrs_count,
+ so->info.max_half_reg + 1,
+ so->info.max_reg + 1);
+
+ fprintf(out, "; %d const, %u constlen\n",
+ so->info.max_const + 1,
+ so->constlen);
+
+ fprintf(out, "; %u (ss), %u (sy)\n", so->info.ss, so->info.sy);
+
+ /* print shader type specific info: */
+ switch (so->type) {
+ case MESA_SHADER_VERTEX:
+ dump_output(out, so, VARYING_SLOT_POS, "pos");
+ dump_output(out, so, VARYING_SLOT_PSIZ, "psize");
+ break;
+ case MESA_SHADER_FRAGMENT:
+ dump_reg(out, "pos (bary)",
+ ir3_find_sysval_regid(so, SYSTEM_VALUE_VARYING_COORD));
+ dump_output(out, so, FRAG_RESULT_DEPTH, "posz");
+ /* either the single color output or the eight data outputs: */
+ if (so->color0_mrt) {
+ dump_output(out, so, FRAG_RESULT_COLOR, "color");
+ } else {
+ dump_output(out, so, FRAG_RESULT_DATA0, "data0");
+ dump_output(out, so, FRAG_RESULT_DATA1, "data1");
+ dump_output(out, so, FRAG_RESULT_DATA2, "data2");
+ dump_output(out, so, FRAG_RESULT_DATA3, "data3");
+ dump_output(out, so, FRAG_RESULT_DATA4, "data4");
+ dump_output(out, so, FRAG_RESULT_DATA5, "data5");
+ dump_output(out, so, FRAG_RESULT_DATA6, "data6");
+ dump_output(out, so, FRAG_RESULT_DATA7, "data7");
+ }
+ /* these two are hard-coded since we don't know how to
+ * program them to anything but all 0's...
+ */
+ if (so->frag_coord)
+ fprintf(out, "; fragcoord: r0.x\n");
+ if (so->frag_face)
+ fprintf(out, "; fragface: hr0.x\n");
+ break;
+ default:
+ /* TODO */
+ break;
+ }
+
+ fprintf(out, "\n");
+}
+
+/* Returns the outputs_written value recorded in the shader's nir info. */
+uint64_t
+ir3_shader_outputs(const struct ir3_shader *so)
+{
+	const nir_shader *nir = so->nir;
+
+	return nir->info.outputs_written;
+}
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
new file mode 100644
index 00000000000..bc47160d6ea
--- /dev/null
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -0,0 +1,587 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#ifndef IR3_SHADER_H_
+#define IR3_SHADER_H_
+
+#include <stdio.h>
+
+#include "compiler/shader_enums.h"
+#include "compiler/nir/nir.h"
+#include "util/bitscan.h"
+
+#include "ir3.h"
+
+struct glsl_type;
+
+/* driver param indices:
+ *
+ * The compute-shader and vertex-shader params below are two separate
+ * index ranges that reuse the same numeric values (both start at 0).
+ * Each range ends with a *_COUNT entry giving its size.
+ */
+enum ir3_driver_param {
+ /* compute shader driver params: */
+ IR3_DP_NUM_WORK_GROUPS_X = 0,
+ IR3_DP_NUM_WORK_GROUPS_Y = 1,
+ IR3_DP_NUM_WORK_GROUPS_Z = 2,
+ IR3_DP_LOCAL_GROUP_SIZE_X = 4,
+ IR3_DP_LOCAL_GROUP_SIZE_Y = 5,
+ IR3_DP_LOCAL_GROUP_SIZE_Z = 6,
+ /* NOTE: gl_NumWorkGroups should be vec4 aligned because
+ * glDispatchComputeIndirect() needs to load these from
+ * the info->indirect buffer. Keep that in mind when/if
+ * adding any additional CS driver params.
+ */
+ IR3_DP_CS_COUNT = 8, /* must be aligned to vec4 */
+
+ /* vertex shader driver params: */
+ IR3_DP_VTXID_BASE = 0,
+ IR3_DP_VTXCNT_MAX = 1,
+ /* user-clip-plane components, up to 8x vec4's (indices 4..35,
+ * only the first and last are named):
+ */
+ IR3_DP_UCP0_X = 4,
+ /* .... */
+ IR3_DP_UCP7_W = 35,
+ IR3_DP_VS_COUNT = 36 /* must be aligned to vec4 */
+};
+
+/* Array-size limits used by the struct definitions in this header: */
+#define IR3_MAX_SHADER_BUFFERS 32 /* entries in ir3_driver_const_layout ssbo_size.off[] */
+#define IR3_MAX_SHADER_IMAGES 32 /* entries in ir3_driver_const_layout image_dims.off[] */
+#define IR3_MAX_SO_BUFFERS 4 /* entries in ir3_stream_output_info stride[] */
+#define IR3_MAX_SO_OUTPUTS 64 /* entries in ir3_stream_output_info output[] */
+
+/**
+ * For consts needed to pass internal values to the shader which may or
+ * may not be required, rather than allocating worst-case const space, we
+ * scan the shader and allocate consts as-needed:
+ *
+ * + SSBO sizes: only needed if shader has a get_buffer_size intrinsic
+ * for a given SSBO
+ *
+ * + Image dimensions: needed to calculate pixel offset, but only for
+ * images that have an image_store intrinsic
+ */
+struct ir3_driver_const_layout {
+ struct {
+ uint32_t mask; /* bitmask of SSBOs that have get_buffer_size */
+ uint32_t count; /* number of consts allocated */
+ /* one const allocated per SSBO which has get_buffer_size,
+ * ssbo_size.off[ssbo_id] is offset from start of ssbo_sizes
+ * consts:
+ */
+ uint32_t off[IR3_MAX_SHADER_BUFFERS];
+ } ssbo_size;
+
+ struct {
+ uint32_t mask; /* bitmask of images that have image_store */
+ uint32_t count; /* number of consts allocated */
+ /* three consts allocated per image which has image_store:
+ * + cpp (bytes per pixel)
+ * + pitch (y pitch)
+ * + array_pitch (z pitch)
+ */
+ uint32_t off[IR3_MAX_SHADER_IMAGES];
+ } image_dims;
+};
+
+/**
+ * A single output for vertex transform feedback.
+ *
+ * The bitfield widths add up to 32 bits (6 + 2 + 3 + 3 + 16 + 2).
+ */
+struct ir3_stream_output {
+ unsigned register_index:6; /**< 0 to 63 (OUT index) */
+ unsigned start_component:2; /**< 0 to 3 */
+ unsigned num_components:3; /**< 1 to 4 */
+ unsigned output_buffer:3; /**< 0 to PIPE_MAX_SO_BUFFERS */
+ unsigned dst_offset:16; /**< offset into the buffer in dwords */
+ unsigned stream:2; /**< 0 to 3 */
+};
+
+/**
+ * Stream output for vertex transform feedback.
+ */
+struct ir3_stream_output_info {
+ /** presumably the number of valid entries in output[] -- TODO confirm */
+ unsigned num_outputs;
+ /** stride for an entire vertex for each buffer in dwords */
+ uint16_t stride[IR3_MAX_SO_BUFFERS];
+
+ /**
+ * Array of stream outputs, in the order they are to be written in.
+ * Selected components are tightly packed into the output buffer.
+ */
+ struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
+};
+
+/* Configuration key used to identify a shader variant.. different
+ * shader variants can be used to implement features not supported
+ * in hw (two sided color), binning-pass vertex shader, etc.
+ *
+ * The single-bit/small flags live in an anonymous struct that shares
+ * storage (union) with 'global', so all of them can be compared at once
+ * through 'global'. The per-sampler bitmasks that follow the union are
+ * outside of 'global'.
+ */
+struct ir3_shader_key {
+ union {
+ struct {
+ /*
+ * Combined Vertex/Fragment shader parameters:
+ */
+ unsigned ucp_enables : 8;
+
+ /* do we need to check {v,f}saturate_{s,t,r}? */
+ unsigned has_per_samp : 1;
+
+ /*
+ * Vertex shader variant parameters:
+ */
+ unsigned vclamp_color : 1;
+
+ /*
+ * Fragment shader variant parameters:
+ */
+ unsigned color_two_side : 1;
+ unsigned half_precision : 1;
+ /* used when shader needs to handle flat varyings (a4xx)
+ * for front/back color inputs to frag shader:
+ */
+ unsigned rasterflat : 1;
+ unsigned fclamp_color : 1;
+ };
+ /* all of the bitfields above, viewed as one 32b value: */
+ uint32_t global;
+ };
+
+ /* bitmask of sampler which needs coords clamped for vertex
+ * shader:
+ */
+ uint16_t vsaturate_s, vsaturate_t, vsaturate_r;
+
+ /* bitmask of sampler which needs coords clamped for frag
+ * shader:
+ */
+ uint16_t fsaturate_s, fsaturate_t, fsaturate_r;
+
+ /* bitmask of ms shifts */
+ uint32_t vsamples, fsamples;
+
+ /* bitmask of samplers which need astc srgb workaround: */
+ uint16_t vastc_srgb, fastc_srgb;
+};
+
+/* Compare two shader keys for equality.
+ *
+ * When neither key has has_per_samp set, only the 'global' word (the
+ * packed flag bits) is compared. Otherwise the whole key, including the
+ * per-sampler bitmasks, is compared bytewise.
+ */
+static inline bool
+ir3_shader_key_equal(struct ir3_shader_key *a, struct ir3_shader_key *b)
+{
+	/* fast-path, no per-sampler state to look at: */
+	if (!a->has_per_samp && !b->has_per_samp)
+		return a->global == b->global;
+
+	/* slow-path if we need to check {v,f}saturate_{s,t,r} */
+	return memcmp(a, b, sizeof(*a)) == 0;
+}
+
+/* Will the two keys produce different lowering for a fragment shader?
+ *
+ * Returns true if any key field checked here differs between 'key' and
+ * 'last_key'. The per-sampler fragment fields (fsaturate_*, fsamples,
+ * fastc_srgb) are only looked at if at least one key has has_per_samp
+ * set.
+ */
+static inline bool
+ir3_shader_key_changes_fs(struct ir3_shader_key *key, struct ir3_shader_key *last_key)
+{
+	if (key->has_per_samp || last_key->has_per_samp) {
+		const bool per_samp_differs =
+			(key->fsaturate_s != last_key->fsaturate_s) ||
+			(key->fsaturate_t != last_key->fsaturate_t) ||
+			(key->fsaturate_r != last_key->fsaturate_r) ||
+			(key->fsamples != last_key->fsamples) ||
+			(key->fastc_srgb != last_key->fastc_srgb);
+
+		if (per_samp_differs)
+			return true;
+	}
+
+	/* flag bits that are checked for the fragment stage: */
+	return (key->fclamp_color != last_key->fclamp_color) ||
+		(key->color_two_side != last_key->color_two_side) ||
+		(key->half_precision != last_key->half_precision) ||
+		(key->rasterflat != last_key->rasterflat) ||
+		(key->ucp_enables != last_key->ucp_enables);
+}
+
+/* Will the two keys produce different lowering for a vertex shader?
+ *
+ * Returns true if any key field checked here differs between 'key' and
+ * 'last_key'. The per-sampler vertex fields (vsaturate_*, vsamples,
+ * vastc_srgb) are only looked at if at least one key has has_per_samp
+ * set.
+ */
+static inline bool
+ir3_shader_key_changes_vs(struct ir3_shader_key *key, struct ir3_shader_key *last_key)
+{
+	if (key->has_per_samp || last_key->has_per_samp) {
+		const bool per_samp_differs =
+			(key->vsaturate_s != last_key->vsaturate_s) ||
+			(key->vsaturate_t != last_key->vsaturate_t) ||
+			(key->vsaturate_r != last_key->vsaturate_r) ||
+			(key->vsamples != last_key->vsamples) ||
+			(key->vastc_srgb != last_key->vastc_srgb);
+
+		if (per_samp_differs)
+			return true;
+	}
+
+	/* flag bits that are checked for the vertex stage: */
+	return (key->vclamp_color != last_key->vclamp_color) ||
+		(key->ucp_enables != last_key->ucp_enables);
+}
+
+/* Clears, in place, the shader-key fields which belong to the other
+ * shader stage:
+ *
+ *  + fragment: the vertex per-sampler fields, if has_per_samp is set
+ *  + vertex: the color_two_side/half_precision/rasterflat flags, plus
+ *    the fragment per-sampler fields if has_per_samp is set
+ *
+ * Keys for any other stage are left untouched (TODO).
+ */
+static inline void
+ir3_normalize_key(struct ir3_shader_key *key, gl_shader_stage type)
+{
+	if (type == MESA_SHADER_FRAGMENT) {
+		if (!key->has_per_samp)
+			return;
+
+		key->vsaturate_s = 0;
+		key->vsaturate_t = 0;
+		key->vsaturate_r = 0;
+		key->vastc_srgb = 0;
+		key->vsamples = 0;
+	} else if (type == MESA_SHADER_VERTEX) {
+		key->color_two_side = false;
+		key->half_precision = false;
+		key->rasterflat = false;
+
+		if (!key->has_per_samp)
+			return;
+
+		key->fsaturate_s = 0;
+		key->fsaturate_t = 0;
+		key->fsaturate_r = 0;
+		key->fastc_srgb = 0;
+		key->fsamples = 0;
+	}
+}
+
+/* Per-variant shader state. A variant is identified by its 'key' (plus
+ * 'binning_pass'), points back at the ir3_shader it belongs to via
+ * 'shader', and is chained to the other variants through 'next'.
+ */
+struct ir3_shader_variant {
+ struct fd_bo *bo;
+
+ /* variant id (for debug) */
+ uint32_t id;
+
+ struct ir3_shader_key key;
+
+ /* vertex shaders can have an extra version for hwbinning pass,
+ * which is pointed to by so->binning:
+ */
+ bool binning_pass;
+ struct ir3_shader_variant *binning;
+
+ struct ir3_driver_const_layout const_layout;
+ struct ir3_info info;
+ struct ir3 *ir;
+
+ /* the instructions length is in units of instruction groups
+ * (4 instructions for a3xx, 16 instructions for a4xx.. each
+ * instruction is 2 dwords):
+ */
+ unsigned instrlen;
+
+ /* the constants length is in units of vec4's, and is the sum of
+ * the uniforms and the built-in compiler constants
+ */
+ unsigned constlen;
+
+ /* number of uniforms (in vec4), not including built-in compiler
+ * constants, etc.
+ */
+ unsigned num_uniforms;
+
+ unsigned num_ubos;
+
+ /* About Linkage:
+ * + Let the frag shader determine the position/compmask for the
+ * varyings, since it is the place where we know if the varying
+ * is actually used, and if so, which components are used. So
+ * what the hw calls "outloc" is taken from the "inloc" of the
+ * frag shader.
+ * + From the vert shader, we only need the output regid
+ */
+
+ bool frag_coord, frag_face, color0_mrt;
+
+ /* NOTE: for input/outputs, slot is:
+ * gl_vert_attrib - for VS inputs
+ * gl_varying_slot - for VS output / FS input
+ * gl_frag_result - for FS output
+ */
+
+ /* varyings/outputs: */
+ unsigned outputs_count;
+ struct {
+ uint8_t slot;
+ uint8_t regid;
+ } outputs[16 + 2]; /* +POSITION +PSIZE */
+ bool writes_pos, writes_psize;
+
+ /* attributes (VS) / varyings (FS):
+ * Note that sysval's should come *after* normal inputs.
+ */
+ unsigned inputs_count;
+ struct {
+ uint8_t slot;
+ uint8_t regid;
+ uint8_t compmask;
+ uint8_t ncomp;
+ /* location of input (ie. offset passed to bary.f, etc). This
+ * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
+ * have the OUTLOCn value offset by 8, presumably to account
+ * for gl_Position/gl_PointSize)
+ */
+ uint8_t inloc;
+ /* vertex shader specific: */
+ bool sysval : 1; /* slot is a gl_system_value */
+ /* fragment shader specific: */
+ bool bary : 1; /* fetched varying (vs one loaded into reg) */
+ bool rasterflat : 1; /* special handling for emit->rasterflat */
+ enum glsl_interp_mode interpolate;
+ } inputs[16 + 2]; /* +POSITION +FACE */
+
+ /* sum of input components (scalar). For frag shaders, it only counts
+ * the varying inputs:
+ */
+ unsigned total_in;
+
+ /* For frag shaders, the total number of inputs (not scalar,
+ * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
+ */
+ unsigned varying_in;
+
+ /* number of samplers/textures (which are currently 1:1): */
+ int num_samp;
+
+ /* do we have one or more SSBO instructions: */
+ bool has_ssbo;
+
+ /* do we have kill instructions: */
+ bool has_kill;
+
+ /* Layout of constant registers, each section (in vec4). Pointer size
+ * is 32b (a3xx, a4xx), or 64b (a5xx+), which affects the size of the
+ * UBO and stream-out consts.
+ */
+ struct {
+ /* user const start at zero */
+ unsigned ubo;
+ /* NOTE that a3xx might need a section for SSBO addresses too */
+ unsigned ssbo_sizes;
+ unsigned image_dims;
+ unsigned driver_param;
+ unsigned tfbo;
+ unsigned immediate;
+ } constbase;
+
+ unsigned immediates_count;
+ unsigned immediates_size;
+ struct {
+ uint32_t val[4];
+ } *immediates;
+
+ /* for astc srgb workaround, the number/base of additional
+ * alpha tex states we need, and index of original tex states
+ */
+ struct {
+ unsigned base, count;
+ unsigned orig_idx[16];
+ } astc_srgb;
+
+ /* shader variants form a linked list: */
+ struct ir3_shader_variant *next;
+
+ /* replicated here to avoid passing extra ptrs everywhere: */
+ gl_shader_stage type;
+ struct ir3_shader *shader;
+};
+
+/* A shader: the NIR it was created from plus the list of variants
+ * (see struct ir3_shader_variant, chained through variant->next).
+ */
+struct ir3_shader {
+ gl_shader_stage type;
+
+ /* shader id (for debug): */
+ uint32_t id;
+ /* NOTE(review): presumably counts the variants created so far and is
+ * the source of ir3_shader_variant::id -- confirm in ir3_shader.c
+ */
+ uint32_t variant_count;
+
+ /* so we know when we can disable TGSI related hacks: */
+ bool from_tgsi;
+
+ struct ir3_compiler *compiler;
+
+ struct nir_shader *nir;
+ struct ir3_stream_output_info stream_output;
+
+ /* head of the linked list of variants: */
+ struct ir3_shader_variant *variants;
+};
+
+void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id);
+struct ir3_shader_variant * ir3_shader_get_variant(struct ir3_shader *shader,
+ struct ir3_shader_key *key, bool binning_pass, bool *created);
+struct ir3_shader * ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir);
+void ir3_shader_destroy(struct ir3_shader *shader);
+void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
+uint64_t ir3_shader_outputs(const struct ir3_shader *so);
+
+int
+ir3_glsl_type_size(const struct glsl_type *type);
+
+/* Short printable name for the stage of 'shader': "VERT", "FRAG" or
+ * "CL" (compute). Any other stage hits unreachable().
+ */
+static inline const char *
+ir3_shader_stage(struct ir3_shader *shader)
+{
+	const char *name = NULL;
+
+	switch (shader->type) {
+	case MESA_SHADER_VERTEX:
+		name = "VERT";
+		break;
+	case MESA_SHADER_FRAGMENT:
+		name = "FRAG";
+		break;
+	case MESA_SHADER_COMPUTE:
+		name = "CL";
+		break;
+	default:
+		unreachable("invalid type");
+		break;
+	}
+
+	return name;
+}
+
+/*
+ * Helper/util:
+ */
+
+/* Find the index in so->outputs[] of the output with the given slot.
+ *
+ * For the front/back color slots, if there is no exact match the paired
+ * slot (COLn <-> BFCn) is looked up instead. Note that 0 is returned
+ * both for a match at index 0 and when nothing is found, so the result
+ * can always be used as an index but cannot signal "not found".
+ */
+static inline int
+ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
+{
+	gl_varying_slot paired;
+	int idx;
+
+	/* exact match first: */
+	for (idx = 0; idx < so->outputs_count; idx++) {
+		if (so->outputs[idx].slot == slot)
+			return idx;
+	}
+
+	/* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n]
+	 * in the vertex shader.. but the fragment shader doesn't know this
+	 * so it will always have both IN.COLOR[n] and IN.BCOLOR[n].  So
+	 * at link time if there is no matching OUT.BCOLOR[n], we must map
+	 * OUT.COLOR[n] to IN.BCOLOR[n].  And vice versa if there is only
+	 * a OUT.BCOLOR[n] but no matching OUT.COLOR[n]
+	 */
+	switch (slot) {
+	case VARYING_SLOT_BFC0:
+		paired = VARYING_SLOT_COL0;
+		break;
+	case VARYING_SLOT_BFC1:
+		paired = VARYING_SLOT_COL1;
+		break;
+	case VARYING_SLOT_COL0:
+		paired = VARYING_SLOT_BFC0;
+		break;
+	case VARYING_SLOT_COL1:
+		paired = VARYING_SLOT_BFC1;
+		break;
+	default:
+		/* not a color slot, nothing else to try: */
+		return 0;
+	}
+
+	for (idx = 0; idx < so->outputs_count; idx++) {
+		if (so->outputs[idx].slot == paired)
+			return idx;
+	}
+
+	/* a color slot with neither the front nor the back output: */
+	debug_assert(0);
+
+	return 0;
+}
+
+/* Returns the index of the first input after 'i' that has a non-zero
+ * compmask and the 'bary' flag set. If there is none, the returned
+ * index is not below so->inputs_count. Pass i = -1 to start from the
+ * first input.
+ */
+static inline int
+ir3_next_varying(const struct ir3_shader_variant *so, int i)
+{
+	for (i++; i < so->inputs_count; i++) {
+		if (so->inputs[i].compmask && so->inputs[i].bary)
+			return i;
+	}
+
+	return i;
+}
+
+/* Table of linked varyings, filled in by ir3_link_add() /
+ * ir3_link_shaders():
+ */
+struct ir3_shader_linkage {
+ /* max over all entries of (loc + util_last_bit(compmask)): */
+ uint8_t max_loc;
+ /* number of entries of var[] in use: */
+ uint8_t cnt;
+ struct {
+ uint8_t regid;
+ uint8_t compmask;
+ uint8_t loc;
+ } var[32];
+};
+
+/* Append one entry (register id, component mask, location) to the
+ * linkage table and grow l->max_loc so that it covers
+ * loc + util_last_bit(compmask).
+ *
+ * If the table is already full, nothing is stored: the entry is dropped
+ * and debug_assert() fires. The bound is checked before l->cnt is bumped
+ * so that a full table can never lead to a write past the end of
+ * l->var[], even if debug_assert() is compiled out (NOTE(review):
+ * whether it is compiled out depends on the build; confirm in
+ * util/u_debug.h).
+ */
+static inline void
+ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid, uint8_t compmask, uint8_t loc)
+{
+	unsigned i;
+
+	if (l->cnt >= ARRAY_SIZE(l->var)) {
+		debug_assert(0);
+		return;
+	}
+
+	i = l->cnt++;
+
+	l->var[i].regid = regid;
+	l->var[i].compmask = compmask;
+	l->var[i].loc = loc;
+	l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
+}
+
+/* Fill the linkage table 'l' from a vertex/fragment shader pair.
+ *
+ * Walks the fragment shader inputs selected by ir3_next_varying() and,
+ * for each one whose inloc is below fs->total_in, adds an entry that
+ * pairs the regid of the matching vertex shader output (looked up with
+ * ir3_find_output()) with the compmask and inloc of the fragment shader
+ * input. Stops when the inputs are exhausted or the table is full.
+ */
+static inline void
+ir3_link_shaders(struct ir3_shader_linkage *l,
+		const struct ir3_shader_variant *vs,
+		const struct ir3_shader_variant *fs)
+{
+	int in = ir3_next_varying(fs, -1);
+
+	while ((l->cnt < ARRAY_SIZE(l->var)) && (in < fs->inputs_count)) {
+		/* inputs located at/after total_in are skipped: */
+		if (fs->inputs[in].inloc < fs->total_in) {
+			int out = ir3_find_output(vs, fs->inputs[in].slot);
+
+			ir3_link_add(l, vs->outputs[out].regid,
+					fs->inputs[in].compmask, fs->inputs[in].inloc);
+		}
+
+		in = ir3_next_varying(fs, in);
+	}
+}
+
+/* Returns the regid of the output with the given slot, or regid(63, 0)
+ * if the variant has no such output.
+ */
+static inline uint32_t
+ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
+{
+	int idx = 0;
+
+	while (idx < so->outputs_count) {
+		if (so->outputs[idx].slot == slot)
+			return so->outputs[idx].regid;
+		idx++;
+	}
+
+	/* no output uses this slot: */
+	return regid(63, 0);
+}
+
+/* Returns the regid of the input that is flagged as a sysval and has
+ * the given slot, or regid(63, 0) if the variant has no such input.
+ */
+static inline uint32_t
+ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
+{
+	int idx = 0;
+
+	while (idx < so->inputs_count) {
+		if (so->inputs[idx].sysval && (so->inputs[idx].slot == slot))
+			return so->inputs[idx].regid;
+		idx++;
+	}
+
+	/* no sysval input uses this slot: */
+	return regid(63, 0);
+}
+
+/* Register footprint of a variant, counted in half-regs: every full
+ * register counts as two half-regs. The max_reg / max_half_reg values
+ * from v->info are each bumped by one before being summed up.
+ */
+static inline uint32_t
+ir3_shader_halfregs(const struct ir3_shader_variant *v)
+{
+	const uint32_t full_regs = v->info.max_reg + 1;
+	const uint32_t half_regs = v->info.max_half_reg + 1;
+
+	return (2 * full_regs) + half_regs;
+}
+
+#endif /* IR3_SHADER_H_ */
diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build
new file mode 100644
index 00000000000..07319dff595
--- /dev/null
+++ b/src/freedreno/ir3/meson.build
@@ -0,0 +1,64 @@
+# Copyright © 2018 Rob Clark
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Generate ir3_nir_trig.c by running ir3_nir_trig.py with python. The
+# script is pointed at the NIR sources with '-p', and its stdout is
+# captured as the output file. nir_algebraic_py is listed as an extra
+# dependency of the generated file.
+ir3_nir_trig_c = custom_target(
+ 'ir3_nir_trig.c',
+ input : 'ir3_nir_trig.py',
+ output : 'ir3_nir_trig.c',
+ command : [
+ prog_python, '@INPUT@',
+ '-p', join_paths(meson.source_root(), 'src/compiler/nir/'),
+ ],
+ capture : true,
+ depend_files : nir_algebraic_py,
+)
+
+# Hand-written sources/headers of the ir3 compiler (the generated
+# ir3_nir_trig.c is added separately below):
+libfreedreno_ir3_files = files(
+ 'disasm-a3xx.c',
+ 'instr-a3xx.h',
+ 'ir3.c',
+ 'ir3_compiler_nir.c',
+ 'ir3_compiler.c',
+ 'ir3_compiler.h',
+ 'ir3_cp.c',
+ 'ir3_depth.c',
+ 'ir3_group.c',
+ 'ir3.h',
+ 'ir3_legalize.c',
+ 'ir3_nir.c',
+ 'ir3_nir.h',
+ 'ir3_nir_lower_tg4_to_tex.c',
+ 'ir3_print.c',
+ 'ir3_ra.c',
+ 'ir3_sched.c',
+ 'ir3_shader.c',
+ 'ir3_shader.h',
+)
+
+# Static library with the hand-written and generated sources. It is not
+# built by default, ie. only when something else depends on it.
+libfreedreno_ir3 = static_library(
+ 'freedreno_ir3',
+ [libfreedreno_ir3_files, ir3_nir_trig_c],
+ include_directories : [inc_freedreno, inc_common],
+ c_args : [c_vis_args, no_override_init_args],
+ cpp_args : [cpp_vis_args],
+ dependencies : idep_nir_headers,
+ build_by_default : false,
+)
+
diff --git a/src/freedreno/meson.build b/src/freedreno/meson.build
index bb2cb201c0d..26ee6213890 100644
--- a/src/freedreno/meson.build
+++ b/src/freedreno/meson.build
@@ -21,3 +21,4 @@
inc_freedreno = include_directories('.')
subdir('drm')
+subdir('ir3')