diff options
Diffstat (limited to 'src/gallium/drivers/freedreno')
40 files changed, 37 insertions, 13732 deletions
diff --git a/src/gallium/drivers/freedreno/Automake.inc b/src/gallium/drivers/freedreno/Automake.inc index 9b9b3d39fea..936c286f4c9 100644 --- a/src/gallium/drivers/freedreno/Automake.inc +++ b/src/gallium/drivers/freedreno/Automake.inc @@ -6,6 +6,7 @@ TARGET_LIB_DEPS += \ $(top_builddir)/src/gallium/winsys/freedreno/drm/libfreedrenodrm.la \ $(top_builddir)/src/gallium/drivers/freedreno/libfreedreno.la \ $(top_builddir)/src/freedreno/libfreedreno_drm.la \ + $(top_builddir)/src/freedreno/libfreedreno_ir3.la \ $(FREEDRENO_LIBS) \ $(LIBDRM_LIBS) diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am index 39887e13e37..32130ab94c5 100644 --- a/src/gallium/drivers/freedreno/Makefile.am +++ b/src/gallium/drivers/freedreno/Makefile.am @@ -9,11 +9,6 @@ AM_CFLAGS = \ -I$(top_srcdir)/src/compiler/nir \ $(GALLIUM_DRIVER_CFLAGS) -MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D) -ir3/ir3_nir_trig.c: ir3/ir3_nir_trig.py $(top_srcdir)/src/compiler/nir/nir_algebraic.py - $(MKDIR_GEN) - $(AM_V_GEN) $(PYTHON) $(PYTHON_FLAGS) $(srcdir)/ir3/ir3_nir_trig.py -p $(top_srcdir)/src/compiler/nir > $@ || ($(RM) $@; false) - noinst_LTLIBRARIES = libfreedreno.la libfreedreno_la_SOURCES = \ @@ -23,28 +18,6 @@ libfreedreno_la_SOURCES = \ $(a4xx_SOURCES) \ $(a5xx_SOURCES) \ $(a6xx_SOURCES) \ - $(ir3_SOURCES) \ - $(ir3_GENERATED_FILES) - -BUILT_SOURCES := $(ir3_GENERATED_FILES) -CLEANFILES := $(BUILT_SOURCES) -EXTRA_DIST = ir3/ir3_nir_trig.py - -noinst_PROGRAMS = ir3_compiler - -# XXX: Required due to the C++ sources in libnir -nodist_EXTRA_ir3_compiler_SOURCES = dummy.cpp -ir3_compiler_SOURCES = \ - ir3/ir3_cmdline.c - -ir3_compiler_LDADD = \ - libfreedreno.la \ - $(top_builddir)/src/gallium/auxiliary/libgallium.la \ - $(top_builddir)/src/compiler/nir/libnir.la \ - $(top_builddir)/src/compiler/glsl/libstandalone.la \ - $(top_builddir)/src/util/libmesautil.la \ - $(top_builddir)/src/mesa/libmesagallium.la \ - $(top_builddir)/src/freedreno/libfreedreno_drm.la \ - 
$(GALLIUM_COMMON_LIB_DEPS) + $(ir3_SOURCES) -EXTRA_DIST += meson.build +EXTRA_DIST = meson.build diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources index bde217d80a2..039a8ca7af7 100644 --- a/src/gallium/drivers/freedreno/Makefile.sources +++ b/src/gallium/drivers/freedreno/Makefile.sources @@ -195,29 +195,8 @@ a6xx_SOURCES := \ a6xx/fd6_zsa.h ir3_SOURCES := \ - ir3/disasm-a3xx.c \ - ir3/instr-a3xx.h \ - ir3/ir3.c \ ir3/ir3_cache.c \ ir3/ir3_cache.h \ - ir3/ir3_compiler_nir.c \ - ir3/ir3_compiler.c \ - ir3/ir3_compiler.h \ - ir3/ir3_cp.c \ - ir3/ir3_depth.c \ ir3/ir3_gallium.c \ - ir3/ir3_gallium.h \ - ir3/ir3_group.c \ - ir3/ir3.h \ - ir3/ir3_legalize.c \ - ir3/ir3_nir.c \ - ir3/ir3_nir.h \ - ir3/ir3_nir_lower_tg4_to_tex.c \ - ir3/ir3_print.c \ - ir3/ir3_ra.c \ - ir3/ir3_sched.c \ - ir3/ir3_shader.c \ - ir3/ir3_shader.h + ir3/ir3_gallium.h -ir3_GENERATED_FILES := \ - ir3/ir3_nir_trig.c diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.h b/src/gallium/drivers/freedreno/a3xx/fd3_context.h index 4596aeee025..0c9412a7501 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_context.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.h @@ -31,7 +31,7 @@ #include "freedreno_context.h" -#include "ir3_shader.h" +#include "ir3/ir3_shader.h" struct fd3_context { diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.h b/src/gallium/drivers/freedreno/a3xx/fd3_program.h index 0551f1f8b91..533838a9a6d 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.h @@ -29,7 +29,8 @@ #include "pipe/p_context.h" #include "freedreno_context.h" -#include "ir3_shader.h" + +#include "ir3/ir3_shader.h" struct fd3_emit; diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c index a010a4df9a1..7ed57d2de5a 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c +++ 
b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c @@ -30,7 +30,8 @@ #include "fd3_screen.h" #include "fd3_context.h" #include "fd3_format.h" -#include "ir3_compiler.h" + +#include "ir3/ir3_compiler.h" static boolean fd3_screen_is_format_supported(struct pipe_screen *pscreen, diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.h b/src/gallium/drivers/freedreno/a4xx/fd4_context.h index a4b84d400ef..a84e3a90f83 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_context.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.h @@ -31,7 +31,7 @@ #include "freedreno_context.h" -#include "ir3_shader.h" +#include "ir3/ir3_shader.h" struct fd4_context { struct fd_context base; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.h b/src/gallium/drivers/freedreno/a4xx/fd4_program.h index cc98bc9a4d6..a0a0bec264f 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.h @@ -29,7 +29,8 @@ #include "pipe/p_context.h" #include "freedreno_context.h" -#include "ir3_shader.h" + +#include "ir3/ir3_shader.h" struct fd4_emit; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c index 4e4e274cd10..961e907b779 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c @@ -30,7 +30,8 @@ #include "fd4_screen.h" #include "fd4_context.h" #include "fd4_format.h" -#include "ir3_compiler.h" + +#include "ir3/ir3_compiler.h" static boolean fd4_screen_is_format_supported(struct pipe_screen *pscreen, diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_context.h b/src/gallium/drivers/freedreno/a5xx/fd5_context.h index 0cd252167b7..324878b4348 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_context.h +++ b/src/gallium/drivers/freedreno/a5xx/fd5_context.h @@ -31,7 +31,7 @@ #include "freedreno_context.h" -#include "ir3_shader.h" +#include "ir3/ir3_shader.h" struct fd5_context { struct fd_context base; diff --git 
a/src/gallium/drivers/freedreno/a5xx/fd5_program.h b/src/gallium/drivers/freedreno/a5xx/fd5_program.h index 72cbf9a8b88..cdb31c62b63 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_program.h +++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.h @@ -29,7 +29,8 @@ #include "pipe/p_context.h" #include "freedreno_context.h" -#include "ir3_shader.h" + +#include "ir3/ir3_shader.h" struct fd5_emit; diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_screen.c b/src/gallium/drivers/freedreno/a5xx/fd5_screen.c index 7d8d2b3e5b8..db961790879 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_screen.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_screen.c @@ -33,7 +33,7 @@ #include "fd5_format.h" #include "fd5_resource.h" -#include "ir3_compiler.h" +#include "ir3/ir3_compiler.h" static bool valid_sample_count(unsigned sample_count) diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_context.h b/src/gallium/drivers/freedreno/a6xx/fd6_context.h index f3cdd44dec4..2493813fe1a 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_context.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_context.h @@ -32,7 +32,7 @@ #include "freedreno_context.h" -#include "ir3_shader.h" +#include "ir3/ir3_shader.h" #include "a6xx.xml.h" diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.h b/src/gallium/drivers/freedreno/a6xx/fd6_program.h index 83c4688a243..3ed5426b50e 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.h @@ -30,7 +30,8 @@ #include "pipe/p_context.h" #include "freedreno_context.h" -#include "ir3_shader.h" + +#include "ir3/ir3_shader.h" #include "ir3_cache.h" struct fd6_streamout_state { diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_screen.c b/src/gallium/drivers/freedreno/a6xx/fd6_screen.c index 9e039bf87a9..a191ea696ba 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_screen.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_screen.c @@ -33,7 +33,7 @@ #include "fd6_format.h" #include "fd6_resource.h" 
-#include "ir3_compiler.h" +#include "ir3/ir3_compiler.h" static boolean fd6_screen_is_format_supported(struct pipe_screen *pscreen, diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c deleted file mode 100644 index 4cf45ce9227..00000000000 --- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c +++ /dev/null @@ -1,1038 +0,0 @@ -/* - * Copyright (c) 2013 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include <stdio.h> -#include <stdlib.h> -#include <stdint.h> -#include <stdbool.h> -#include <string.h> -#include <assert.h> - -#include <util/u_debug.h> - -#include "instr-a3xx.h" - -/* bitmask of debug flags */ -enum debug_t { - PRINT_RAW = 0x1, /* dump raw hexdump */ - PRINT_VERBOSE = 0x2, -}; - -static enum debug_t debug; - -#define printf debug_printf - -static const char *levels[] = { - "", - "\t", - "\t\t", - "\t\t\t", - "\t\t\t\t", - "\t\t\t\t\t", - "\t\t\t\t\t\t", - "\t\t\t\t\t\t\t", - "\t\t\t\t\t\t\t\t", - "\t\t\t\t\t\t\t\t\t", - "x", - "x", - "x", - "x", - "x", - "x", -}; - -static const char *component = "xyzw"; - -static const char *type[] = { - [TYPE_F16] = "f16", - [TYPE_F32] = "f32", - [TYPE_U16] = "u16", - [TYPE_U32] = "u32", - [TYPE_S16] = "s16", - [TYPE_S32] = "s32", - [TYPE_U8] = "u8", - [TYPE_S8] = "s8", -}; - -struct disasm_ctx { - FILE *out; - int level; - - /* current instruction repeat flag: */ - unsigned repeat; -}; - -static void print_reg(struct disasm_ctx *ctx, reg_t reg, bool full, bool r, - bool c, bool im, bool neg, bool abs, bool addr_rel) -{ - const char type = c ? 'c' : 'r'; - - // XXX I prefer - and || for neg/abs, but preserving format used - // by libllvm-a3xx for easy diffing.. - - if (abs && neg) - fprintf(ctx->out, "(absneg)"); - else if (neg) - fprintf(ctx->out, "(neg)"); - else if (abs) - fprintf(ctx->out, "(abs)"); - - if (r) - fprintf(ctx->out, "(r)"); - - if (im) { - fprintf(ctx->out, "%d", reg.iim_val); - } else if (addr_rel) { - /* I would just use %+d but trying to make it diff'able with - * libllvm-a3xx... - */ - if (reg.iim_val < 0) - fprintf(ctx->out, "%s%c<a0.x - %d>", full ? "" : "h", type, -reg.iim_val); - else if (reg.iim_val > 0) - fprintf(ctx->out, "%s%c<a0.x + %d>", full ? "" : "h", type, reg.iim_val); - else - fprintf(ctx->out, "%s%c<a0.x>", full ? 
"" : "h", type); - } else if ((reg.num == REG_A0) && !c) { - fprintf(ctx->out, "a0.%c", component[reg.comp]); - } else if ((reg.num == REG_P0) && !c) { - fprintf(ctx->out, "p0.%c", component[reg.comp]); - } else { - fprintf(ctx->out, "%s%c%d.%c", full ? "" : "h", type, reg.num & 0x3f, component[reg.comp]); - } -} - - -static void print_reg_dst(struct disasm_ctx *ctx, reg_t reg, bool full, bool addr_rel) -{ - print_reg(ctx, reg, full, false, false, false, false, false, addr_rel); -} - -static void print_reg_src(struct disasm_ctx *ctx, reg_t reg, bool full, bool r, - bool c, bool im, bool neg, bool abs, bool addr_rel) -{ - print_reg(ctx, reg, full, r, c, im, neg, abs, addr_rel); -} - -/* TODO switch to using reginfo struct everywhere, since more readable - * than passing a bunch of bools to print_reg_src - */ - -struct reginfo { - reg_t reg; - bool full; - bool r; - bool c; - bool im; - bool neg; - bool abs; - bool addr_rel; -}; - -static void print_src(struct disasm_ctx *ctx, struct reginfo *info) -{ - print_reg_src(ctx, info->reg, info->full, info->r, info->c, info->im, - info->neg, info->abs, info->addr_rel); -} - -//static void print_dst(struct disasm_ctx *ctx, struct reginfo *info) -//{ -// print_reg_dst(ctx, info->reg, info->full, info->addr_rel); -//} - -static void print_instr_cat0(struct disasm_ctx *ctx, instr_t *instr) -{ - instr_cat0_t *cat0 = &instr->cat0; - - switch (cat0->opc) { - case OPC_KILL: - fprintf(ctx->out, " %sp0.%c", cat0->inv ? "!" : "", - component[cat0->comp]); - break; - case OPC_BR: - fprintf(ctx->out, " %sp0.%c, #%d", cat0->inv ? "!" 
: "", - component[cat0->comp], cat0->a3xx.immed); - break; - case OPC_JUMP: - case OPC_CALL: - fprintf(ctx->out, " #%d", cat0->a3xx.immed); - break; - } - - if ((debug & PRINT_VERBOSE) && (cat0->dummy2|cat0->dummy3|cat0->dummy4)) - fprintf(ctx->out, "\t{0: %x,%x,%x}", cat0->dummy2, cat0->dummy3, cat0->dummy4); -} - -static void print_instr_cat1(struct disasm_ctx *ctx, instr_t *instr) -{ - instr_cat1_t *cat1 = &instr->cat1; - - if (cat1->ul) - fprintf(ctx->out, "(ul)"); - - if (cat1->src_type == cat1->dst_type) { - if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) { - /* special case (nmemonic?): */ - fprintf(ctx->out, "mova"); - } else { - fprintf(ctx->out, "mov.%s%s", type[cat1->src_type], type[cat1->dst_type]); - } - } else { - fprintf(ctx->out, "cov.%s%s", type[cat1->src_type], type[cat1->dst_type]); - } - - fprintf(ctx->out, " "); - - if (cat1->even) - fprintf(ctx->out, "(even)"); - - if (cat1->pos_inf) - fprintf(ctx->out, "(pos_infinity)"); - - print_reg_dst(ctx, (reg_t)(cat1->dst), type_size(cat1->dst_type) == 32, - cat1->dst_rel); - - fprintf(ctx->out, ", "); - - /* ugg, have to special case this.. vs print_reg().. */ - if (cat1->src_im) { - if (type_float(cat1->src_type)) - fprintf(ctx->out, "(%f)", cat1->fim_val); - else if (type_uint(cat1->src_type)) - fprintf(ctx->out, "0x%08x", cat1->uim_val); - else - fprintf(ctx->out, "%d", cat1->iim_val); - } else if (cat1->src_rel && !cat1->src_c) { - /* I would just use %+d but trying to make it diff'able with - * libllvm-a3xx... - */ - char type = cat1->src_rel_c ? 
'c' : 'r'; - if (cat1->off < 0) - fprintf(ctx->out, "%c<a0.x - %d>", type, -cat1->off); - else if (cat1->off > 0) - fprintf(ctx->out, "%c<a0.x + %d>", type, cat1->off); - else - fprintf(ctx->out, "%c<a0.x>", type); - } else { - print_reg_src(ctx, (reg_t)(cat1->src), type_size(cat1->src_type) == 32, - cat1->src_r, cat1->src_c, cat1->src_im, false, false, false); - } - - if ((debug & PRINT_VERBOSE) && (cat1->must_be_0)) - fprintf(ctx->out, "\t{1: %x}", cat1->must_be_0); -} - -static void print_instr_cat2(struct disasm_ctx *ctx, instr_t *instr) -{ - instr_cat2_t *cat2 = &instr->cat2; - static const char *cond[] = { - "lt", - "le", - "gt", - "ge", - "eq", - "ne", - "?6?", - }; - - switch (_OPC(2, cat2->opc)) { - case OPC_CMPS_F: - case OPC_CMPS_U: - case OPC_CMPS_S: - case OPC_CMPV_F: - case OPC_CMPV_U: - case OPC_CMPV_S: - fprintf(ctx->out, ".%s", cond[cat2->cond]); - break; - } - - fprintf(ctx->out, " "); - if (cat2->ei) - fprintf(ctx->out, "(ei)"); - print_reg_dst(ctx, (reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false); - fprintf(ctx->out, ", "); - - if (cat2->c1.src1_c) { - print_reg_src(ctx, (reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r, - cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg, - cat2->src1_abs, false); - } else if (cat2->rel1.src1_rel) { - print_reg_src(ctx, (reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r, - cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg, - cat2->src1_abs, cat2->rel1.src1_rel); - } else { - print_reg_src(ctx, (reg_t)(cat2->src1), cat2->full, cat2->src1_r, - false, cat2->src1_im, cat2->src1_neg, - cat2->src1_abs, false); - } - - switch (_OPC(2, cat2->opc)) { - case OPC_ABSNEG_F: - case OPC_ABSNEG_S: - case OPC_CLZ_B: - case OPC_CLZ_S: - case OPC_SIGN_F: - case OPC_FLOOR_F: - case OPC_CEIL_F: - case OPC_RNDNE_F: - case OPC_RNDAZ_F: - case OPC_TRUNC_F: - case OPC_NOT_B: - case OPC_BFREV_B: - case OPC_SETRM: - case OPC_CBITS_B: - /* these only have one src reg */ - break; - default: - fprintf(ctx->out, ", "); - if 
(cat2->c2.src2_c) { - print_reg_src(ctx, (reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r, - cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg, - cat2->src2_abs, false); - } else if (cat2->rel2.src2_rel) { - print_reg_src(ctx, (reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r, - cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg, - cat2->src2_abs, cat2->rel2.src2_rel); - } else { - print_reg_src(ctx, (reg_t)(cat2->src2), cat2->full, cat2->src2_r, - false, cat2->src2_im, cat2->src2_neg, - cat2->src2_abs, false); - } - break; - } -} - -static void print_instr_cat3(struct disasm_ctx *ctx, instr_t *instr) -{ - instr_cat3_t *cat3 = &instr->cat3; - bool full = instr_cat3_full(cat3); - - fprintf(ctx->out, " "); - print_reg_dst(ctx, (reg_t)(cat3->dst), full ^ cat3->dst_half, false); - fprintf(ctx->out, ", "); - if (cat3->c1.src1_c) { - print_reg_src(ctx, (reg_t)(cat3->c1.src1), full, - cat3->src1_r, cat3->c1.src1_c, false, cat3->src1_neg, - false, false); - } else if (cat3->rel1.src1_rel) { - print_reg_src(ctx, (reg_t)(cat3->rel1.src1), full, - cat3->src1_r, cat3->rel1.src1_c, false, cat3->src1_neg, - false, cat3->rel1.src1_rel); - } else { - print_reg_src(ctx, (reg_t)(cat3->src1), full, - cat3->src1_r, false, false, cat3->src1_neg, - false, false); - } - fprintf(ctx->out, ", "); - print_reg_src(ctx, (reg_t)cat3->src2, full, - cat3->src2_r, cat3->src2_c, false, cat3->src2_neg, - false, false); - fprintf(ctx->out, ", "); - if (cat3->c2.src3_c) { - print_reg_src(ctx, (reg_t)(cat3->c2.src3), full, - cat3->src3_r, cat3->c2.src3_c, false, cat3->src3_neg, - false, false); - } else if (cat3->rel2.src3_rel) { - print_reg_src(ctx, (reg_t)(cat3->rel2.src3), full, - cat3->src3_r, cat3->rel2.src3_c, false, cat3->src3_neg, - false, cat3->rel2.src3_rel); - } else { - print_reg_src(ctx, (reg_t)(cat3->src3), full, - cat3->src3_r, false, false, cat3->src3_neg, - false, false); - } -} - -static void print_instr_cat4(struct disasm_ctx *ctx, instr_t *instr) -{ - instr_cat4_t *cat4 = &instr->cat4; 
- - fprintf(ctx->out, " "); - print_reg_dst(ctx, (reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false); - fprintf(ctx->out, ", "); - - if (cat4->c.src_c) { - print_reg_src(ctx, (reg_t)(cat4->c.src), cat4->full, - cat4->src_r, cat4->c.src_c, cat4->src_im, - cat4->src_neg, cat4->src_abs, false); - } else if (cat4->rel.src_rel) { - print_reg_src(ctx, (reg_t)(cat4->rel.src), cat4->full, - cat4->src_r, cat4->rel.src_c, cat4->src_im, - cat4->src_neg, cat4->src_abs, cat4->rel.src_rel); - } else { - print_reg_src(ctx, (reg_t)(cat4->src), cat4->full, - cat4->src_r, false, cat4->src_im, - cat4->src_neg, cat4->src_abs, false); - } - - if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2)) - fprintf(ctx->out, "\t{4: %x,%x}", cat4->dummy1, cat4->dummy2); -} - -static void print_instr_cat5(struct disasm_ctx *ctx, instr_t *instr) -{ - static const struct { - bool src1, src2, samp, tex; - } info[0x1f] = { - [opc_op(OPC_ISAM)] = { true, false, true, true, }, - [opc_op(OPC_ISAML)] = { true, true, true, true, }, - [opc_op(OPC_ISAMM)] = { true, false, true, true, }, - [opc_op(OPC_SAM)] = { true, false, true, true, }, - [opc_op(OPC_SAMB)] = { true, true, true, true, }, - [opc_op(OPC_SAML)] = { true, true, true, true, }, - [opc_op(OPC_SAMGQ)] = { true, false, true, true, }, - [opc_op(OPC_GETLOD)] = { true, false, true, true, }, - [opc_op(OPC_CONV)] = { true, true, true, true, }, - [opc_op(OPC_CONVM)] = { true, true, true, true, }, - [opc_op(OPC_GETSIZE)] = { true, false, false, true, }, - [opc_op(OPC_GETBUF)] = { false, false, false, true, }, - [opc_op(OPC_GETPOS)] = { true, false, false, true, }, - [opc_op(OPC_GETINFO)] = { false, false, false, true, }, - [opc_op(OPC_DSX)] = { true, false, false, false, }, - [opc_op(OPC_DSY)] = { true, false, false, false, }, - [opc_op(OPC_GATHER4R)] = { true, false, true, true, }, - [opc_op(OPC_GATHER4G)] = { true, false, true, true, }, - [opc_op(OPC_GATHER4B)] = { true, false, true, true, }, - [opc_op(OPC_GATHER4A)] = { true, false, true, 
true, }, - [opc_op(OPC_SAMGP0)] = { true, false, true, true, }, - [opc_op(OPC_SAMGP1)] = { true, false, true, true, }, - [opc_op(OPC_SAMGP2)] = { true, false, true, true, }, - [opc_op(OPC_SAMGP3)] = { true, false, true, true, }, - [opc_op(OPC_DSXPP_1)] = { true, false, false, false, }, - [opc_op(OPC_DSYPP_1)] = { true, false, false, false, }, - [opc_op(OPC_RGETPOS)] = { false, false, false, false, }, - [opc_op(OPC_RGETINFO)] = { false, false, false, false, }, - }; - instr_cat5_t *cat5 = &instr->cat5; - int i; - - if (cat5->is_3d) fprintf(ctx->out, ".3d"); - if (cat5->is_a) fprintf(ctx->out, ".a"); - if (cat5->is_o) fprintf(ctx->out, ".o"); - if (cat5->is_p) fprintf(ctx->out, ".p"); - if (cat5->is_s) fprintf(ctx->out, ".s"); - if (cat5->is_s2en) fprintf(ctx->out, ".s2en"); - - fprintf(ctx->out, " "); - - switch (_OPC(5, cat5->opc)) { - case OPC_DSXPP_1: - case OPC_DSYPP_1: - break; - default: - fprintf(ctx->out, "(%s)", type[cat5->type]); - break; - } - - fprintf(ctx->out, "("); - for (i = 0; i < 4; i++) - if (cat5->wrmask & (1 << i)) - fprintf(ctx->out, "%c", "xyzw"[i]); - fprintf(ctx->out, ")"); - - print_reg_dst(ctx, (reg_t)(cat5->dst), type_size(cat5->type) == 32, false); - - if (info[cat5->opc].src1) { - fprintf(ctx->out, ", "); - print_reg_src(ctx, (reg_t)(cat5->src1), cat5->full, false, false, false, - false, false, false); - } - - if (cat5->is_s2en) { - fprintf(ctx->out, ", "); - print_reg_src(ctx, (reg_t)(cat5->s2en.src2), cat5->full, false, false, false, - false, false, false); - fprintf(ctx->out, ", "); - print_reg_src(ctx, (reg_t)(cat5->s2en.src3), false, false, false, false, - false, false, false); - } else { - if (cat5->is_o || info[cat5->opc].src2) { - fprintf(ctx->out, ", "); - print_reg_src(ctx, (reg_t)(cat5->norm.src2), cat5->full, - false, false, false, false, false, false); - } - if (info[cat5->opc].samp) - fprintf(ctx->out, ", s#%d", cat5->norm.samp); - if (info[cat5->opc].tex) - fprintf(ctx->out, ", t#%d", cat5->norm.tex); - } - - if (debug & 
PRINT_VERBOSE) { - if (cat5->is_s2en) { - if ((debug & PRINT_VERBOSE) && (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2)) - fprintf(ctx->out, "\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2); - } else { - if ((debug & PRINT_VERBOSE) && (cat5->norm.dummy1|cat5->dummy2)) - fprintf(ctx->out, "\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2); - } - } -} - -static void print_instr_cat6(struct disasm_ctx *ctx, instr_t *instr) -{ - instr_cat6_t *cat6 = &instr->cat6; - char sd = 0, ss = 0; /* dst/src address space */ - bool nodst = false; - struct reginfo dst, src1, src2; - int src1off = 0, dstoff = 0; - - memset(&dst, 0, sizeof(dst)); - memset(&src1, 0, sizeof(src1)); - memset(&src2, 0, sizeof(src2)); - - switch (_OPC(6, cat6->opc)) { - case OPC_RESINFO: - case OPC_RESFMT: - dst.full = type_size(cat6->type) == 32; - src1.full = type_size(cat6->type) == 32; - src2.full = type_size(cat6->type) == 32; - break; - case OPC_L2G: - case OPC_G2L: - dst.full = true; - src1.full = true; - src2.full = true; - break; - case OPC_STG: - case OPC_STL: - case OPC_STP: - case OPC_STI: - case OPC_STLW: - case OPC_STIB: - dst.full = true; - src1.full = type_size(cat6->type) == 32; - src2.full = type_size(cat6->type) == 32; - break; - default: - dst.full = type_size(cat6->type) == 32; - src1.full = true; - src2.full = true; - break; - } - - switch (_OPC(6, cat6->opc)) { - case OPC_PREFETCH: - break; - case OPC_RESINFO: - fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1); - break; - case OPC_LDGB: - fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped"); - fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1); - fprintf(ctx->out, ".%s", type[cat6->type]); - fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1); - break; - case OPC_STGB: - case OPC_STIB: - fprintf(ctx->out, ".%s", cat6->stgb.typed ? 
"typed" : "untyped"); - fprintf(ctx->out, ".%dd", cat6->stgb.d + 1); - fprintf(ctx->out, ".%s", type[cat6->type]); - fprintf(ctx->out, ".%d", cat6->stgb.type_size + 1); - break; - case OPC_ATOMIC_ADD: - case OPC_ATOMIC_SUB: - case OPC_ATOMIC_XCHG: - case OPC_ATOMIC_INC: - case OPC_ATOMIC_DEC: - case OPC_ATOMIC_CMPXCHG: - case OPC_ATOMIC_MIN: - case OPC_ATOMIC_MAX: - case OPC_ATOMIC_AND: - case OPC_ATOMIC_OR: - case OPC_ATOMIC_XOR: - ss = cat6->g ? 'g' : 'l'; - fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped"); - fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1); - fprintf(ctx->out, ".%s", type[cat6->type]); - fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1); - fprintf(ctx->out, ".%c", ss); - break; - default: - dst.im = cat6->g && !cat6->dst_off; - fprintf(ctx->out, ".%s", type[cat6->type]); - break; - } - fprintf(ctx->out, " "); - - switch (_OPC(6, cat6->opc)) { - case OPC_STG: - sd = 'g'; - break; - case OPC_STP: - sd = 'p'; - break; - case OPC_STL: - case OPC_STLW: - sd = 'l'; - break; - - case OPC_LDG: - case OPC_LDC: - ss = 'g'; - break; - case OPC_LDP: - ss = 'p'; - break; - case OPC_LDL: - case OPC_LDLW: - case OPC_LDLV: - ss = 'l'; - break; - - case OPC_L2G: - ss = 'l'; - sd = 'g'; - break; - - case OPC_G2L: - ss = 'g'; - sd = 'l'; - break; - - case OPC_PREFETCH: - ss = 'g'; - nodst = true; - break; - - case OPC_STI: - dst.full = false; // XXX or inverts?? 
- break; - } - - if ((_OPC(6, cat6->opc) == OPC_STGB) || (_OPC(6, cat6->opc) == OPC_STIB)) { - struct reginfo src3; - - memset(&src3, 0, sizeof(src3)); - - src1.reg = (reg_t)(cat6->stgb.src1); - src2.reg = (reg_t)(cat6->stgb.src2); - src2.im = cat6->stgb.src2_im; - src3.reg = (reg_t)(cat6->stgb.src3); - src3.im = cat6->stgb.src3_im; - src3.full = true; - - fprintf(ctx->out, "g[%u], ", cat6->stgb.dst_ssbo); - print_src(ctx, &src1); - fprintf(ctx->out, ", "); - print_src(ctx, &src2); - fprintf(ctx->out, ", "); - print_src(ctx, &src3); - - if (debug & PRINT_VERBOSE) - fprintf(ctx->out, " (pad0=%x, pad3=%x)", cat6->stgb.pad0, cat6->stgb.pad3); - - return; - } - - if (is_atomic(_OPC(6, cat6->opc))) { - - src1.reg = (reg_t)(cat6->ldgb.src1); - src1.im = cat6->ldgb.src1_im; - src2.reg = (reg_t)(cat6->ldgb.src2); - src2.im = cat6->ldgb.src2_im; - dst.reg = (reg_t)(cat6->ldgb.dst); - - print_src(ctx, &dst); - fprintf(ctx->out, ", "); - if (ss == 'g') { - struct reginfo src3; - memset(&src3, 0, sizeof(src3)); - - src3.reg = (reg_t)(cat6->ldgb.src3); - src3.full = true; - - /* For images, the ".typed" variant is used and src2 is - * the ivecN coordinates, ie ivec2 for 2d. - * - * For SSBOs, the ".untyped" variant is used and src2 is - * a simple dword offset.. src3 appears to be - * uvec2(offset * 4, 0). Not sure the point of that. - */ - - fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo); - print_src(ctx, &src1); /* value */ - fprintf(ctx->out, ", "); - print_src(ctx, &src2); /* offset/coords */ - fprintf(ctx->out, ", "); - print_src(ctx, &src3); /* 64b byte offset.. 
*/ - - if (debug & PRINT_VERBOSE) { - fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, - cat6->ldgb.pad3, cat6->ldgb.mustbe0); - } - } else { /* ss == 'l' */ - fprintf(ctx->out, "l["); - print_src(ctx, &src1); /* simple byte offset */ - fprintf(ctx->out, "], "); - print_src(ctx, &src2); /* value */ - - if (debug & PRINT_VERBOSE) { - fprintf(ctx->out, " (src3=%x, pad0=%x, pad3=%x, mustbe0=%x)", - cat6->ldgb.src3, cat6->ldgb.pad0, - cat6->ldgb.pad3, cat6->ldgb.mustbe0); - } - } - - return; - } else if (_OPC(6, cat6->opc) == OPC_RESINFO) { - dst.reg = (reg_t)(cat6->ldgb.dst); - - print_src(ctx, &dst); - fprintf(ctx->out, ", "); - fprintf(ctx->out, "g[%u]", cat6->ldgb.src_ssbo); - - return; - } else if (_OPC(6, cat6->opc) == OPC_LDGB) { - - src1.reg = (reg_t)(cat6->ldgb.src1); - src1.im = cat6->ldgb.src1_im; - src2.reg = (reg_t)(cat6->ldgb.src2); - src2.im = cat6->ldgb.src2_im; - dst.reg = (reg_t)(cat6->ldgb.dst); - - print_src(ctx, &dst); - fprintf(ctx->out, ", "); - fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo); - print_src(ctx, &src1); - fprintf(ctx->out, ", "); - print_src(ctx, &src2); - - if (debug & PRINT_VERBOSE) - fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, cat6->ldgb.pad3, cat6->ldgb.mustbe0); - - return; - } - if (cat6->dst_off) { - dst.reg = (reg_t)(cat6->c.dst); - dstoff = cat6->c.off; - } else { - dst.reg = (reg_t)(cat6->d.dst); - } - - if (cat6->src_off) { - src1.reg = (reg_t)(cat6->a.src1); - src1.im = cat6->a.src1_im; - src2.reg = (reg_t)(cat6->a.src2); - src2.im = cat6->a.src2_im; - src1off = cat6->a.off; - } else { - src1.reg = (reg_t)(cat6->b.src1); - src1.im = cat6->b.src1_im; - src2.reg = (reg_t)(cat6->b.src2); - src2.im = cat6->b.src2_im; - } - - if (!nodst) { - if (sd) - fprintf(ctx->out, "%c[", sd); - /* note: dst might actually be a src (ie. 
address to store to) */ - print_src(ctx, &dst); - if (dstoff) - fprintf(ctx->out, "%+d", dstoff); - if (sd) - fprintf(ctx->out, "]"); - fprintf(ctx->out, ", "); - } - - if (ss) - fprintf(ctx->out, "%c[", ss); - - /* can have a larger than normal immed, so hack: */ - if (src1.im) { - fprintf(ctx->out, "%u", src1.reg.dummy13); - } else { - print_src(ctx, &src1); - } - - if (src1off) - fprintf(ctx->out, "%+d", src1off); - if (ss) - fprintf(ctx->out, "]"); - - switch (_OPC(6, cat6->opc)) { - case OPC_RESINFO: - case OPC_RESFMT: - break; - default: - fprintf(ctx->out, ", "); - print_src(ctx, &src2); - break; - } -} - -static void print_instr_cat7(struct disasm_ctx *ctx, instr_t *instr) -{ - instr_cat7_t *cat7 = &instr->cat7; - - if (cat7->g) - fprintf(ctx->out, ".g"); - if (cat7->l) - fprintf(ctx->out, ".l"); - - if (_OPC(7, cat7->opc) == OPC_FENCE) { - if (cat7->r) - fprintf(ctx->out, ".r"); - if (cat7->w) - fprintf(ctx->out, ".w"); - } -} - -/* size of largest OPC field of all the instruction categories: */ -#define NOPC_BITS 6 - -static const struct opc_info { - uint16_t cat; - uint16_t opc; - const char *name; - void (*print)(struct disasm_ctx *ctx, instr_t *instr); -} opcs[1 << (3+NOPC_BITS)] = { -#define OPC(cat, opc, name) [(opc)] = { (cat), (opc), #name, print_instr_cat##cat } - /* category 0: */ - OPC(0, OPC_NOP, nop), - OPC(0, OPC_BR, br), - OPC(0, OPC_JUMP, jump), - OPC(0, OPC_CALL, call), - OPC(0, OPC_RET, ret), - OPC(0, OPC_KILL, kill), - OPC(0, OPC_END, end), - OPC(0, OPC_EMIT, emit), - OPC(0, OPC_CUT, cut), - OPC(0, OPC_CHMASK, chmask), - OPC(0, OPC_CHSH, chsh), - OPC(0, OPC_FLOW_REV, flow_rev), - - /* category 1: */ - OPC(1, OPC_MOV, ), - - /* category 2: */ - OPC(2, OPC_ADD_F, add.f), - OPC(2, OPC_MIN_F, min.f), - OPC(2, OPC_MAX_F, max.f), - OPC(2, OPC_MUL_F, mul.f), - OPC(2, OPC_SIGN_F, sign.f), - OPC(2, OPC_CMPS_F, cmps.f), - OPC(2, OPC_ABSNEG_F, absneg.f), - OPC(2, OPC_CMPV_F, cmpv.f), - OPC(2, OPC_FLOOR_F, floor.f), - OPC(2, OPC_CEIL_F, ceil.f), - 
OPC(2, OPC_RNDNE_F, rndne.f), - OPC(2, OPC_RNDAZ_F, rndaz.f), - OPC(2, OPC_TRUNC_F, trunc.f), - OPC(2, OPC_ADD_U, add.u), - OPC(2, OPC_ADD_S, add.s), - OPC(2, OPC_SUB_U, sub.u), - OPC(2, OPC_SUB_S, sub.s), - OPC(2, OPC_CMPS_U, cmps.u), - OPC(2, OPC_CMPS_S, cmps.s), - OPC(2, OPC_MIN_U, min.u), - OPC(2, OPC_MIN_S, min.s), - OPC(2, OPC_MAX_U, max.u), - OPC(2, OPC_MAX_S, max.s), - OPC(2, OPC_ABSNEG_S, absneg.s), - OPC(2, OPC_AND_B, and.b), - OPC(2, OPC_OR_B, or.b), - OPC(2, OPC_NOT_B, not.b), - OPC(2, OPC_XOR_B, xor.b), - OPC(2, OPC_CMPV_U, cmpv.u), - OPC(2, OPC_CMPV_S, cmpv.s), - OPC(2, OPC_MUL_U, mul.u), - OPC(2, OPC_MUL_S, mul.s), - OPC(2, OPC_MULL_U, mull.u), - OPC(2, OPC_BFREV_B, bfrev.b), - OPC(2, OPC_CLZ_S, clz.s), - OPC(2, OPC_CLZ_B, clz.b), - OPC(2, OPC_SHL_B, shl.b), - OPC(2, OPC_SHR_B, shr.b), - OPC(2, OPC_ASHR_B, ashr.b), - OPC(2, OPC_BARY_F, bary.f), - OPC(2, OPC_MGEN_B, mgen.b), - OPC(2, OPC_GETBIT_B, getbit.b), - OPC(2, OPC_SETRM, setrm), - OPC(2, OPC_CBITS_B, cbits.b), - OPC(2, OPC_SHB, shb), - OPC(2, OPC_MSAD, msad), - - /* category 3: */ - OPC(3, OPC_MAD_U16, mad.u16), - OPC(3, OPC_MADSH_U16, madsh.u16), - OPC(3, OPC_MAD_S16, mad.s16), - OPC(3, OPC_MADSH_M16, madsh.m16), - OPC(3, OPC_MAD_U24, mad.u24), - OPC(3, OPC_MAD_S24, mad.s24), - OPC(3, OPC_MAD_F16, mad.f16), - OPC(3, OPC_MAD_F32, mad.f32), - OPC(3, OPC_SEL_B16, sel.b16), - OPC(3, OPC_SEL_B32, sel.b32), - OPC(3, OPC_SEL_S16, sel.s16), - OPC(3, OPC_SEL_S32, sel.s32), - OPC(3, OPC_SEL_F16, sel.f16), - OPC(3, OPC_SEL_F32, sel.f32), - OPC(3, OPC_SAD_S16, sad.s16), - OPC(3, OPC_SAD_S32, sad.s32), - - /* category 4: */ - OPC(4, OPC_RCP, rcp), - OPC(4, OPC_RSQ, rsq), - OPC(4, OPC_LOG2, log2), - OPC(4, OPC_EXP2, exp2), - OPC(4, OPC_SIN, sin), - OPC(4, OPC_COS, cos), - OPC(4, OPC_SQRT, sqrt), - - /* category 5: */ - OPC(5, OPC_ISAM, isam), - OPC(5, OPC_ISAML, isaml), - OPC(5, OPC_ISAMM, isamm), - OPC(5, OPC_SAM, sam), - OPC(5, OPC_SAMB, samb), - OPC(5, OPC_SAML, saml), - OPC(5, OPC_SAMGQ, samgq), - 
OPC(5, OPC_GETLOD, getlod), - OPC(5, OPC_CONV, conv), - OPC(5, OPC_CONVM, convm), - OPC(5, OPC_GETSIZE, getsize), - OPC(5, OPC_GETBUF, getbuf), - OPC(5, OPC_GETPOS, getpos), - OPC(5, OPC_GETINFO, getinfo), - OPC(5, OPC_DSX, dsx), - OPC(5, OPC_DSY, dsy), - OPC(5, OPC_GATHER4R, gather4r), - OPC(5, OPC_GATHER4G, gather4g), - OPC(5, OPC_GATHER4B, gather4b), - OPC(5, OPC_GATHER4A, gather4a), - OPC(5, OPC_SAMGP0, samgp0), - OPC(5, OPC_SAMGP1, samgp1), - OPC(5, OPC_SAMGP2, samgp2), - OPC(5, OPC_SAMGP3, samgp3), - OPC(5, OPC_DSXPP_1, dsxpp.1), - OPC(5, OPC_DSYPP_1, dsypp.1), - OPC(5, OPC_RGETPOS, rgetpos), - OPC(5, OPC_RGETINFO, rgetinfo), - - - /* category 6: */ - OPC(6, OPC_LDG, ldg), - OPC(6, OPC_LDL, ldl), - OPC(6, OPC_LDP, ldp), - OPC(6, OPC_STG, stg), - OPC(6, OPC_STL, stl), - OPC(6, OPC_STP, stp), - OPC(6, OPC_STI, sti), - OPC(6, OPC_G2L, g2l), - OPC(6, OPC_L2G, l2g), - OPC(6, OPC_PREFETCH, prefetch), - OPC(6, OPC_LDLW, ldlw), - OPC(6, OPC_STLW, stlw), - OPC(6, OPC_RESFMT, resfmt), - OPC(6, OPC_RESINFO, resinfo), - OPC(6, OPC_ATOMIC_ADD, atomic.add), - OPC(6, OPC_ATOMIC_SUB, atomic.sub), - OPC(6, OPC_ATOMIC_XCHG, atomic.xchg), - OPC(6, OPC_ATOMIC_INC, atomic.inc), - OPC(6, OPC_ATOMIC_DEC, atomic.dec), - OPC(6, OPC_ATOMIC_CMPXCHG, atomic.cmpxchg), - OPC(6, OPC_ATOMIC_MIN, atomic.min), - OPC(6, OPC_ATOMIC_MAX, atomic.max), - OPC(6, OPC_ATOMIC_AND, atomic.and), - OPC(6, OPC_ATOMIC_OR, atomic.or), - OPC(6, OPC_ATOMIC_XOR, atomic.xor), - OPC(6, OPC_LDGB, ldgb), - OPC(6, OPC_STGB, stgb), - OPC(6, OPC_STIB, stib), - OPC(6, OPC_LDC, ldc), - OPC(6, OPC_LDLV, ldlv), - - OPC(7, OPC_BAR, bar), - OPC(7, OPC_FENCE, fence), - -#undef OPC -}; - -#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)])) - -// XXX hack.. 
probably should move this table somewhere common: -#include "ir3.h" -const char *ir3_instr_name(struct ir3_instruction *instr) -{ - if (opc_cat(instr->opc) == -1) return "??meta??"; - return opcs[instr->opc].name; -} - -static bool print_instr(struct disasm_ctx *ctx, uint32_t *dwords, int n) -{ - instr_t *instr = (instr_t *)dwords; - uint32_t opc = instr_opc(instr); - const char *name; - - if (debug & PRINT_VERBOSE) - fprintf(ctx->out, "%s%04d[%08xx_%08xx] ", levels[ctx->level], n, dwords[1], dwords[0]); - - /* NOTE: order flags are printed is a bit fugly.. but for now I - * try to match the order in llvm-a3xx disassembler for easy - * diff'ing.. - */ - - ctx->repeat = instr_repeat(instr); - - if (instr->sync) - fprintf(ctx->out, "(sy)"); - if (instr->ss && ((instr->opc_cat <= 4) || (instr->opc_cat == 7))) - fprintf(ctx->out, "(ss)"); - if (instr->jmp_tgt) - fprintf(ctx->out, "(jp)"); - if (instr_sat(instr)) - fprintf(ctx->out, "(sat)"); - if (ctx->repeat) - fprintf(ctx->out, "(rpt%d)", ctx->repeat); - if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4))) - fprintf(ctx->out, "(ul)"); - - name = GETINFO(instr)->name; - - if (name) { - fprintf(ctx->out, "%s", name); - GETINFO(instr)->print(ctx, instr); - } else { - fprintf(ctx->out, "unknown(%d,%d)", instr->opc_cat, opc); - } - - fprintf(ctx->out, "\n"); - - return (instr->opc_cat == 0) && (opc == OPC_END); -} - -int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out) -{ - struct disasm_ctx ctx; - int i; - - assert((sizedwords % 2) == 0); - - memset(&ctx, 0, sizeof(ctx)); - ctx.out = out; - ctx.level = level; - - for (i = 0; i < sizedwords; i += 2) - print_instr(&ctx, &dwords[i], i/2); - - return 0; -} diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h deleted file mode 100644 index 7f60ee5fd4c..00000000000 --- a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h +++ /dev/null @@ -1,872 +0,0 @@ -/* - * Copyright (c) 2013 Rob Clark 
<[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef INSTR_A3XX_H_ -#define INSTR_A3XX_H_ - -#define PACKED __attribute__((__packed__)) - -#include <stdint.h> -#include <stdio.h> -#include <stdbool.h> -#include <assert.h> - -/* size of largest OPC field of all the instruction categories: */ -#define NOPC_BITS 6 - -#define _OPC(cat, opc) (((cat) << NOPC_BITS) | opc) - -typedef enum { - /* category 0: */ - OPC_NOP = _OPC(0, 0), - OPC_BR = _OPC(0, 1), - OPC_JUMP = _OPC(0, 2), - OPC_CALL = _OPC(0, 3), - OPC_RET = _OPC(0, 4), - OPC_KILL = _OPC(0, 5), - OPC_END = _OPC(0, 6), - OPC_EMIT = _OPC(0, 7), - OPC_CUT = _OPC(0, 8), - OPC_CHMASK = _OPC(0, 9), - OPC_CHSH = _OPC(0, 10), - OPC_FLOW_REV = _OPC(0, 11), - - /* category 1: */ - OPC_MOV = _OPC(1, 0), - - /* category 2: */ - OPC_ADD_F = _OPC(2, 0), - OPC_MIN_F = _OPC(2, 1), - OPC_MAX_F = _OPC(2, 2), - OPC_MUL_F = _OPC(2, 3), - OPC_SIGN_F = _OPC(2, 4), - OPC_CMPS_F = _OPC(2, 5), - OPC_ABSNEG_F = _OPC(2, 6), - OPC_CMPV_F = _OPC(2, 7), - /* 8 - invalid */ - OPC_FLOOR_F = _OPC(2, 9), - OPC_CEIL_F = _OPC(2, 10), - OPC_RNDNE_F = _OPC(2, 11), - OPC_RNDAZ_F = _OPC(2, 12), - OPC_TRUNC_F = _OPC(2, 13), - /* 14-15 - invalid */ - OPC_ADD_U = _OPC(2, 16), - OPC_ADD_S = _OPC(2, 17), - OPC_SUB_U = _OPC(2, 18), - OPC_SUB_S = _OPC(2, 19), - OPC_CMPS_U = _OPC(2, 20), - OPC_CMPS_S = _OPC(2, 21), - OPC_MIN_U = _OPC(2, 22), - OPC_MIN_S = _OPC(2, 23), - OPC_MAX_U = _OPC(2, 24), - OPC_MAX_S = _OPC(2, 25), - OPC_ABSNEG_S = _OPC(2, 26), - /* 27 - invalid */ - OPC_AND_B = _OPC(2, 28), - OPC_OR_B = _OPC(2, 29), - OPC_NOT_B = _OPC(2, 30), - OPC_XOR_B = _OPC(2, 31), - /* 32 - invalid */ - OPC_CMPV_U = _OPC(2, 33), - OPC_CMPV_S = _OPC(2, 34), - /* 35-47 - invalid */ - OPC_MUL_U = _OPC(2, 48), - OPC_MUL_S = _OPC(2, 49), - OPC_MULL_U = _OPC(2, 50), - OPC_BFREV_B = _OPC(2, 51), - OPC_CLZ_S = _OPC(2, 52), - OPC_CLZ_B = _OPC(2, 53), - OPC_SHL_B = _OPC(2, 54), - OPC_SHR_B = _OPC(2, 55), - OPC_ASHR_B = _OPC(2, 56), - OPC_BARY_F = _OPC(2, 57), - OPC_MGEN_B = _OPC(2, 58), - OPC_GETBIT_B = _OPC(2, 
59), - OPC_SETRM = _OPC(2, 60), - OPC_CBITS_B = _OPC(2, 61), - OPC_SHB = _OPC(2, 62), - OPC_MSAD = _OPC(2, 63), - - /* category 3: */ - OPC_MAD_U16 = _OPC(3, 0), - OPC_MADSH_U16 = _OPC(3, 1), - OPC_MAD_S16 = _OPC(3, 2), - OPC_MADSH_M16 = _OPC(3, 3), /* should this be .s16? */ - OPC_MAD_U24 = _OPC(3, 4), - OPC_MAD_S24 = _OPC(3, 5), - OPC_MAD_F16 = _OPC(3, 6), - OPC_MAD_F32 = _OPC(3, 7), - OPC_SEL_B16 = _OPC(3, 8), - OPC_SEL_B32 = _OPC(3, 9), - OPC_SEL_S16 = _OPC(3, 10), - OPC_SEL_S32 = _OPC(3, 11), - OPC_SEL_F16 = _OPC(3, 12), - OPC_SEL_F32 = _OPC(3, 13), - OPC_SAD_S16 = _OPC(3, 14), - OPC_SAD_S32 = _OPC(3, 15), - - /* category 4: */ - OPC_RCP = _OPC(4, 0), - OPC_RSQ = _OPC(4, 1), - OPC_LOG2 = _OPC(4, 2), - OPC_EXP2 = _OPC(4, 3), - OPC_SIN = _OPC(4, 4), - OPC_COS = _OPC(4, 5), - OPC_SQRT = _OPC(4, 6), - // 7-63 - invalid - - /* category 5: */ - OPC_ISAM = _OPC(5, 0), - OPC_ISAML = _OPC(5, 1), - OPC_ISAMM = _OPC(5, 2), - OPC_SAM = _OPC(5, 3), - OPC_SAMB = _OPC(5, 4), - OPC_SAML = _OPC(5, 5), - OPC_SAMGQ = _OPC(5, 6), - OPC_GETLOD = _OPC(5, 7), - OPC_CONV = _OPC(5, 8), - OPC_CONVM = _OPC(5, 9), - OPC_GETSIZE = _OPC(5, 10), - OPC_GETBUF = _OPC(5, 11), - OPC_GETPOS = _OPC(5, 12), - OPC_GETINFO = _OPC(5, 13), - OPC_DSX = _OPC(5, 14), - OPC_DSY = _OPC(5, 15), - OPC_GATHER4R = _OPC(5, 16), - OPC_GATHER4G = _OPC(5, 17), - OPC_GATHER4B = _OPC(5, 18), - OPC_GATHER4A = _OPC(5, 19), - OPC_SAMGP0 = _OPC(5, 20), - OPC_SAMGP1 = _OPC(5, 21), - OPC_SAMGP2 = _OPC(5, 22), - OPC_SAMGP3 = _OPC(5, 23), - OPC_DSXPP_1 = _OPC(5, 24), - OPC_DSYPP_1 = _OPC(5, 25), - OPC_RGETPOS = _OPC(5, 26), - OPC_RGETINFO = _OPC(5, 27), - - /* category 6: */ - OPC_LDG = _OPC(6, 0), /* load-global */ - OPC_LDL = _OPC(6, 1), - OPC_LDP = _OPC(6, 2), - OPC_STG = _OPC(6, 3), /* store-global */ - OPC_STL = _OPC(6, 4), - OPC_STP = _OPC(6, 5), - OPC_STI = _OPC(6, 6), - OPC_G2L = _OPC(6, 7), - OPC_L2G = _OPC(6, 8), - OPC_PREFETCH = _OPC(6, 9), - OPC_LDLW = _OPC(6, 10), - OPC_STLW = _OPC(6, 11), - OPC_RESFMT = 
_OPC(6, 14), - OPC_RESINFO = _OPC(6, 15), - OPC_ATOMIC_ADD = _OPC(6, 16), - OPC_ATOMIC_SUB = _OPC(6, 17), - OPC_ATOMIC_XCHG = _OPC(6, 18), - OPC_ATOMIC_INC = _OPC(6, 19), - OPC_ATOMIC_DEC = _OPC(6, 20), - OPC_ATOMIC_CMPXCHG = _OPC(6, 21), - OPC_ATOMIC_MIN = _OPC(6, 22), - OPC_ATOMIC_MAX = _OPC(6, 23), - OPC_ATOMIC_AND = _OPC(6, 24), - OPC_ATOMIC_OR = _OPC(6, 25), - OPC_ATOMIC_XOR = _OPC(6, 26), - OPC_LDGB = _OPC(6, 27), - OPC_STGB = _OPC(6, 28), - OPC_STIB = _OPC(6, 29), - OPC_LDC = _OPC(6, 30), - OPC_LDLV = _OPC(6, 31), - - /* category 7: */ - OPC_BAR = _OPC(7, 0), - OPC_FENCE = _OPC(7, 1), - - /* meta instructions (category -1): */ - /* placeholder instr to mark shader inputs: */ - OPC_META_INPUT = _OPC(-1, 0), - /* The "fan-in" and "fan-out" instructions are used for keeping - * track of instructions that write to multiple dst registers - * (fan-out) like texture sample instructions, or read multiple - * consecutive scalar registers (fan-in) (bary.f, texture samp) - */ - OPC_META_FO = _OPC(-1, 2), - OPC_META_FI = _OPC(-1, 3), - -} opc_t; - -#define opc_cat(opc) ((int)((opc) >> NOPC_BITS)) -#define opc_op(opc) ((unsigned)((opc) & ((1 << NOPC_BITS) - 1))) - -typedef enum { - TYPE_F16 = 0, - TYPE_F32 = 1, - TYPE_U16 = 2, - TYPE_U32 = 3, - TYPE_S16 = 4, - TYPE_S32 = 5, - TYPE_U8 = 6, - TYPE_S8 = 7, // XXX I assume? 
-} type_t; - -static inline uint32_t type_size(type_t type) -{ - switch (type) { - case TYPE_F32: - case TYPE_U32: - case TYPE_S32: - return 32; - case TYPE_F16: - case TYPE_U16: - case TYPE_S16: - return 16; - case TYPE_U8: - case TYPE_S8: - return 8; - default: - assert(0); /* invalid type */ - return 0; - } -} - -static inline int type_float(type_t type) -{ - return (type == TYPE_F32) || (type == TYPE_F16); -} - -static inline int type_uint(type_t type) -{ - return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8); -} - -static inline int type_sint(type_t type) -{ - return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8); -} - -typedef union PACKED { - /* normal gpr or const src register: */ - struct PACKED { - uint32_t comp : 2; - uint32_t num : 10; - }; - /* for immediate val: */ - int32_t iim_val : 11; - /* to make compiler happy: */ - uint32_t dummy32; - uint32_t dummy10 : 10; - int32_t idummy10 : 10; - uint32_t dummy11 : 11; - uint32_t dummy12 : 12; - uint32_t dummy13 : 13; - uint32_t dummy8 : 8; -} reg_t; - -/* special registers: */ -#define REG_A0 61 /* address register */ -#define REG_P0 62 /* predicate register */ - -static inline int reg_special(reg_t reg) -{ - return (reg.num == REG_A0) || (reg.num == REG_P0); -} - -typedef struct PACKED { - /* dword0: */ - union PACKED { - struct PACKED { - int16_t immed : 16; - uint32_t dummy1 : 16; - } a3xx; - struct PACKED { - int32_t immed : 20; - uint32_t dummy1 : 12; - } a4xx; - struct PACKED { - int32_t immed : 32; - } a5xx; - }; - - /* dword1: */ - uint32_t dummy2 : 8; - uint32_t repeat : 3; - uint32_t dummy3 : 1; - uint32_t ss : 1; - uint32_t dummy4 : 7; - uint32_t inv : 1; - uint32_t comp : 2; - uint32_t opc : 4; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; -} instr_cat0_t; - -typedef struct PACKED { - /* dword0: */ - union PACKED { - /* for normal src register: */ - struct PACKED { - uint32_t src : 11; - /* at least low bit of pad must be zero or it will - * 
look like a address relative src - */ - uint32_t pad : 21; - }; - /* for address relative: */ - struct PACKED { - int32_t off : 10; - uint32_t src_rel_c : 1; - uint32_t src_rel : 1; - uint32_t unknown : 20; - }; - /* for immediate: */ - int32_t iim_val; - uint32_t uim_val; - float fim_val; - }; - - /* dword1: */ - uint32_t dst : 8; - uint32_t repeat : 3; - uint32_t src_r : 1; - uint32_t ss : 1; - uint32_t ul : 1; - uint32_t dst_type : 3; - uint32_t dst_rel : 1; - uint32_t src_type : 3; - uint32_t src_c : 1; - uint32_t src_im : 1; - uint32_t even : 1; - uint32_t pos_inf : 1; - uint32_t must_be_0 : 2; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; -} instr_cat1_t; - -typedef struct PACKED { - /* dword0: */ - union PACKED { - struct PACKED { - uint32_t src1 : 11; - uint32_t must_be_zero1: 2; - uint32_t src1_im : 1; /* immediate */ - uint32_t src1_neg : 1; /* negate */ - uint32_t src1_abs : 1; /* absolute value */ - }; - struct PACKED { - uint32_t src1 : 10; - uint32_t src1_c : 1; /* relative-const */ - uint32_t src1_rel : 1; /* relative address */ - uint32_t must_be_zero : 1; - uint32_t dummy : 3; - } rel1; - struct PACKED { - uint32_t src1 : 12; - uint32_t src1_c : 1; /* const */ - uint32_t dummy : 3; - } c1; - }; - - union PACKED { - struct PACKED { - uint32_t src2 : 11; - uint32_t must_be_zero2: 2; - uint32_t src2_im : 1; /* immediate */ - uint32_t src2_neg : 1; /* negate */ - uint32_t src2_abs : 1; /* absolute value */ - }; - struct PACKED { - uint32_t src2 : 10; - uint32_t src2_c : 1; /* relative-const */ - uint32_t src2_rel : 1; /* relative address */ - uint32_t must_be_zero : 1; - uint32_t dummy : 3; - } rel2; - struct PACKED { - uint32_t src2 : 12; - uint32_t src2_c : 1; /* const */ - uint32_t dummy : 3; - } c2; - }; - - /* dword1: */ - uint32_t dst : 8; - uint32_t repeat : 2; - uint32_t sat : 1; - uint32_t src1_r : 1; - uint32_t ss : 1; - uint32_t ul : 1; /* dunno */ - uint32_t dst_half : 1; /* or widen/narrow.. ie. 
dst hrN <-> rN */ - uint32_t ei : 1; - uint32_t cond : 3; - uint32_t src2_r : 1; - uint32_t full : 1; /* not half */ - uint32_t opc : 6; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; -} instr_cat2_t; - -typedef struct PACKED { - /* dword0: */ - union PACKED { - struct PACKED { - uint32_t src1 : 11; - uint32_t must_be_zero1: 2; - uint32_t src2_c : 1; - uint32_t src1_neg : 1; - uint32_t src2_r : 1; - }; - struct PACKED { - uint32_t src1 : 10; - uint32_t src1_c : 1; - uint32_t src1_rel : 1; - uint32_t must_be_zero : 1; - uint32_t dummy : 3; - } rel1; - struct PACKED { - uint32_t src1 : 12; - uint32_t src1_c : 1; - uint32_t dummy : 3; - } c1; - }; - - union PACKED { - struct PACKED { - uint32_t src3 : 11; - uint32_t must_be_zero2: 2; - uint32_t src3_r : 1; - uint32_t src2_neg : 1; - uint32_t src3_neg : 1; - }; - struct PACKED { - uint32_t src3 : 10; - uint32_t src3_c : 1; - uint32_t src3_rel : 1; - uint32_t must_be_zero : 1; - uint32_t dummy : 3; - } rel2; - struct PACKED { - uint32_t src3 : 12; - uint32_t src3_c : 1; - uint32_t dummy : 3; - } c2; - }; - - /* dword1: */ - uint32_t dst : 8; - uint32_t repeat : 2; - uint32_t sat : 1; - uint32_t src1_r : 1; - uint32_t ss : 1; - uint32_t ul : 1; - uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */ - uint32_t src2 : 8; - uint32_t opc : 4; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; -} instr_cat3_t; - -static inline bool instr_cat3_full(instr_cat3_t *cat3) -{ - switch (_OPC(3, cat3->opc)) { - case OPC_MAD_F16: - case OPC_MAD_U16: - case OPC_MAD_S16: - case OPC_SEL_B16: - case OPC_SEL_S16: - case OPC_SEL_F16: - case OPC_SAD_S16: - case OPC_SAD_S32: // really?? 
- return false; - default: - return true; - } -} - -typedef struct PACKED { - /* dword0: */ - union PACKED { - struct PACKED { - uint32_t src : 11; - uint32_t must_be_zero1: 2; - uint32_t src_im : 1; /* immediate */ - uint32_t src_neg : 1; /* negate */ - uint32_t src_abs : 1; /* absolute value */ - }; - struct PACKED { - uint32_t src : 10; - uint32_t src_c : 1; /* relative-const */ - uint32_t src_rel : 1; /* relative address */ - uint32_t must_be_zero : 1; - uint32_t dummy : 3; - } rel; - struct PACKED { - uint32_t src : 12; - uint32_t src_c : 1; /* const */ - uint32_t dummy : 3; - } c; - }; - uint32_t dummy1 : 16; /* seem to be ignored */ - - /* dword1: */ - uint32_t dst : 8; - uint32_t repeat : 2; - uint32_t sat : 1; - uint32_t src_r : 1; - uint32_t ss : 1; - uint32_t ul : 1; - uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */ - uint32_t dummy2 : 5; /* seem to be ignored */ - uint32_t full : 1; /* not half */ - uint32_t opc : 6; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; -} instr_cat4_t; - -typedef struct PACKED { - /* dword0: */ - union PACKED { - /* normal case: */ - struct PACKED { - uint32_t full : 1; /* not half */ - uint32_t src1 : 8; - uint32_t src2 : 8; - uint32_t dummy1 : 4; /* seem to be ignored */ - uint32_t samp : 4; - uint32_t tex : 7; - } norm; - /* s2en case: */ - struct PACKED { - uint32_t full : 1; /* not half */ - uint32_t src1 : 8; - uint32_t src2 : 11; - uint32_t dummy1 : 1; - uint32_t src3 : 8; - uint32_t dummy2 : 3; - } s2en; - /* same in either case: */ - // XXX I think, confirm this - struct PACKED { - uint32_t full : 1; /* not half */ - uint32_t src1 : 8; - uint32_t pad : 23; - }; - }; - - /* dword1: */ - uint32_t dst : 8; - uint32_t wrmask : 4; /* write-mask */ - uint32_t type : 3; - uint32_t dummy2 : 1; /* seems to be ignored */ - uint32_t is_3d : 1; - - uint32_t is_a : 1; - uint32_t is_s : 1; - uint32_t is_s2en : 1; - uint32_t is_o : 1; - uint32_t is_p : 1; - - uint32_t opc : 5; - uint32_t 
jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; -} instr_cat5_t; - -/* dword0 encoding for src_off: [src1 + off], src2: */ -typedef struct PACKED { - /* dword0: */ - uint32_t mustbe1 : 1; - int32_t off : 13; - uint32_t src1 : 8; - uint32_t src1_im : 1; - uint32_t src2_im : 1; - uint32_t src2 : 8; - - /* dword1: */ - uint32_t dword1; -} instr_cat6a_t; - -/* dword0 encoding for !src_off: [src1], src2 */ -typedef struct PACKED { - /* dword0: */ - uint32_t mustbe0 : 1; - uint32_t src1 : 13; - uint32_t ignore0 : 8; - uint32_t src1_im : 1; - uint32_t src2_im : 1; - uint32_t src2 : 8; - - /* dword1: */ - uint32_t dword1; -} instr_cat6b_t; - -/* dword1 encoding for dst_off: */ -typedef struct PACKED { - /* dword0: */ - uint32_t dword0; - - /* note: there is some weird stuff going on where sometimes - * cat6->a.off is involved.. but that seems like a bug in - * the blob, since it is used even if !cat6->src_off - * It would make sense for there to be some more bits to - * bring us to 11 bits worth of offset, but not sure.. - */ - int32_t off : 8; - uint32_t mustbe1 : 1; - uint32_t dst : 8; - uint32_t pad1 : 15; -} instr_cat6c_t; - -/* dword1 encoding for !dst_off: */ -typedef struct PACKED { - /* dword0: */ - uint32_t dword0; - - uint32_t dst : 8; - uint32_t mustbe0 : 1; - uint32_t idx : 8; - uint32_t pad0 : 15; -} instr_cat6d_t; - -/* ldgb and atomics.. 
- * - * ldgb: pad0=0, pad3=1 - * atomic .g: pad0=1, pad3=1 - * .l: pad0=1, pad3=0 - */ -typedef struct PACKED { - /* dword0: */ - uint32_t pad0 : 1; - uint32_t src3 : 8; - uint32_t d : 2; - uint32_t typed : 1; - uint32_t type_size : 2; - uint32_t src1 : 8; - uint32_t src1_im : 1; - uint32_t src2_im : 1; - uint32_t src2 : 8; - - /* dword1: */ - uint32_t dst : 8; - uint32_t mustbe0 : 1; - uint32_t src_ssbo : 8; - uint32_t pad2 : 3; // type - uint32_t g : 1; - uint32_t pad3 : 1; - uint32_t pad4 : 10; // opc/jmp_tgt/sync/opc_cat -} instr_cat6ldgb_t; - -/* stgb, pad0=0, pad3=2 - */ -typedef struct PACKED { - /* dword0: */ - uint32_t mustbe1 : 1; // ??? - uint32_t src1 : 8; - uint32_t d : 2; - uint32_t typed : 1; - uint32_t type_size : 2; - uint32_t pad0 : 9; - uint32_t src2_im : 1; - uint32_t src2 : 8; - - /* dword1: */ - uint32_t src3 : 8; - uint32_t src3_im : 1; - uint32_t dst_ssbo : 8; - uint32_t pad2 : 3; // type - uint32_t pad3 : 2; - uint32_t pad4 : 10; // opc/jmp_tgt/sync/opc_cat -} instr_cat6stgb_t; - -typedef union PACKED { - instr_cat6a_t a; - instr_cat6b_t b; - instr_cat6c_t c; - instr_cat6d_t d; - instr_cat6ldgb_t ldgb; - instr_cat6stgb_t stgb; - struct PACKED { - /* dword0: */ - uint32_t src_off : 1; - uint32_t pad1 : 31; - - /* dword1: */ - uint32_t pad2 : 8; - uint32_t dst_off : 1; - uint32_t pad3 : 8; - uint32_t type : 3; - uint32_t g : 1; /* or in some cases it means dst immed */ - uint32_t pad4 : 1; - uint32_t opc : 5; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; - }; -} instr_cat6_t; - -typedef struct PACKED { - /* dword0: */ - uint32_t pad1 : 32; - - /* dword1: */ - uint32_t pad2 : 12; - uint32_t ss : 1; /* maybe in the encoding, but blob only uses (sy) */ - uint32_t pad3 : 6; - uint32_t w : 1; /* write */ - uint32_t r : 1; /* read */ - uint32_t l : 1; /* local */ - uint32_t g : 1; /* global */ - uint32_t opc : 4; /* presumed, but only a couple known OPCs */ - uint32_t jmp_tgt : 1; /* (jp) */ - uint32_t sync : 1; /* (sy) */ - 
uint32_t opc_cat : 3; -} instr_cat7_t; - -typedef union PACKED { - instr_cat0_t cat0; - instr_cat1_t cat1; - instr_cat2_t cat2; - instr_cat3_t cat3; - instr_cat4_t cat4; - instr_cat5_t cat5; - instr_cat6_t cat6; - instr_cat7_t cat7; - struct PACKED { - /* dword0: */ - uint32_t pad1 : 32; - - /* dword1: */ - uint32_t pad2 : 12; - uint32_t ss : 1; /* cat1-cat4 (cat0??) and cat7 (?) */ - uint32_t ul : 1; /* cat2-cat4 (and cat1 in blob.. which may be bug??) */ - uint32_t pad3 : 13; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; - - }; -} instr_t; - -static inline uint32_t instr_repeat(instr_t *instr) -{ - switch (instr->opc_cat) { - case 0: return instr->cat0.repeat; - case 1: return instr->cat1.repeat; - case 2: return instr->cat2.repeat; - case 3: return instr->cat3.repeat; - case 4: return instr->cat4.repeat; - default: return 0; - } -} - -static inline bool instr_sat(instr_t *instr) -{ - switch (instr->opc_cat) { - case 2: return instr->cat2.sat; - case 3: return instr->cat3.sat; - case 4: return instr->cat4.sat; - default: return false; - } -} - -static inline uint32_t instr_opc(instr_t *instr) -{ - switch (instr->opc_cat) { - case 0: return instr->cat0.opc; - case 1: return 0; - case 2: return instr->cat2.opc; - case 3: return instr->cat3.opc; - case 4: return instr->cat4.opc; - case 5: return instr->cat5.opc; - case 6: return instr->cat6.opc; - case 7: return instr->cat7.opc; - default: return 0; - } -} - -static inline bool is_mad(opc_t opc) -{ - switch (opc) { - case OPC_MAD_U16: - case OPC_MAD_S16: - case OPC_MAD_U24: - case OPC_MAD_S24: - case OPC_MAD_F16: - case OPC_MAD_F32: - return true; - default: - return false; - } -} - -static inline bool is_madsh(opc_t opc) -{ - switch (opc) { - case OPC_MADSH_U16: - case OPC_MADSH_M16: - return true; - default: - return false; - } -} - -static inline bool is_atomic(opc_t opc) -{ - switch (opc) { - case OPC_ATOMIC_ADD: - case OPC_ATOMIC_SUB: - case OPC_ATOMIC_XCHG: - case OPC_ATOMIC_INC: - case 
OPC_ATOMIC_DEC: - case OPC_ATOMIC_CMPXCHG: - case OPC_ATOMIC_MIN: - case OPC_ATOMIC_MAX: - case OPC_ATOMIC_AND: - case OPC_ATOMIC_OR: - case OPC_ATOMIC_XOR: - return true; - default: - return false; - } -} - -static inline bool is_ssbo(opc_t opc) -{ - switch (opc) { - case OPC_RESFMT: - case OPC_RESINFO: - case OPC_LDGB: - case OPC_STGB: - case OPC_STIB: - return true; - default: - return false; - } -} - -int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out); - -#endif /* INSTR_A3XX_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c deleted file mode 100644 index 3d1c4449b12..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3.c +++ /dev/null @@ -1,941 +0,0 @@ -/* - * Copyright (c) 2012 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "ir3.h" - -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <assert.h> -#include <stdbool.h> -#include <errno.h> - -#include "util/bitscan.h" -#include "util/ralloc.h" -#include "util/u_math.h" - -#include "instr-a3xx.h" - -/* simple allocator to carve allocations out of an up-front allocated heap, - * so that we can free everything easily in one shot. - */ -void * ir3_alloc(struct ir3 *shader, int sz) -{ - return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */ -} - -struct ir3 * ir3_create(struct ir3_compiler *compiler, - unsigned nin, unsigned nout) -{ - struct ir3 *shader = rzalloc(compiler, struct ir3); - - shader->compiler = compiler; - shader->ninputs = nin; - shader->inputs = ir3_alloc(shader, sizeof(shader->inputs[0]) * nin); - - shader->noutputs = nout; - shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout); - - list_inithead(&shader->block_list); - list_inithead(&shader->array_list); - - return shader; -} - -void ir3_destroy(struct ir3 *shader) -{ - ralloc_free(shader); -} - -#define iassert(cond) do { \ - if (!(cond)) { \ - debug_assert(cond); \ - return -1; \ - } } while (0) - -#define iassert_type(reg, full) do { \ - if ((full)) { \ - iassert(!((reg)->flags & IR3_REG_HALF)); \ - } else { \ - iassert((reg)->flags & IR3_REG_HALF); \ - } } while (0); - -static uint32_t reg(struct ir3_register *reg, struct ir3_info *info, - uint32_t repeat, uint32_t valid_flags) -{ - reg_t val = { .dummy32 = 0 }; - - if (reg->flags & ~valid_flags) { - debug_printf("INVALID FLAGS: %x vs %x\n", - reg->flags, valid_flags); - } - - if (!(reg->flags & IR3_REG_R)) - repeat = 0; - - if (reg->flags & IR3_REG_IMMED) { - val.iim_val = reg->iim_val; - } else { - unsigned components; - int16_t max; - - if (reg->flags & IR3_REG_RELATIV) { - components = reg->size; - val.idummy10 = reg->array.offset; - max = (reg->array.offset + repeat + components - 1) >> 2; - } else { - components = util_last_bit(reg->wrmask); - val.comp 
= reg->num & 0x3; - val.num = reg->num >> 2; - max = (reg->num + repeat + components - 1) >> 2; - } - - if (reg->flags & IR3_REG_CONST) { - info->max_const = MAX2(info->max_const, max); - } else if (val.num == 63) { - /* ignore writes to dummy register r63.x */ - } else if (max < 48) { - if (reg->flags & IR3_REG_HALF) { - if (info->gpu_id >= 600) { - /* starting w/ a6xx, half regs conflict with full regs: */ - info->max_reg = MAX2(info->max_reg, (max+1)/2); - } else { - info->max_half_reg = MAX2(info->max_half_reg, max); - } - } else { - info->max_reg = MAX2(info->max_reg, max); - } - } - } - - return val.dummy32; -} - -static int emit_cat0(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) -{ - instr_cat0_t *cat0 = ptr; - - if (info->gpu_id >= 500) { - cat0->a5xx.immed = instr->cat0.immed; - } else if (info->gpu_id >= 400) { - cat0->a4xx.immed = instr->cat0.immed; - } else { - cat0->a3xx.immed = instr->cat0.immed; - } - cat0->repeat = instr->repeat; - cat0->ss = !!(instr->flags & IR3_INSTR_SS); - cat0->inv = instr->cat0.inv; - cat0->comp = instr->cat0.comp; - cat0->opc = instr->opc; - cat0->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat0->sync = !!(instr->flags & IR3_INSTR_SY); - cat0->opc_cat = 0; - - return 0; -} - -static int emit_cat1(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) -{ - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src = instr->regs[1]; - instr_cat1_t *cat1 = ptr; - - iassert(instr->regs_count == 2); - iassert_type(dst, type_size(instr->cat1.dst_type) == 32); - if (!(src->flags & IR3_REG_IMMED)) - iassert_type(src, type_size(instr->cat1.src_type) == 32); - - if (src->flags & IR3_REG_IMMED) { - cat1->iim_val = src->iim_val; - cat1->src_im = 1; - } else if (src->flags & IR3_REG_RELATIV) { - cat1->off = reg(src, info, instr->repeat, - IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF | IR3_REG_RELATIV); - cat1->src_rel = 1; - cat1->src_rel_c = !!(src->flags & IR3_REG_CONST); - } else { - cat1->src = 
reg(src, info, instr->repeat, - IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF); - cat1->src_c = !!(src->flags & IR3_REG_CONST); - } - - cat1->dst = reg(dst, info, instr->repeat, - IR3_REG_RELATIV | IR3_REG_EVEN | - IR3_REG_R | IR3_REG_POS_INF | IR3_REG_HALF); - cat1->repeat = instr->repeat; - cat1->src_r = !!(src->flags & IR3_REG_R); - cat1->ss = !!(instr->flags & IR3_INSTR_SS); - cat1->ul = !!(instr->flags & IR3_INSTR_UL); - cat1->dst_type = instr->cat1.dst_type; - cat1->dst_rel = !!(dst->flags & IR3_REG_RELATIV); - cat1->src_type = instr->cat1.src_type; - cat1->even = !!(dst->flags & IR3_REG_EVEN); - cat1->pos_inf = !!(dst->flags & IR3_REG_POS_INF); - cat1->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat1->sync = !!(instr->flags & IR3_INSTR_SY); - cat1->opc_cat = 1; - - return 0; -} - -static int emit_cat2(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) -{ - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src1 = instr->regs[1]; - struct ir3_register *src2 = instr->regs[2]; - instr_cat2_t *cat2 = ptr; - unsigned absneg = ir3_cat2_absneg(instr->opc); - - iassert((instr->regs_count == 2) || (instr->regs_count == 3)); - - if (src1->flags & IR3_REG_RELATIV) { - iassert(src1->array.offset < (1 << 10)); - cat2->rel1.src1 = reg(src1, info, instr->repeat, - IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | - IR3_REG_HALF | absneg); - cat2->rel1.src1_c = !!(src1->flags & IR3_REG_CONST); - cat2->rel1.src1_rel = 1; - } else if (src1->flags & IR3_REG_CONST) { - iassert(src1->num < (1 << 12)); - cat2->c1.src1 = reg(src1, info, instr->repeat, - IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF); - cat2->c1.src1_c = 1; - } else { - iassert(src1->num < (1 << 11)); - cat2->src1 = reg(src1, info, instr->repeat, - IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF | - absneg); - } - cat2->src1_im = !!(src1->flags & IR3_REG_IMMED); - cat2->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)); - cat2->src1_abs = !!(src1->flags & (IR3_REG_FABS | 
IR3_REG_SABS)); - cat2->src1_r = !!(src1->flags & IR3_REG_R); - - if (src2) { - iassert((src2->flags & IR3_REG_IMMED) || - !((src1->flags ^ src2->flags) & IR3_REG_HALF)); - - if (src2->flags & IR3_REG_RELATIV) { - iassert(src2->array.offset < (1 << 10)); - cat2->rel2.src2 = reg(src2, info, instr->repeat, - IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | - IR3_REG_HALF | absneg); - cat2->rel2.src2_c = !!(src2->flags & IR3_REG_CONST); - cat2->rel2.src2_rel = 1; - } else if (src2->flags & IR3_REG_CONST) { - iassert(src2->num < (1 << 12)); - cat2->c2.src2 = reg(src2, info, instr->repeat, - IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF); - cat2->c2.src2_c = 1; - } else { - iassert(src2->num < (1 << 11)); - cat2->src2 = reg(src2, info, instr->repeat, - IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF | - absneg); - } - - cat2->src2_im = !!(src2->flags & IR3_REG_IMMED); - cat2->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)); - cat2->src2_abs = !!(src2->flags & (IR3_REG_FABS | IR3_REG_SABS)); - cat2->src2_r = !!(src2->flags & IR3_REG_R); - } - - cat2->dst = reg(dst, info, instr->repeat, - IR3_REG_R | IR3_REG_EI | IR3_REG_HALF); - cat2->repeat = instr->repeat; - cat2->sat = !!(instr->flags & IR3_INSTR_SAT); - cat2->ss = !!(instr->flags & IR3_INSTR_SS); - cat2->ul = !!(instr->flags & IR3_INSTR_UL); - cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF); - cat2->ei = !!(dst->flags & IR3_REG_EI); - cat2->cond = instr->cat2.condition; - cat2->full = ! 
(src1->flags & IR3_REG_HALF); - cat2->opc = instr->opc; - cat2->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat2->sync = !!(instr->flags & IR3_INSTR_SY); - cat2->opc_cat = 2; - - return 0; -} - -static int emit_cat3(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) -{ - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src1 = instr->regs[1]; - struct ir3_register *src2 = instr->regs[2]; - struct ir3_register *src3 = instr->regs[3]; - unsigned absneg = ir3_cat3_absneg(instr->opc); - instr_cat3_t *cat3 = ptr; - uint32_t src_flags = 0; - - switch (instr->opc) { - case OPC_MAD_F16: - case OPC_MAD_U16: - case OPC_MAD_S16: - case OPC_SEL_B16: - case OPC_SEL_S16: - case OPC_SEL_F16: - case OPC_SAD_S16: - case OPC_SAD_S32: // really?? - src_flags |= IR3_REG_HALF; - break; - default: - break; - } - - iassert(instr->regs_count == 4); - iassert(!((src1->flags ^ src_flags) & IR3_REG_HALF)); - iassert(!((src2->flags ^ src_flags) & IR3_REG_HALF)); - iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF)); - - if (src1->flags & IR3_REG_RELATIV) { - iassert(src1->array.offset < (1 << 10)); - cat3->rel1.src1 = reg(src1, info, instr->repeat, - IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | - IR3_REG_HALF | absneg); - cat3->rel1.src1_c = !!(src1->flags & IR3_REG_CONST); - cat3->rel1.src1_rel = 1; - } else if (src1->flags & IR3_REG_CONST) { - iassert(src1->num < (1 << 12)); - cat3->c1.src1 = reg(src1, info, instr->repeat, - IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF); - cat3->c1.src1_c = 1; - } else { - iassert(src1->num < (1 << 11)); - cat3->src1 = reg(src1, info, instr->repeat, - IR3_REG_R | IR3_REG_HALF | absneg); - } - - cat3->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)); - cat3->src1_r = !!(src1->flags & IR3_REG_R); - - cat3->src2 = reg(src2, info, instr->repeat, - IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg); - cat3->src2_c = !!(src2->flags & IR3_REG_CONST); - cat3->src2_neg = !!(src2->flags & (IR3_REG_FNEG | 
IR3_REG_SNEG | IR3_REG_BNOT)); - cat3->src2_r = !!(src2->flags & IR3_REG_R); - - - if (src3->flags & IR3_REG_RELATIV) { - iassert(src3->array.offset < (1 << 10)); - cat3->rel2.src3 = reg(src3, info, instr->repeat, - IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | - IR3_REG_HALF | absneg); - cat3->rel2.src3_c = !!(src3->flags & IR3_REG_CONST); - cat3->rel2.src3_rel = 1; - } else if (src3->flags & IR3_REG_CONST) { - iassert(src3->num < (1 << 12)); - cat3->c2.src3 = reg(src3, info, instr->repeat, - IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF); - cat3->c2.src3_c = 1; - } else { - iassert(src3->num < (1 << 11)); - cat3->src3 = reg(src3, info, instr->repeat, - IR3_REG_R | IR3_REG_HALF | absneg); - } - - cat3->src3_neg = !!(src3->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)); - cat3->src3_r = !!(src3->flags & IR3_REG_R); - - cat3->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); - cat3->repeat = instr->repeat; - cat3->sat = !!(instr->flags & IR3_INSTR_SAT); - cat3->ss = !!(instr->flags & IR3_INSTR_SS); - cat3->ul = !!(instr->flags & IR3_INSTR_UL); - cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF); - cat3->opc = instr->opc; - cat3->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat3->sync = !!(instr->flags & IR3_INSTR_SY); - cat3->opc_cat = 3; - - return 0; -} - -static int emit_cat4(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) -{ - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src = instr->regs[1]; - instr_cat4_t *cat4 = ptr; - - iassert(instr->regs_count == 2); - - if (src->flags & IR3_REG_RELATIV) { - iassert(src->array.offset < (1 << 10)); - cat4->rel.src = reg(src, info, instr->repeat, - IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG | - IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF); - cat4->rel.src_c = !!(src->flags & IR3_REG_CONST); - cat4->rel.src_rel = 1; - } else if (src->flags & IR3_REG_CONST) { - iassert(src->num < (1 << 12)); - cat4->c.src = reg(src, info, instr->repeat, - IR3_REG_CONST | 
IR3_REG_FNEG | IR3_REG_FABS | - IR3_REG_R | IR3_REG_HALF); - cat4->c.src_c = 1; - } else { - iassert(src->num < (1 << 11)); - cat4->src = reg(src, info, instr->repeat, - IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS | - IR3_REG_R | IR3_REG_HALF); - } - - cat4->src_im = !!(src->flags & IR3_REG_IMMED); - cat4->src_neg = !!(src->flags & IR3_REG_FNEG); - cat4->src_abs = !!(src->flags & IR3_REG_FABS); - cat4->src_r = !!(src->flags & IR3_REG_R); - - cat4->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); - cat4->repeat = instr->repeat; - cat4->sat = !!(instr->flags & IR3_INSTR_SAT); - cat4->ss = !!(instr->flags & IR3_INSTR_SS); - cat4->ul = !!(instr->flags & IR3_INSTR_UL); - cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF); - cat4->full = ! (src->flags & IR3_REG_HALF); - cat4->opc = instr->opc; - cat4->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat4->sync = !!(instr->flags & IR3_INSTR_SY); - cat4->opc_cat = 4; - - return 0; -} - -static int emit_cat5(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) -{ - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src1 = instr->regs[1]; - struct ir3_register *src2 = instr->regs[2]; - struct ir3_register *src3 = instr->regs[3]; - instr_cat5_t *cat5 = ptr; - - iassert_type(dst, type_size(instr->cat5.type) == 32) - - assume(src1 || !src2); - assume(src2 || !src3); - - if (src1) { - cat5->full = ! 
(src1->flags & IR3_REG_HALF); - cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF); - } - - if (instr->flags & IR3_INSTR_S2EN) { - if (src2) { - iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); - cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); - } - if (src3) { - iassert(src3->flags & IR3_REG_HALF); - cat5->s2en.src3 = reg(src3, info, instr->repeat, IR3_REG_HALF); - } - iassert(!(instr->cat5.samp | instr->cat5.tex)); - } else { - iassert(!src3); - if (src2) { - iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); - cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); - } - cat5->norm.samp = instr->cat5.samp; - cat5->norm.tex = instr->cat5.tex; - } - - cat5->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); - cat5->wrmask = dst->wrmask; - cat5->type = instr->cat5.type; - cat5->is_3d = !!(instr->flags & IR3_INSTR_3D); - cat5->is_a = !!(instr->flags & IR3_INSTR_A); - cat5->is_s = !!(instr->flags & IR3_INSTR_S); - cat5->is_s2en = !!(instr->flags & IR3_INSTR_S2EN); - cat5->is_o = !!(instr->flags & IR3_INSTR_O); - cat5->is_p = !!(instr->flags & IR3_INSTR_P); - cat5->opc = instr->opc; - cat5->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat5->sync = !!(instr->flags & IR3_INSTR_SY); - cat5->opc_cat = 5; - - return 0; -} - -static int emit_cat6(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) -{ - struct ir3_register *dst, *src1, *src2; - instr_cat6_t *cat6 = ptr; - bool type_full = type_size(instr->cat6.type) == 32; - - cat6->type = instr->cat6.type; - cat6->opc = instr->opc; - cat6->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat6->sync = !!(instr->flags & IR3_INSTR_SY); - cat6->g = !!(instr->flags & IR3_INSTR_G); - cat6->opc_cat = 6; - - switch (instr->opc) { - case OPC_RESINFO: - case OPC_RESFMT: - iassert_type(instr->regs[0], type_full); /* dst */ - iassert_type(instr->regs[1], type_full); /* src1 */ - break; - case OPC_L2G: - case OPC_G2L: - iassert_type(instr->regs[0], true); /* dst */ - 
iassert_type(instr->regs[1], true); /* src1 */ - break; - case OPC_STG: - case OPC_STL: - case OPC_STP: - case OPC_STI: - case OPC_STLW: - case OPC_STIB: - /* no dst, so regs[0] is dummy */ - iassert_type(instr->regs[1], true); /* dst */ - iassert_type(instr->regs[2], type_full); /* src1 */ - iassert_type(instr->regs[3], true); /* src2 */ - break; - default: - iassert_type(instr->regs[0], type_full); /* dst */ - iassert_type(instr->regs[1], true); /* src1 */ - if (instr->regs_count > 2) - iassert_type(instr->regs[2], true); /* src1 */ - break; - } - - /* the "dst" for a store instruction is (from the perspective - * of data flow in the shader, ie. register use/def, etc) in - * fact a register that is read by the instruction, rather - * than written: - */ - if (is_store(instr)) { - iassert(instr->regs_count >= 3); - - dst = instr->regs[1]; - src1 = instr->regs[2]; - src2 = (instr->regs_count >= 4) ? instr->regs[3] : NULL; - } else { - iassert(instr->regs_count >= 2); - - dst = instr->regs[0]; - src1 = instr->regs[1]; - src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL; - } - - /* TODO we need a more comprehensive list about which instructions - * can be encoded which way. Or possibly use IR3_INSTR_0 flag to - * indicate to use the src_off encoding even if offset is zero - * (but then what to do about dst_off?) - */ - if (is_atomic(instr->opc)) { - instr_cat6ldgb_t *ldgb = ptr; - - /* maybe these two bits both determine the instruction encoding? 
*/ - cat6->src_off = false; - - ldgb->d = instr->cat6.d - 1; - ldgb->typed = instr->cat6.typed; - ldgb->type_size = instr->cat6.iim_val - 1; - - ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); - - if (ldgb->g) { - struct ir3_register *src3 = instr->regs[3]; - struct ir3_register *src4 = instr->regs[4]; - - /* first src is src_ssbo: */ - iassert(src1->flags & IR3_REG_IMMED); - ldgb->src_ssbo = src1->uim_val; - - ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED); - ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED); - ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED); - ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED); - - ldgb->src3 = reg(src4, info, instr->repeat, 0); - ldgb->pad0 = 0x1; - ldgb->pad3 = 0x1; - } else { - ldgb->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED); - ldgb->src1_im = !!(src1->flags & IR3_REG_IMMED); - ldgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); - ldgb->src2_im = !!(src2->flags & IR3_REG_IMMED); - ldgb->pad0 = 0x1; - ldgb->pad3 = 0x0; - } - - return 0; - } else if (instr->opc == OPC_LDGB) { - struct ir3_register *src3 = instr->regs[3]; - instr_cat6ldgb_t *ldgb = ptr; - - /* maybe these two bits both determine the instruction encoding? 
*/ - cat6->src_off = false; - - ldgb->d = instr->cat6.d - 1; - ldgb->typed = instr->cat6.typed; - ldgb->type_size = instr->cat6.iim_val - 1; - - ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); - - /* first src is src_ssbo: */ - iassert(src1->flags & IR3_REG_IMMED); - ldgb->src_ssbo = src1->uim_val; - - /* then next two are src1/src2: */ - ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED); - ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED); - ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED); - ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED); - - ldgb->pad0 = 0x0; - ldgb->pad3 = 0x1; - - return 0; - } else if (instr->opc == OPC_RESINFO) { - instr_cat6ldgb_t *ldgb = ptr; - - ldgb->d = instr->cat6.d - 1; - - ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); - - /* first src is src_ssbo: */ - iassert(src1->flags & IR3_REG_IMMED); - ldgb->src_ssbo = src1->uim_val; - - return 0; - } else if ((instr->opc == OPC_STGB) || (instr->opc == OPC_STIB)) { - struct ir3_register *src3 = instr->regs[4]; - instr_cat6stgb_t *stgb = ptr; - - /* maybe these two bits both determine the instruction encoding? 
*/ - cat6->src_off = true; - stgb->pad3 = 0x2; - - stgb->d = instr->cat6.d - 1; - stgb->typed = instr->cat6.typed; - stgb->type_size = instr->cat6.iim_val - 1; - - /* first src is dst_ssbo: */ - iassert(dst->flags & IR3_REG_IMMED); - stgb->dst_ssbo = dst->uim_val; - - /* then src1/src2/src3: */ - stgb->src1 = reg(src1, info, instr->repeat, 0); - stgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); - stgb->src2_im = !!(src2->flags & IR3_REG_IMMED); - stgb->src3 = reg(src3, info, instr->repeat, IR3_REG_IMMED); - stgb->src3_im = !!(src3->flags & IR3_REG_IMMED); - - return 0; - } else if (instr->cat6.src_offset || (instr->opc == OPC_LDG) || - (instr->opc == OPC_LDL)) { - instr_cat6a_t *cat6a = ptr; - - cat6->src_off = true; - - cat6a->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED); - cat6a->src1_im = !!(src1->flags & IR3_REG_IMMED); - if (src2) { - cat6a->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); - cat6a->src2_im = !!(src2->flags & IR3_REG_IMMED); - } - cat6a->off = instr->cat6.src_offset; - } else { - instr_cat6b_t *cat6b = ptr; - - cat6->src_off = false; - - cat6b->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED | IR3_REG_HALF); - cat6b->src1_im = !!(src1->flags & IR3_REG_IMMED); - if (src2) { - cat6b->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); - cat6b->src2_im = !!(src2->flags & IR3_REG_IMMED); - } - } - - if (instr->cat6.dst_offset || (instr->opc == OPC_STG) || - (instr->opc == OPC_STL)) { - instr_cat6c_t *cat6c = ptr; - cat6->dst_off = true; - cat6c->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); - cat6c->off = instr->cat6.dst_offset; - } else { - instr_cat6d_t *cat6d = ptr; - cat6->dst_off = false; - cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); - } - - return 0; -} - -static int emit_cat7(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) -{ - instr_cat7_t *cat7 = ptr; - - cat7->ss = !!(instr->flags & IR3_INSTR_SS); - cat7->w = instr->cat7.w; - cat7->r = 
instr->cat7.r; - cat7->l = instr->cat7.l; - cat7->g = instr->cat7.g; - cat7->opc = instr->opc; - cat7->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat7->sync = !!(instr->flags & IR3_INSTR_SY); - cat7->opc_cat = 7; - - return 0; -} - -static int (*emit[])(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) = { - emit_cat0, emit_cat1, emit_cat2, emit_cat3, emit_cat4, emit_cat5, emit_cat6, - emit_cat7, -}; - -void * ir3_assemble(struct ir3 *shader, struct ir3_info *info, - uint32_t gpu_id) -{ - uint32_t *ptr, *dwords; - - info->gpu_id = gpu_id; - info->max_reg = -1; - info->max_half_reg = -1; - info->max_const = -1; - info->instrs_count = 0; - info->sizedwords = 0; - info->ss = info->sy = 0; - - list_for_each_entry (struct ir3_block, block, &shader->block_list, node) { - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - info->sizedwords += 2; - } - } - - /* need an integer number of instruction "groups" (sets of 16 - * instructions on a4xx or sets of 4 instructions on a3xx), - * so pad out w/ NOPs if needed: (NOTE each instruction is 64bits) - */ - if (gpu_id >= 400) { - info->sizedwords = align(info->sizedwords, 16 * 2); - } else { - info->sizedwords = align(info->sizedwords, 4 * 2); - } - - ptr = dwords = calloc(4, info->sizedwords); - - list_for_each_entry (struct ir3_block, block, &shader->block_list, node) { - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - int ret = emit[opc_cat(instr->opc)](instr, dwords, info); - if (ret) - goto fail; - info->instrs_count += 1 + instr->repeat; - dwords += 2; - - if (instr->flags & IR3_INSTR_SS) - info->ss++; - - if (instr->flags & IR3_INSTR_SY) - info->sy++; - } - } - - return ptr; - -fail: - free(ptr); - return NULL; -} - -static struct ir3_register * reg_create(struct ir3 *shader, - int num, int flags) -{ - struct ir3_register *reg = - ir3_alloc(shader, sizeof(struct ir3_register)); - reg->wrmask = 1; - reg->flags = flags; - reg->num = num; - 
return reg; -} - -static void insert_instr(struct ir3_block *block, - struct ir3_instruction *instr) -{ - struct ir3 *shader = block->shader; -#ifdef DEBUG - instr->serialno = ++shader->instr_count; -#endif - list_addtail(&instr->node, &block->instr_list); - - if (is_input(instr)) - array_insert(shader, shader->baryfs, instr); -} - -struct ir3_block * ir3_block_create(struct ir3 *shader) -{ - struct ir3_block *block = ir3_alloc(shader, sizeof(*block)); -#ifdef DEBUG - block->serialno = ++shader->block_count; -#endif - block->shader = shader; - list_inithead(&block->node); - list_inithead(&block->instr_list); - return block; -} - -static struct ir3_instruction *instr_create(struct ir3_block *block, int nreg) -{ - struct ir3_instruction *instr; - unsigned sz = sizeof(*instr) + (nreg * sizeof(instr->regs[0])); - char *ptr = ir3_alloc(block->shader, sz); - - instr = (struct ir3_instruction *)ptr; - ptr += sizeof(*instr); - instr->regs = (struct ir3_register **)ptr; - -#ifdef DEBUG - instr->regs_max = nreg; -#endif - - return instr; -} - -struct ir3_instruction * ir3_instr_create2(struct ir3_block *block, - opc_t opc, int nreg) -{ - struct ir3_instruction *instr = instr_create(block, nreg); - instr->block = block; - instr->opc = opc; - insert_instr(block, instr); - return instr; -} - -struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc) -{ - /* NOTE: we could be slightly more clever, at least for non-meta, - * and choose # of regs based on category. 
- */ - return ir3_instr_create2(block, opc, 4); -} - -struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr) -{ - struct ir3_instruction *new_instr = instr_create(instr->block, - instr->regs_count); - struct ir3_register **regs; - unsigned i; - - regs = new_instr->regs; - *new_instr = *instr; - new_instr->regs = regs; - - insert_instr(instr->block, new_instr); - - /* clone registers: */ - new_instr->regs_count = 0; - for (i = 0; i < instr->regs_count; i++) { - struct ir3_register *reg = instr->regs[i]; - struct ir3_register *new_reg = - ir3_reg_create(new_instr, reg->num, reg->flags); - *new_reg = *reg; - } - - return new_instr; -} - -/* Add a false dependency to instruction, to ensure it is scheduled first: */ -void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep) -{ - array_insert(instr, instr->deps, dep); -} - -struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, - int num, int flags) -{ - struct ir3 *shader = instr->block->shader; - struct ir3_register *reg = reg_create(shader, num, flags); -#ifdef DEBUG - debug_assert(instr->regs_count < instr->regs_max); -#endif - instr->regs[instr->regs_count++] = reg; - return reg; -} - -struct ir3_register * ir3_reg_clone(struct ir3 *shader, - struct ir3_register *reg) -{ - struct ir3_register *new_reg = reg_create(shader, 0, 0); - *new_reg = *reg; - return new_reg; -} - -void -ir3_instr_set_address(struct ir3_instruction *instr, - struct ir3_instruction *addr) -{ - if (instr->address != addr) { - struct ir3 *ir = instr->block->shader; - instr->address = addr; - array_insert(ir, ir->indirects, instr); - } -} - -void -ir3_block_clear_mark(struct ir3_block *block) -{ - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) - instr->flags &= ~IR3_INSTR_MARK; -} - -void -ir3_clear_mark(struct ir3 *ir) -{ - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - ir3_block_clear_mark(block); - } -} - -/* note: this will destroy 
instr->depth, don't do it until after sched! */ -unsigned -ir3_count_instructions(struct ir3 *ir) -{ - unsigned cnt = 0; - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - instr->ip = cnt++; - } - block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip; - block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip; - } - return cnt; -} - -struct ir3_array * -ir3_lookup_array(struct ir3 *ir, unsigned id) -{ - list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) - if (arr->id == id) - return arr; - return NULL; -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h deleted file mode 100644 index ea3218828df..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3.h +++ /dev/null @@ -1,1394 +0,0 @@ -/* - * Copyright (c) 2013 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef IR3_H_ -#define IR3_H_ - -#include <stdint.h> -#include <stdbool.h> - -#include "compiler/shader_enums.h" - -#include "util/u_debug.h" -#include "util/list.h" - -#include "instr-a3xx.h" - -/* low level intermediate representation of an adreno shader program */ - -struct ir3_compiler; -struct ir3; -struct ir3_instruction; -struct ir3_block; - -struct ir3_info { - uint32_t gpu_id; - uint16_t sizedwords; - uint16_t instrs_count; /* expanded to account for rpt's */ - /* NOTE: max_reg, etc, does not include registers not touched - * by the shader (ie. vertex fetched via VFD_DECODE but not - * touched by shader) - */ - int8_t max_reg; /* highest GPR # used by shader */ - int8_t max_half_reg; - int16_t max_const; - - /* number of sync bits: */ - uint16_t ss, sy; -}; - -struct ir3_register { - enum { - IR3_REG_CONST = 0x001, - IR3_REG_IMMED = 0x002, - IR3_REG_HALF = 0x004, - /* high registers are used for some things in compute shaders, - * for example. Seems to be for things that are global to all - * threads in a wave, so possibly these are global/shared by - * all the threads in the wave? - */ - IR3_REG_HIGH = 0x008, - IR3_REG_RELATIV= 0x010, - IR3_REG_R = 0x020, - /* Most instructions, it seems, can do float abs/neg but not - * integer. The CP pass needs to know what is intended (int or - * float) in order to do the right thing. For this reason the - * abs/neg flags are split out into float and int variants. In - * addition, .b (bitwise) operations, the negate is actually a - * bitwise not, so split that out into a new flag to make it - * more clear. 
- */ - IR3_REG_FNEG = 0x040, - IR3_REG_FABS = 0x080, - IR3_REG_SNEG = 0x100, - IR3_REG_SABS = 0x200, - IR3_REG_BNOT = 0x400, - IR3_REG_EVEN = 0x800, - IR3_REG_POS_INF= 0x1000, - /* (ei) flag, end-input? Set on last bary, presumably to signal - * that the shader needs no more input: - */ - IR3_REG_EI = 0x2000, - /* meta-flags, for intermediate stages of IR, ie. - * before register assignment is done: - */ - IR3_REG_SSA = 0x4000, /* 'instr' is ptr to assigning instr */ - IR3_REG_ARRAY = 0x8000, - - } flags; - - /* normal registers: - * the component is in the low two bits of the reg #, so - * rN.x becomes: (N << 2) | x - */ - int num; - union { - /* immediate: */ - int32_t iim_val; - uint32_t uim_val; - float fim_val; - /* relative: */ - struct { - uint16_t id; - int16_t offset; - } array; - }; - - /* For IR3_REG_SSA, src registers contain ptr back to assigning - * instruction. - * - * For IR3_REG_ARRAY, the pointer is back to the last dependent - * array access (although the net effect is the same, it points - * back to a previous instruction that we depend on). - */ - struct ir3_instruction *instr; - - union { - /* used for cat5 instructions, but also for internal/IR level - * tracking of what registers are read/written by an instruction. - * wrmask may be a bad name since it is used to represent both - * src and dst that touch multiple adjacent registers. - */ - unsigned wrmask; - /* for relative addressing, 32bits for array size is too small, - * but otoh we don't need to deal with disjoint sets, so instead - * use a simple size field (number of scalar components). 
- */ - unsigned size; - }; -}; - -/* - * Stupid/simple growable array implementation: - */ -#define DECLARE_ARRAY(type, name) \ - unsigned name ## _count, name ## _sz; \ - type * name; - -#define array_insert(ctx, arr, val) do { \ - if (arr ## _count == arr ## _sz) { \ - arr ## _sz = MAX2(2 * arr ## _sz, 16); \ - arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \ - } \ - arr[arr ##_count++] = val; \ - } while (0) - -struct ir3_instruction { - struct ir3_block *block; - opc_t opc; - enum { - /* (sy) flag is set on first instruction, and after sample - * instructions (probably just on RAW hazard). - */ - IR3_INSTR_SY = 0x001, - /* (ss) flag is set on first instruction, and first instruction - * to depend on the result of "long" instructions (RAW hazard): - * - * rcp, rsq, log2, exp2, sin, cos, sqrt - * - * It seems to synchronize until all in-flight instructions are - * completed, for example: - * - * rsq hr1.w, hr1.w - * add.f hr2.z, (neg)hr2.z, hc0.y - * mul.f hr2.w, (neg)hr2.y, (neg)hr2.y - * rsq hr2.x, hr2.x - * (rpt1)nop - * mad.f16 hr2.w, hr2.z, hr2.z, hr2.w - * nop - * mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w - * (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w - * (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x - * - * The last mul.f does not have (ss) set, presumably because the - * (ss) on the previous instruction does the job. - * - * The blob driver also seems to set it on WAR hazards, although - * not really clear if this is needed or just blob compiler being - * sloppy. 
So far I haven't found a case where removing the (ss) - * causes problems for WAR hazard, but I could just be getting - * lucky: - * - * rcp r1.y, r3.y - * (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z - * - */ - IR3_INSTR_SS = 0x002, - /* (jp) flag is set on jump targets: - */ - IR3_INSTR_JP = 0x004, - IR3_INSTR_UL = 0x008, - IR3_INSTR_3D = 0x010, - IR3_INSTR_A = 0x020, - IR3_INSTR_O = 0x040, - IR3_INSTR_P = 0x080, - IR3_INSTR_S = 0x100, - IR3_INSTR_S2EN = 0x200, - IR3_INSTR_G = 0x400, - IR3_INSTR_SAT = 0x800, - /* meta-flags, for intermediate stages of IR, ie. - * before register assignment is done: - */ - IR3_INSTR_MARK = 0x1000, - IR3_INSTR_UNUSED= 0x2000, - } flags; - int repeat; -#ifdef DEBUG - unsigned regs_max; -#endif - unsigned regs_count; - struct ir3_register **regs; - union { - struct { - char inv; - char comp; - int immed; - struct ir3_block *target; - } cat0; - struct { - type_t src_type, dst_type; - } cat1; - struct { - enum { - IR3_COND_LT = 0, - IR3_COND_LE = 1, - IR3_COND_GT = 2, - IR3_COND_GE = 3, - IR3_COND_EQ = 4, - IR3_COND_NE = 5, - } condition; - } cat2; - struct { - unsigned samp, tex; - type_t type; - } cat5; - struct { - type_t type; - int src_offset; - int dst_offset; - int iim_val : 3; /* for ldgb/stgb, # of components */ - int d : 3; - bool typed : 1; - } cat6; - struct { - unsigned w : 1; /* write */ - unsigned r : 1; /* read */ - unsigned l : 1; /* local */ - unsigned g : 1; /* global */ - } cat7; - /* for meta-instructions, just used to hold extra data - * before instruction scheduling, etc - */ - struct { - int off; /* component/offset */ - } fo; - struct { - struct ir3_block *block; - } inout; - }; - - /* transient values used during various algorithms: */ - union { - /* The instruction depth is the max dependency distance to output. - * - * You can also think of it as the "cost", if we did any sort of - * optimization for register footprint. Ie. 
a value that is just - * result of moving a const to a reg would have a low cost, so to - * it could make sense to duplicate the instruction at various - * points where the result is needed to reduce register footprint. - */ - unsigned depth; - /* When we get to the RA stage, we no longer need depth, but - * we do need instruction's position/name: - */ - struct { - uint16_t ip; - uint16_t name; - }; - }; - - /* used for per-pass extra instruction data. - */ - void *data; - - /* Used during CP and RA stages. For fanin and shader inputs/ - * outputs where we need a sequence of consecutive registers, - * keep track of each src instructions left (ie 'n-1') and right - * (ie 'n+1') neighbor. The front-end must insert enough mov's - * to ensure that each instruction has at most one left and at - * most one right neighbor. During the copy-propagation pass, - * we only remove mov's when we can preserve this constraint. - * And during the RA stage, we use the neighbor information to - * allocate a block of registers in one shot. - * - * TODO: maybe just add something like: - * struct ir3_instruction_ref { - * struct ir3_instruction *instr; - * unsigned cnt; - * } - * - * Or can we get away without the refcnt stuff? It seems like - * it should be overkill.. the problem is if, potentially after - * already eliminating some mov's, if you have a single mov that - * needs to be grouped with it's neighbors in two different - * places (ex. shader output and a fanin). - */ - struct { - struct ir3_instruction *left, *right; - uint16_t left_cnt, right_cnt; - } cp; - - /* an instruction can reference at most one address register amongst - * it's src/dst registers. Beyond that, you need to insert mov's. - * - * NOTE: do not write this directly, use ir3_instr_set_address() - */ - struct ir3_instruction *address; - - /* Tracking for additional dependent instructions. Used to handle - * barriers, WAR hazards for arrays/SSBOs/etc. 
- */ - DECLARE_ARRAY(struct ir3_instruction *, deps); - - /* - * From PoV of instruction scheduling, not execution (ie. ignores global/ - * local distinction): - * shared image atomic SSBO everything - * barrier()/ - R/W R/W R/W R/W X - * groupMemoryBarrier() - * memoryBarrier() - R/W R/W - * (but only images declared coherent?) - * memoryBarrierAtomic() - R/W - * memoryBarrierBuffer() - R/W - * memoryBarrierImage() - R/W - * memoryBarrierShared() - R/W - * - * TODO I think for SSBO/image/shared, in cases where we can determine - * which variable is accessed, we don't need to care about accesses to - * different variables (unless declared coherent??) - */ - enum { - IR3_BARRIER_EVERYTHING = 1 << 0, - IR3_BARRIER_SHARED_R = 1 << 1, - IR3_BARRIER_SHARED_W = 1 << 2, - IR3_BARRIER_IMAGE_R = 1 << 3, - IR3_BARRIER_IMAGE_W = 1 << 4, - IR3_BARRIER_BUFFER_R = 1 << 5, - IR3_BARRIER_BUFFER_W = 1 << 6, - IR3_BARRIER_ARRAY_R = 1 << 7, - IR3_BARRIER_ARRAY_W = 1 << 8, - } barrier_class, barrier_conflict; - - /* Entry in ir3_block's instruction list: */ - struct list_head node; - - int use_count; /* currently just updated/used by cp */ - -#ifdef DEBUG - uint32_t serialno; -#endif -}; - -static inline struct ir3_instruction * -ir3_neighbor_first(struct ir3_instruction *instr) -{ - int cnt = 0; - while (instr->cp.left) { - instr = instr->cp.left; - if (++cnt > 0xffff) { - debug_assert(0); - break; - } - } - return instr; -} - -static inline int ir3_neighbor_count(struct ir3_instruction *instr) -{ - int num = 1; - - debug_assert(!instr->cp.left); - - while (instr->cp.right) { - num++; - instr = instr->cp.right; - if (num > 0xffff) { - debug_assert(0); - break; - } - } - - return num; -} - -struct ir3 { - struct ir3_compiler *compiler; - - unsigned ninputs, noutputs; - struct ir3_instruction **inputs; - struct ir3_instruction **outputs; - - /* Track bary.f (and ldlv) instructions.. 
this is needed in - * scheduling to ensure that all varying fetches happen before - * any potential kill instructions. The hw gets grumpy if all - * threads in a group are killed before the last bary.f gets - * a chance to signal end of input (ei). - */ - DECLARE_ARRAY(struct ir3_instruction *, baryfs); - - /* Track all indirect instructions (read and write). To avoid - * deadlock scenario where an address register gets scheduled, - * but other dependent src instructions cannot be scheduled due - * to dependency on a *different* address register value, the - * scheduler needs to ensure that all dependencies other than - * the instruction other than the address register are scheduled - * before the one that writes the address register. Having a - * convenient list of instructions that reference some address - * register simplifies this. - */ - DECLARE_ARRAY(struct ir3_instruction *, indirects); - - /* and same for instructions that consume predicate register: */ - DECLARE_ARRAY(struct ir3_instruction *, predicates); - - /* Track texture sample instructions which need texture state - * patched in (for astc-srgb workaround): - */ - DECLARE_ARRAY(struct ir3_instruction *, astc_srgb); - - /* List of blocks: */ - struct list_head block_list; - - /* List of ir3_array's: */ - struct list_head array_list; - -#ifdef DEBUG - unsigned block_count, instr_count; -#endif -}; - -struct ir3_array { - struct list_head node; - unsigned length; - unsigned id; - - struct nir_register *r; - - /* To avoid array write's from getting DCE'd, keep track of the - * most recent write. Any array access depends on the most - * recent write. This way, nothing depends on writes after the - * last read. 
But all the writes that happen before that have - * something depending on them - */ - struct ir3_instruction *last_write; - - /* extra stuff used in RA pass: */ - unsigned base; /* base vreg name */ - unsigned reg; /* base physical reg */ - uint16_t start_ip, end_ip; -}; - -struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id); - -struct ir3_block { - struct list_head node; - struct ir3 *shader; - - const struct nir_block *nblock; - - struct list_head instr_list; /* list of ir3_instruction */ - - /* each block has either one or two successors.. in case of - * two successors, 'condition' decides which one to follow. - * A block preceding an if/else has two successors. - */ - struct ir3_instruction *condition; - struct ir3_block *successors[2]; - - unsigned predecessors_count; - struct ir3_block **predecessors; - - uint16_t start_ip, end_ip; - - /* Track instructions which do not write a register but other- - * wise must not be discarded (such as kill, stg, etc) - */ - DECLARE_ARRAY(struct ir3_instruction *, keeps); - - /* used for per-pass extra block data. Mainly used right - * now in RA step to track livein/liveout. 
- */ - void *data; - -#ifdef DEBUG - uint32_t serialno; -#endif -}; - -static inline uint32_t -block_id(struct ir3_block *block) -{ -#ifdef DEBUG - return block->serialno; -#else - return (uint32_t)(unsigned long)block; -#endif -} - -struct ir3 * ir3_create(struct ir3_compiler *compiler, - unsigned nin, unsigned nout); -void ir3_destroy(struct ir3 *shader); -void * ir3_assemble(struct ir3 *shader, - struct ir3_info *info, uint32_t gpu_id); -void * ir3_alloc(struct ir3 *shader, int sz); - -struct ir3_block * ir3_block_create(struct ir3 *shader); - -struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc); -struct ir3_instruction * ir3_instr_create2(struct ir3_block *block, - opc_t opc, int nreg); -struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr); -void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep); -const char *ir3_instr_name(struct ir3_instruction *instr); - -struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, - int num, int flags); -struct ir3_register * ir3_reg_clone(struct ir3 *shader, - struct ir3_register *reg); - -void ir3_instr_set_address(struct ir3_instruction *instr, - struct ir3_instruction *addr); - -static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) -{ - if (instr->flags & IR3_INSTR_MARK) - return true; /* already visited */ - instr->flags |= IR3_INSTR_MARK; - return false; -} - -void ir3_block_clear_mark(struct ir3_block *block); -void ir3_clear_mark(struct ir3 *shader); - -unsigned ir3_count_instructions(struct ir3 *ir); - -static inline int ir3_instr_regno(struct ir3_instruction *instr, - struct ir3_register *reg) -{ - unsigned i; - for (i = 0; i < instr->regs_count; i++) - if (reg == instr->regs[i]) - return i; - return -1; -} - - -#define MAX_ARRAYS 16 - -/* comp: - * 0 - x - * 1 - y - * 2 - z - * 3 - w - */ -static inline uint32_t regid(int num, int comp) -{ - return (num << 2) | (comp & 0x3); -} - -static inline uint32_t reg_num(struct 
ir3_register *reg) -{ - return reg->num >> 2; -} - -static inline uint32_t reg_comp(struct ir3_register *reg) -{ - return reg->num & 0x3; -} - -static inline bool is_flow(struct ir3_instruction *instr) -{ - return (opc_cat(instr->opc) == 0); -} - -static inline bool is_kill(struct ir3_instruction *instr) -{ - return instr->opc == OPC_KILL; -} - -static inline bool is_nop(struct ir3_instruction *instr) -{ - return instr->opc == OPC_NOP; -} - -/* Is it a non-transformative (ie. not type changing) mov? This can - * also include absneg.s/absneg.f, which for the most part can be - * treated as a mov (single src argument). - */ -static inline bool is_same_type_mov(struct ir3_instruction *instr) -{ - struct ir3_register *dst; - - switch (instr->opc) { - case OPC_MOV: - if (instr->cat1.src_type != instr->cat1.dst_type) - return false; - break; - case OPC_ABSNEG_F: - case OPC_ABSNEG_S: - if (instr->flags & IR3_INSTR_SAT) - return false; - break; - default: - return false; - } - - dst = instr->regs[0]; - - /* mov's that write to a0.x or p0.x are special: */ - if (dst->num == regid(REG_P0, 0)) - return false; - if (dst->num == regid(REG_A0, 0)) - return false; - - if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY)) - return false; - - return true; -} - -static inline bool is_alu(struct ir3_instruction *instr) -{ - return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3); -} - -static inline bool is_sfu(struct ir3_instruction *instr) -{ - return (opc_cat(instr->opc) == 4); -} - -static inline bool is_tex(struct ir3_instruction *instr) -{ - return (opc_cat(instr->opc) == 5); -} - -static inline bool is_mem(struct ir3_instruction *instr) -{ - return (opc_cat(instr->opc) == 6); -} - -static inline bool is_barrier(struct ir3_instruction *instr) -{ - return (opc_cat(instr->opc) == 7); -} - -static inline bool -is_store(struct ir3_instruction *instr) -{ - /* these instructions, the "destination" register is - * actually a source, the address to store to. 
- */ - switch (instr->opc) { - case OPC_STG: - case OPC_STGB: - case OPC_STIB: - case OPC_STP: - case OPC_STL: - case OPC_STLW: - case OPC_L2G: - case OPC_G2L: - return true; - default: - return false; - } -} - -static inline bool is_load(struct ir3_instruction *instr) -{ - switch (instr->opc) { - case OPC_LDG: - case OPC_LDGB: - case OPC_LDL: - case OPC_LDP: - case OPC_L2G: - case OPC_LDLW: - case OPC_LDC: - case OPC_LDLV: - /* probably some others too.. */ - return true; - default: - return false; - } -} - -static inline bool is_input(struct ir3_instruction *instr) -{ - /* in some cases, ldlv is used to fetch varying without - * interpolation.. fortunately inloc is the first src - * register in either case - */ - switch (instr->opc) { - case OPC_LDLV: - case OPC_BARY_F: - return true; - default: - return false; - } -} - -static inline bool is_bool(struct ir3_instruction *instr) -{ - switch (instr->opc) { - case OPC_CMPS_F: - case OPC_CMPS_S: - case OPC_CMPS_U: - return true; - default: - return false; - } -} - -static inline bool is_meta(struct ir3_instruction *instr) -{ - /* TODO how should we count PHI (and maybe fan-in/out) which - * might actually contribute some instructions to the final - * result? 
- */ - return (opc_cat(instr->opc) == -1); -} - -static inline bool writes_addr(struct ir3_instruction *instr) -{ - if (instr->regs_count > 0) { - struct ir3_register *dst = instr->regs[0]; - return reg_num(dst) == REG_A0; - } - return false; -} - -static inline bool writes_pred(struct ir3_instruction *instr) -{ - if (instr->regs_count > 0) { - struct ir3_register *dst = instr->regs[0]; - return reg_num(dst) == REG_P0; - } - return false; -} - -/* returns defining instruction for reg */ -/* TODO better name */ -static inline struct ir3_instruction *ssa(struct ir3_register *reg) -{ - if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) { - return reg->instr; - } - return NULL; -} - -static inline bool conflicts(struct ir3_instruction *a, - struct ir3_instruction *b) -{ - return (a && b) && (a != b); -} - -static inline bool reg_gpr(struct ir3_register *r) -{ - if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED)) - return false; - if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0)) - return false; - return true; -} - -static inline type_t half_type(type_t type) -{ - switch (type) { - case TYPE_F32: return TYPE_F16; - case TYPE_U32: return TYPE_U16; - case TYPE_S32: return TYPE_S16; - case TYPE_F16: - case TYPE_U16: - case TYPE_S16: - return type; - default: - assert(0); - return ~0; - } -} - -/* some cat2 instructions (ie. 
those which are not float) can embed an - * immediate: - */ -static inline bool ir3_cat2_int(opc_t opc) -{ - switch (opc) { - case OPC_ADD_U: - case OPC_ADD_S: - case OPC_SUB_U: - case OPC_SUB_S: - case OPC_CMPS_U: - case OPC_CMPS_S: - case OPC_MIN_U: - case OPC_MIN_S: - case OPC_MAX_U: - case OPC_MAX_S: - case OPC_CMPV_U: - case OPC_CMPV_S: - case OPC_MUL_U: - case OPC_MUL_S: - case OPC_MULL_U: - case OPC_CLZ_S: - case OPC_ABSNEG_S: - case OPC_AND_B: - case OPC_OR_B: - case OPC_NOT_B: - case OPC_XOR_B: - case OPC_BFREV_B: - case OPC_CLZ_B: - case OPC_SHL_B: - case OPC_SHR_B: - case OPC_ASHR_B: - case OPC_MGEN_B: - case OPC_GETBIT_B: - case OPC_CBITS_B: - case OPC_BARY_F: - return true; - - default: - return false; - } -} - - -/* map cat2 instruction to valid abs/neg flags: */ -static inline unsigned ir3_cat2_absneg(opc_t opc) -{ - switch (opc) { - case OPC_ADD_F: - case OPC_MIN_F: - case OPC_MAX_F: - case OPC_MUL_F: - case OPC_SIGN_F: - case OPC_CMPS_F: - case OPC_ABSNEG_F: - case OPC_CMPV_F: - case OPC_FLOOR_F: - case OPC_CEIL_F: - case OPC_RNDNE_F: - case OPC_RNDAZ_F: - case OPC_TRUNC_F: - case OPC_BARY_F: - return IR3_REG_FABS | IR3_REG_FNEG; - - case OPC_ADD_U: - case OPC_ADD_S: - case OPC_SUB_U: - case OPC_SUB_S: - case OPC_CMPS_U: - case OPC_CMPS_S: - case OPC_MIN_U: - case OPC_MIN_S: - case OPC_MAX_U: - case OPC_MAX_S: - case OPC_CMPV_U: - case OPC_CMPV_S: - case OPC_MUL_U: - case OPC_MUL_S: - case OPC_MULL_U: - case OPC_CLZ_S: - return 0; - - case OPC_ABSNEG_S: - return IR3_REG_SABS | IR3_REG_SNEG; - - case OPC_AND_B: - case OPC_OR_B: - case OPC_NOT_B: - case OPC_XOR_B: - case OPC_BFREV_B: - case OPC_CLZ_B: - case OPC_SHL_B: - case OPC_SHR_B: - case OPC_ASHR_B: - case OPC_MGEN_B: - case OPC_GETBIT_B: - case OPC_CBITS_B: - return IR3_REG_BNOT; - - default: - return 0; - } -} - -/* map cat3 instructions to valid abs/neg flags: */ -static inline unsigned ir3_cat3_absneg(opc_t opc) -{ - switch (opc) { - case OPC_MAD_F16: - case OPC_MAD_F32: - case OPC_SEL_F16: 
- case OPC_SEL_F32: - return IR3_REG_FNEG; - - case OPC_MAD_U16: - case OPC_MADSH_U16: - case OPC_MAD_S16: - case OPC_MADSH_M16: - case OPC_MAD_U24: - case OPC_MAD_S24: - case OPC_SEL_S16: - case OPC_SEL_S32: - case OPC_SAD_S16: - case OPC_SAD_S32: - /* neg *may* work on 3rd src.. */ - - case OPC_SEL_B16: - case OPC_SEL_B32: - - default: - return 0; - } -} - -#define MASK(n) ((1 << (n)) - 1) - -/* iterator for an instructions's sources (reg), also returns src #: */ -#define foreach_src_n(__srcreg, __n, __instr) \ - if ((__instr)->regs_count) \ - for (unsigned __cnt = (__instr)->regs_count - 1, __n = 0; __n < __cnt; __n++) \ - if ((__srcreg = (__instr)->regs[__n + 1])) - -/* iterator for an instructions's sources (reg): */ -#define foreach_src(__srcreg, __instr) \ - foreach_src_n(__srcreg, __i, __instr) - -static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr) -{ - unsigned cnt = instr->regs_count + instr->deps_count; - if (instr->address) - cnt++; - return cnt; -} - -static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n) -{ - if (n == (instr->regs_count + instr->deps_count)) - return instr->address; - if (n >= instr->regs_count) - return instr->deps[n - instr->regs_count]; - return ssa(instr->regs[n]); -} - -static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n) -{ - if (n == (instr->regs_count + instr->deps_count)) - return false; - if (n >= instr->regs_count) - return true; - return false; -} - -#define __src_cnt(__instr) ((__instr)->address ? 
(__instr)->regs_count : (__instr)->regs_count - 1) - -/* iterator for an instruction's SSA sources (instr), also returns src #: */ -#define foreach_ssa_src_n(__srcinst, __n, __instr) \ - for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \ - if ((__srcinst = __ssa_src_n(__instr, __n))) - -/* iterator for an instruction's SSA sources (instr): */ -#define foreach_ssa_src(__srcinst, __instr) \ - foreach_ssa_src_n(__srcinst, __i, __instr) - - -/* dump: */ -void ir3_print(struct ir3 *ir); -void ir3_print_instr(struct ir3_instruction *instr); - -/* depth calculation: */ -int ir3_delayslots(struct ir3_instruction *assigner, - struct ir3_instruction *consumer, unsigned n); -void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list); -void ir3_depth(struct ir3 *ir); - -/* copy-propagate: */ -struct ir3_shader_variant; -void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so); - -/* group neighbors and insert mov's to resolve conflicts: */ -void ir3_group(struct ir3 *ir); - -/* scheduling: */ -void ir3_sched_add_deps(struct ir3 *ir); -int ir3_sched(struct ir3 *ir); - -/* register assignment: */ -struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler); -int ir3_ra(struct ir3 *ir3, gl_shader_stage type, - bool frag_coord, bool frag_face); - -/* legalize: */ -void ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary); - -/* ************************************************************************* */ -/* instruction helpers */ - -/* creates SSA src of correct type (ie. 
half vs full precision) */ -static inline struct ir3_register * __ssa_src(struct ir3_instruction *instr, - struct ir3_instruction *src, unsigned flags) -{ - struct ir3_register *reg; - if (src->regs[0]->flags & IR3_REG_HALF) - flags |= IR3_REG_HALF; - reg = ir3_reg_create(instr, 0, IR3_REG_SSA | flags); - reg->instr = src; - return reg; -} - -static inline struct ir3_instruction * -ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type) -{ - struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV); - ir3_reg_create(instr, 0, 0); /* dst */ - if (src->regs[0]->flags & IR3_REG_ARRAY) { - struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY); - src_reg->array = src->regs[0]->array; - } else { - __ssa_src(instr, src, 0); - } - debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV)); - instr->cat1.src_type = type; - instr->cat1.dst_type = type; - return instr; -} - -static inline struct ir3_instruction * -ir3_COV(struct ir3_block *block, struct ir3_instruction *src, - type_t src_type, type_t dst_type) -{ - struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV); - unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0; - unsigned src_flags = (type_size(src_type) < 32) ? 
IR3_REG_HALF : 0; - - debug_assert((src->regs[0]->flags & IR3_REG_HALF) == src_flags); - - ir3_reg_create(instr, 0, dst_flags); /* dst */ - __ssa_src(instr, src, 0); - instr->cat1.src_type = src_type; - instr->cat1.dst_type = dst_type; - debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY)); - return instr; -} - -static inline struct ir3_instruction * -ir3_NOP(struct ir3_block *block) -{ - return ir3_instr_create(block, OPC_NOP); -} - -#define INSTR0(name) \ -static inline struct ir3_instruction * \ -ir3_##name(struct ir3_block *block) \ -{ \ - struct ir3_instruction *instr = \ - ir3_instr_create(block, OPC_##name); \ - return instr; \ -} - -#define INSTR1(name) \ -static inline struct ir3_instruction * \ -ir3_##name(struct ir3_block *block, \ - struct ir3_instruction *a, unsigned aflags) \ -{ \ - struct ir3_instruction *instr = \ - ir3_instr_create(block, OPC_##name); \ - ir3_reg_create(instr, 0, 0); /* dst */ \ - __ssa_src(instr, a, aflags); \ - return instr; \ -} - -#define INSTR2(name) \ -static inline struct ir3_instruction * \ -ir3_##name(struct ir3_block *block, \ - struct ir3_instruction *a, unsigned aflags, \ - struct ir3_instruction *b, unsigned bflags) \ -{ \ - struct ir3_instruction *instr = \ - ir3_instr_create(block, OPC_##name); \ - ir3_reg_create(instr, 0, 0); /* dst */ \ - __ssa_src(instr, a, aflags); \ - __ssa_src(instr, b, bflags); \ - return instr; \ -} - -#define INSTR3(name) \ -static inline struct ir3_instruction * \ -ir3_##name(struct ir3_block *block, \ - struct ir3_instruction *a, unsigned aflags, \ - struct ir3_instruction *b, unsigned bflags, \ - struct ir3_instruction *c, unsigned cflags) \ -{ \ - struct ir3_instruction *instr = \ - ir3_instr_create(block, OPC_##name); \ - ir3_reg_create(instr, 0, 0); /* dst */ \ - __ssa_src(instr, a, aflags); \ - __ssa_src(instr, b, bflags); \ - __ssa_src(instr, c, cflags); \ - return instr; \ -} - -#define INSTR4(name) \ -static inline struct ir3_instruction * \ -ir3_##name(struct ir3_block *block, \ - 
struct ir3_instruction *a, unsigned aflags, \ - struct ir3_instruction *b, unsigned bflags, \ - struct ir3_instruction *c, unsigned cflags, \ - struct ir3_instruction *d, unsigned dflags) \ -{ \ - struct ir3_instruction *instr = \ - ir3_instr_create2(block, OPC_##name, 5); \ - ir3_reg_create(instr, 0, 0); /* dst */ \ - __ssa_src(instr, a, aflags); \ - __ssa_src(instr, b, bflags); \ - __ssa_src(instr, c, cflags); \ - __ssa_src(instr, d, dflags); \ - return instr; \ -} - -#define INSTR4F(f, name) \ -static inline struct ir3_instruction * \ -ir3_##name##_##f(struct ir3_block *block, \ - struct ir3_instruction *a, unsigned aflags, \ - struct ir3_instruction *b, unsigned bflags, \ - struct ir3_instruction *c, unsigned cflags, \ - struct ir3_instruction *d, unsigned dflags) \ -{ \ - struct ir3_instruction *instr = \ - ir3_instr_create2(block, OPC_##name, 5); \ - ir3_reg_create(instr, 0, 0); /* dst */ \ - __ssa_src(instr, a, aflags); \ - __ssa_src(instr, b, bflags); \ - __ssa_src(instr, c, cflags); \ - __ssa_src(instr, d, dflags); \ - instr->flags |= IR3_INSTR_##f; \ - return instr; \ -} - -/* cat0 instructions: */ -INSTR0(BR) -INSTR0(JUMP) -INSTR1(KILL) -INSTR0(END) - -/* cat2 instructions, most 2 src but some 1 src: */ -INSTR2(ADD_F) -INSTR2(MIN_F) -INSTR2(MAX_F) -INSTR2(MUL_F) -INSTR1(SIGN_F) -INSTR2(CMPS_F) -INSTR1(ABSNEG_F) -INSTR2(CMPV_F) -INSTR1(FLOOR_F) -INSTR1(CEIL_F) -INSTR1(RNDNE_F) -INSTR1(RNDAZ_F) -INSTR1(TRUNC_F) -INSTR2(ADD_U) -INSTR2(ADD_S) -INSTR2(SUB_U) -INSTR2(SUB_S) -INSTR2(CMPS_U) -INSTR2(CMPS_S) -INSTR2(MIN_U) -INSTR2(MIN_S) -INSTR2(MAX_U) -INSTR2(MAX_S) -INSTR1(ABSNEG_S) -INSTR2(AND_B) -INSTR2(OR_B) -INSTR1(NOT_B) -INSTR2(XOR_B) -INSTR2(CMPV_U) -INSTR2(CMPV_S) -INSTR2(MUL_U) -INSTR2(MUL_S) -INSTR2(MULL_U) -INSTR1(BFREV_B) -INSTR1(CLZ_S) -INSTR1(CLZ_B) -INSTR2(SHL_B) -INSTR2(SHR_B) -INSTR2(ASHR_B) -INSTR2(BARY_F) -INSTR2(MGEN_B) -INSTR2(GETBIT_B) -INSTR1(SETRM) -INSTR1(CBITS_B) -INSTR2(SHB) -INSTR2(MSAD) - -/* cat3 instructions: */ -INSTR3(MAD_U16) 
-INSTR3(MADSH_U16) -INSTR3(MAD_S16) -INSTR3(MADSH_M16) -INSTR3(MAD_U24) -INSTR3(MAD_S24) -INSTR3(MAD_F16) -INSTR3(MAD_F32) -INSTR3(SEL_B16) -INSTR3(SEL_B32) -INSTR3(SEL_S16) -INSTR3(SEL_S32) -INSTR3(SEL_F16) -INSTR3(SEL_F32) -INSTR3(SAD_S16) -INSTR3(SAD_S32) - -/* cat4 instructions: */ -INSTR1(RCP) -INSTR1(RSQ) -INSTR1(LOG2) -INSTR1(EXP2) -INSTR1(SIN) -INSTR1(COS) -INSTR1(SQRT) - -/* cat5 instructions: */ -INSTR1(DSX) -INSTR1(DSY) - -static inline struct ir3_instruction * -ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, - unsigned wrmask, unsigned flags, unsigned samp, unsigned tex, - struct ir3_instruction *src0, struct ir3_instruction *src1) -{ - struct ir3_instruction *sam; - struct ir3_register *reg; - - sam = ir3_instr_create(block, opc); - sam->flags |= flags; - ir3_reg_create(sam, 0, 0)->wrmask = wrmask; - if (src0) { - reg = ir3_reg_create(sam, 0, IR3_REG_SSA); - reg->wrmask = (1 << (src0->regs_count - 1)) - 1; - reg->instr = src0; - } - if (src1) { - reg = ir3_reg_create(sam, 0, IR3_REG_SSA); - reg->instr = src1; - reg->wrmask = (1 << (src1->regs_count - 1)) - 1; - } - sam->cat5.samp = samp; - sam->cat5.tex = tex; - sam->cat5.type = type; - - return sam; -} - -/* cat6 instructions: */ -INSTR2(LDLV) -INSTR2(LDG) -INSTR2(LDL) -INSTR3(STG) -INSTR3(STL) -INSTR3(LDGB) -INSTR4(STGB) -INSTR4(STIB) -INSTR1(RESINFO) -INSTR1(RESFMT) -INSTR2(ATOMIC_ADD) -INSTR2(ATOMIC_SUB) -INSTR2(ATOMIC_XCHG) -INSTR2(ATOMIC_INC) -INSTR2(ATOMIC_DEC) -INSTR2(ATOMIC_CMPXCHG) -INSTR2(ATOMIC_MIN) -INSTR2(ATOMIC_MAX) -INSTR2(ATOMIC_AND) -INSTR2(ATOMIC_OR) -INSTR2(ATOMIC_XOR) -INSTR4F(G, ATOMIC_ADD) -INSTR4F(G, ATOMIC_SUB) -INSTR4F(G, ATOMIC_XCHG) -INSTR4F(G, ATOMIC_INC) -INSTR4F(G, ATOMIC_DEC) -INSTR4F(G, ATOMIC_CMPXCHG) -INSTR4F(G, ATOMIC_MIN) -INSTR4F(G, ATOMIC_MAX) -INSTR4F(G, ATOMIC_AND) -INSTR4F(G, ATOMIC_OR) -INSTR4F(G, ATOMIC_XOR) - -/* cat7 instructions: */ -INSTR0(BAR) -INSTR0(FENCE) - -/* ************************************************************************* */ -/* 
split this out or find some helper to use.. like main/bitset.h.. */ - -#include <string.h> - -#define MAX_REG 256 - -typedef uint8_t regmask_t[2 * MAX_REG / 8]; - -static inline unsigned regmask_idx(struct ir3_register *reg) -{ - unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num; - debug_assert(num < MAX_REG); - if (reg->flags & IR3_REG_HALF) - num += MAX_REG; - return num; -} - -static inline void regmask_init(regmask_t *regmask) -{ - memset(regmask, 0, sizeof(*regmask)); -} - -static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg) -{ - unsigned idx = regmask_idx(reg); - if (reg->flags & IR3_REG_RELATIV) { - unsigned i; - for (i = 0; i < reg->size; i++, idx++) - (*regmask)[idx / 8] |= 1 << (idx % 8); - } else { - unsigned mask; - for (mask = reg->wrmask; mask; mask >>= 1, idx++) - if (mask & 1) - (*regmask)[idx / 8] |= 1 << (idx % 8); - } -} - -static inline void regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b) -{ - unsigned i; - for (i = 0; i < ARRAY_SIZE(*dst); i++) - (*dst)[i] = (*a)[i] | (*b)[i]; -} - -/* set bits in a if not set in b, conceptually: - * a |= (reg & ~b) - */ -static inline void regmask_set_if_not(regmask_t *a, - struct ir3_register *reg, regmask_t *b) -{ - unsigned idx = regmask_idx(reg); - if (reg->flags & IR3_REG_RELATIV) { - unsigned i; - for (i = 0; i < reg->size; i++, idx++) - if (!((*b)[idx / 8] & (1 << (idx % 8)))) - (*a)[idx / 8] |= 1 << (idx % 8); - } else { - unsigned mask; - for (mask = reg->wrmask; mask; mask >>= 1, idx++) - if (mask & 1) - if (!((*b)[idx / 8] & (1 << (idx % 8)))) - (*a)[idx / 8] |= 1 << (idx % 8); - } -} - -static inline bool regmask_get(regmask_t *regmask, - struct ir3_register *reg) -{ - unsigned idx = regmask_idx(reg); - if (reg->flags & IR3_REG_RELATIV) { - unsigned i; - for (i = 0; i < reg->size; i++, idx++) - if ((*regmask)[idx / 8] & (1 << (idx % 8))) - return true; - } else { - unsigned mask; - for (mask = reg->wrmask; mask; mask >>= 1, idx++) - if 
(mask & 1) - if ((*regmask)[idx / 8] & (1 << (idx % 8))) - return true; - } - return false; -} - -/* ************************************************************************* */ - -#endif /* IR3_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cache.h b/src/gallium/drivers/freedreno/ir3/ir3_cache.h index 3d3a7f8050d..73d555e92ce 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cache.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_cache.h @@ -27,7 +27,7 @@ #ifndef IR3_CACHE_H_ #define IR3_CACHE_H_ -#include "ir3_shader.h" +#include "ir3/ir3_shader.h" /* * An in-memory cache for mapping shader state objects plus shader key to diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c index d12cdd353ab..47fd5dfd012 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c @@ -37,11 +37,11 @@ #include "tgsi/tgsi_text.h" #include "tgsi/tgsi_dump.h" -#include "ir3_compiler.h" -#include "ir3_gallium.h" -#include "ir3_nir.h" -#include "instr-a3xx.h" -#include "ir3.h" +#include "ir3/ir3_compiler.h" +#include "ir3/ir3_gallium.h" +#include "ir3/ir3_nir.h" +#include "ir3/instr-a3xx.h" +#include "ir3/ir3.h" #include "compiler/glsl/standalone.h" #include "compiler/glsl/glsl_to_nir.h" diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c deleted file mode 100644 index f00daebabf5..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (C) 2015 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to 
whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark <[email protected]> - */ - -#include "util/ralloc.h" - -#include "ir3_compiler.h" - -static const struct debug_named_value shader_debug_options[] = { - {"vs", IR3_DBG_SHADER_VS, "Print shader disasm for vertex shaders"}, - {"fs", IR3_DBG_SHADER_FS, "Print shader disasm for fragment shaders"}, - {"cs", IR3_DBG_SHADER_CS, "Print shader disasm for compute shaders"}, - {"disasm", IR3_DBG_DISASM, "Dump NIR and adreno shader disassembly"}, - {"optmsgs", IR3_DBG_OPTMSGS,"Enable optimizer debug messages"}, - DEBUG_NAMED_VALUE_END -}; - -DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG", shader_debug_options, 0) - -enum ir3_shader_debug ir3_shader_debug = 0; - -struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id) -{ - struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler); - - ir3_shader_debug = debug_get_option_ir3_shader_debug(); - - compiler->dev = dev; - compiler->gpu_id = gpu_id; - compiler->set = ir3_ra_alloc_reg_set(compiler); - - if (compiler->gpu_id >= 400) { - /* need special handling for "flat" */ - compiler->flat_bypass = true; - compiler->levels_add_one = false; - compiler->unminify_coords = false; - compiler->txf_ms_with_isaml = false; - 
compiler->array_index_add_half = true; - } else { - /* no special handling for "flat" */ - compiler->flat_bypass = false; - compiler->levels_add_one = true; - compiler->unminify_coords = true; - compiler->txf_ms_with_isaml = true; - compiler->array_index_add_half = false; - } - - return compiler; -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h deleted file mode 100644 index e2336062b29..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (C) 2013 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Authors: - * Rob Clark <[email protected]> - */ - -#ifndef IR3_COMPILER_H_ -#define IR3_COMPILER_H_ - -#include "ir3_shader.h" - -struct ir3_ra_reg_set; - -struct ir3_compiler { - struct fd_device *dev; - uint32_t gpu_id; - struct ir3_ra_reg_set *set; - uint32_t shader_count; - - /* - * Configuration options for things that are handled differently on - * different generations: - */ - - /* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate - * so we need to use ldlv.u32 to load the varying directly: - */ - bool flat_bypass; - - /* on a3xx, we need to add one to # of array levels: - */ - bool levels_add_one; - - /* on a3xx, we need to scale up integer coords for isaml based - * on LoD: - */ - bool unminify_coords; - - /* on a3xx do txf_ms w/ isaml and scaled coords: */ - bool txf_ms_with_isaml; - - /* on a4xx, for array textures we need to add 0.5 to the array - * index coordinate: - */ - bool array_index_add_half; -}; - -struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id); - -int ir3_compile_shader_nir(struct ir3_compiler *compiler, - struct ir3_shader_variant *so); - -enum ir3_shader_debug { - IR3_DBG_SHADER_VS = 0x01, - IR3_DBG_SHADER_FS = 0x02, - IR3_DBG_SHADER_CS = 0x04, - IR3_DBG_DISASM = 0x08, - IR3_DBG_OPTMSGS = 0x10, -}; - -extern enum ir3_shader_debug ir3_shader_debug; - -static inline bool -shader_debug_enabled(gl_shader_stage type) -{ - switch (type) { - case MESA_SHADER_VERTEX: return !!(ir3_shader_debug & IR3_DBG_SHADER_VS); - case MESA_SHADER_FRAGMENT: return !!(ir3_shader_debug & IR3_DBG_SHADER_FS); - case MESA_SHADER_COMPUTE: return !!(ir3_shader_debug & IR3_DBG_SHADER_CS); - default: - debug_assert(0); - return false; - } -} - -#endif /* IR3_COMPILER_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c deleted file mode 100644 index 445a2b291e9..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ 
/dev/null @@ -1,3818 +0,0 @@ -/* - * Copyright (C) 2015 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark <[email protected]> - */ - -#include <stdarg.h> - -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_math.h" - -#include "ir3_compiler.h" -#include "ir3_shader.h" -#include "ir3_nir.h" - -#include "instr-a3xx.h" -#include "ir3.h" - -/* for conditionally setting boolean flag(s): */ -#define COND(bool, val) ((bool) ? (val) : 0) - -#define DBG(fmt, ...) 
\ - do { debug_printf("%s:%d: "fmt "\n", \ - __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) - -struct ir3_context { - struct ir3_compiler *compiler; - - struct nir_shader *s; - - struct nir_instr *cur_instr; /* current instruction, just for debug */ - - struct ir3 *ir; - struct ir3_shader_variant *so; - - struct ir3_block *block; /* the current block */ - struct ir3_block *in_block; /* block created for shader inputs */ - - nir_function_impl *impl; - - /* For fragment shaders, varyings are not actual shader inputs, - * instead the hw passes a varying-coord which is used with - * bary.f. - * - * But NIR doesn't know that, it still declares varyings as - * inputs. So we do all the input tracking normally and fix - * things up after compile_instructions() - * - * NOTE that frag_vcoord is the hardware position (possibly it - * is actually an index or tag or some such.. it is *not* - * values that can be directly used for gl_FragCoord..) - */ - struct ir3_instruction *frag_vcoord; - - /* for fragment shaders, for gl_FrontFacing and gl_FragCoord: */ - struct ir3_instruction *frag_face, *frag_coord; - - /* For vertex shaders, keep track of the system values sources */ - struct ir3_instruction *vertex_id, *basevertex, *instance_id; - - /* For fragment shaders: */ - struct ir3_instruction *samp_id, *samp_mask_in; - - /* Compute shader inputs: */ - struct ir3_instruction *local_invocation_id, *work_group_id; - - /* mapping from nir_register to defining instruction: */ - struct hash_table *def_ht; - - unsigned num_arrays; - - /* a common pattern for indirect addressing is to request the - * same address register multiple times. 
To avoid generating - * duplicate instruction sequences (which our backend does not - * try to clean up, since that should be done as the NIR stage) - * we cache the address value generated for a given src value: - * - * Note that we have to cache these per alignment, since same - * src used for an array of vec1 cannot be also used for an - * array of vec4. - */ - struct hash_table *addr_ht[4]; - - /* last dst array, for indirect we need to insert a var-store. - */ - struct ir3_instruction **last_dst; - unsigned last_dst_n; - - /* maps nir_block to ir3_block, mostly for the purposes of - * figuring out the blocks successors - */ - struct hash_table *block_ht; - - /* on a4xx, bitmask of samplers which need astc+srgb workaround: */ - unsigned astc_srgb; - - unsigned samples; /* bitmask of x,y sample shifts */ - - unsigned max_texture_index; - - /* set if we encounter something we can't handle yet, so we - * can bail cleanly and fallback to TGSI compiler f/e - */ - bool error; -}; - -/* gpu pointer size in units of 32bit registers/slots */ -static unsigned pointer_size(struct ir3_context *ctx) -{ - return (ctx->compiler->gpu_id >= 500) ? 
2 : 1; -} - -static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val); -static struct ir3_block * get_block(struct ir3_context *ctx, const nir_block *nblock); - - -static struct ir3_context * -compile_init(struct ir3_compiler *compiler, - struct ir3_shader_variant *so) -{ - struct ir3_context *ctx = rzalloc(NULL, struct ir3_context); - - if (compiler->gpu_id >= 400) { - if (so->type == MESA_SHADER_VERTEX) { - ctx->astc_srgb = so->key.vastc_srgb; - } else if (so->type == MESA_SHADER_FRAGMENT) { - ctx->astc_srgb = so->key.fastc_srgb; - } - - } else { - if (so->type == MESA_SHADER_VERTEX) { - ctx->samples = so->key.vsamples; - } else if (so->type == MESA_SHADER_FRAGMENT) { - ctx->samples = so->key.fsamples; - } - } - - ctx->compiler = compiler; - ctx->so = so; - ctx->def_ht = _mesa_hash_table_create(ctx, - _mesa_hash_pointer, _mesa_key_pointer_equal); - ctx->block_ht = _mesa_hash_table_create(ctx, - _mesa_hash_pointer, _mesa_key_pointer_equal); - - /* TODO: maybe generate some sort of bitmask of what key - * lowers vs what shader has (ie. no need to lower - * texture clamp lowering if no texture sample instrs).. - * although should be done further up the stack to avoid - * creating duplicate variants.. 
- */ - - if (ir3_key_lowers_nir(&so->key)) { - nir_shader *s = nir_shader_clone(ctx, so->shader->nir); - ctx->s = ir3_optimize_nir(so->shader, s, &so->key); - } else { - /* fast-path for shader key that lowers nothing in NIR: */ - ctx->s = so->shader->nir; - } - - /* this needs to be the last pass run, so do this here instead of - * in ir3_optimize_nir(): - */ - NIR_PASS_V(ctx->s, nir_lower_locals_to_regs); - NIR_PASS_V(ctx->s, nir_convert_from_ssa, true); - - if (ir3_shader_debug & IR3_DBG_DISASM) { - printf("dump nir%dv%d: type=%d, k={cts=%u,hp=%u}", - so->shader->id, so->id, so->type, - so->key.color_two_side, so->key.half_precision); - nir_print_shader(ctx->s, stdout); - } - - if (shader_debug_enabled(so->type)) { - fprintf(stderr, "NIR (final form) for %s shader:\n", - _mesa_shader_stage_to_string(so->type)); - nir_print_shader(ctx->s, stderr); - } - - ir3_nir_scan_driver_consts(ctx->s, &so->const_layout); - - so->num_uniforms = ctx->s->num_uniforms; - so->num_ubos = ctx->s->info.num_ubos; - - /* Layout of constant registers, each section aligned to vec4. Note - * that pointer size (ubo, etc) changes depending on generation. - * - * user consts - * UBO addresses - * SSBO sizes - * if (vertex shader) { - * driver params (IR3_DP_*) - * if (stream_output.num_outputs > 0) - * stream-out addresses - * } - * immediates - * - * Immediates go last mostly because they are inserted in the CP pass - * after the nir -> ir3 frontend. 
- */ - unsigned constoff = align(ctx->s->num_uniforms, 4); - unsigned ptrsz = pointer_size(ctx); - - memset(&so->constbase, ~0, sizeof(so->constbase)); - - if (so->num_ubos > 0) { - so->constbase.ubo = constoff; - constoff += align(ctx->s->info.num_ubos * ptrsz, 4) / 4; - } - - if (so->const_layout.ssbo_size.count > 0) { - unsigned cnt = so->const_layout.ssbo_size.count; - so->constbase.ssbo_sizes = constoff; - constoff += align(cnt, 4) / 4; - } - - if (so->const_layout.image_dims.count > 0) { - unsigned cnt = so->const_layout.image_dims.count; - so->constbase.image_dims = constoff; - constoff += align(cnt, 4) / 4; - } - - unsigned num_driver_params = 0; - if (so->type == MESA_SHADER_VERTEX) { - num_driver_params = IR3_DP_VS_COUNT; - } else if (so->type == MESA_SHADER_COMPUTE) { - num_driver_params = IR3_DP_CS_COUNT; - } - - so->constbase.driver_param = constoff; - constoff += align(num_driver_params, 4) / 4; - - if ((so->type == MESA_SHADER_VERTEX) && - (compiler->gpu_id < 500) && - so->shader->stream_output.num_outputs > 0) { - so->constbase.tfbo = constoff; - constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4; - } - - so->constbase.immediate = constoff; - - return ctx; -} - -static void -compile_error(struct ir3_context *ctx, const char *format, ...) 
-{ - struct hash_table *errors = NULL; - va_list ap; - va_start(ap, format); - if (ctx->cur_instr) { - errors = _mesa_hash_table_create(NULL, - _mesa_hash_pointer, - _mesa_key_pointer_equal); - char *msg = ralloc_vasprintf(errors, format, ap); - _mesa_hash_table_insert(errors, ctx->cur_instr, msg); - } else { - _debug_vprintf(format, ap); - } - va_end(ap); - nir_print_shader_annotated(ctx->s, stdout, errors); - ralloc_free(errors); - ctx->error = true; - debug_assert(0); -} - -#define compile_assert(ctx, cond) do { \ - if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \ - } while (0) - -static void -compile_free(struct ir3_context *ctx) -{ - ralloc_free(ctx); -} - -static void -declare_array(struct ir3_context *ctx, nir_register *reg) -{ - struct ir3_array *arr = rzalloc(ctx, struct ir3_array); - arr->id = ++ctx->num_arrays; - /* NOTE: sometimes we get non array regs, for example for arrays of - * length 1. See fs-const-array-of-struct-of-array.shader_test. So - * treat a non-array as if it was an array of length 1. - * - * It would be nice if there was a nir pass to convert arrays of - * length 1 to ssa. 
- */ - arr->length = reg->num_components * MAX2(1, reg->num_array_elems); - compile_assert(ctx, arr->length > 0); - arr->r = reg; - list_addtail(&arr->node, &ctx->ir->array_list); -} - -static struct ir3_array * -get_array(struct ir3_context *ctx, nir_register *reg) -{ - list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { - if (arr->r == reg) - return arr; - } - compile_error(ctx, "bogus reg: %s\n", reg->name); - return NULL; -} - -/* relative (indirect) if address!=NULL */ -static struct ir3_instruction * -create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n, - struct ir3_instruction *address) -{ - struct ir3_block *block = ctx->block; - struct ir3_instruction *mov; - struct ir3_register *src; - - mov = ir3_instr_create(block, OPC_MOV); - mov->cat1.src_type = TYPE_U32; - mov->cat1.dst_type = TYPE_U32; - mov->barrier_class = IR3_BARRIER_ARRAY_R; - mov->barrier_conflict = IR3_BARRIER_ARRAY_W; - ir3_reg_create(mov, 0, 0); - src = ir3_reg_create(mov, 0, IR3_REG_ARRAY | - COND(address, IR3_REG_RELATIV)); - src->instr = arr->last_write; - src->size = arr->length; - src->array.id = arr->id; - src->array.offset = n; - - if (address) - ir3_instr_set_address(mov, address); - - return mov; -} - -/* relative (indirect) if address!=NULL */ -static void -create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n, - struct ir3_instruction *src, struct ir3_instruction *address) -{ - struct ir3_block *block = ctx->block; - struct ir3_instruction *mov; - struct ir3_register *dst; - - /* if not relative store, don't create an extra mov, since that - * ends up being difficult for cp to remove. 
- */ - if (!address) { - dst = src->regs[0]; - - src->barrier_class |= IR3_BARRIER_ARRAY_W; - src->barrier_conflict |= IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W; - - dst->flags |= IR3_REG_ARRAY; - dst->instr = arr->last_write; - dst->size = arr->length; - dst->array.id = arr->id; - dst->array.offset = n; - - arr->last_write = src; - - array_insert(block, block->keeps, src); - - return; - } - - mov = ir3_instr_create(block, OPC_MOV); - mov->cat1.src_type = TYPE_U32; - mov->cat1.dst_type = TYPE_U32; - mov->barrier_class = IR3_BARRIER_ARRAY_W; - mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W; - dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY | - COND(address, IR3_REG_RELATIV)); - dst->instr = arr->last_write; - dst->size = arr->length; - dst->array.id = arr->id; - dst->array.offset = n; - ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src; - - if (address) - ir3_instr_set_address(mov, address); - - arr->last_write = mov; - - /* the array store may only matter to something in an earlier - * block (ie. loops), but since arrays are not in SSA, depth - * pass won't know this.. 
so keep all array stores: - */ - array_insert(block, block->keeps, mov); -} - -static inline type_t utype_for_size(unsigned bit_size) -{ - switch (bit_size) { - case 32: return TYPE_U32; - case 16: return TYPE_U16; - case 8: return TYPE_U8; - default: unreachable("bad bitsize"); return ~0; - } -} - -static inline type_t utype_src(nir_src src) -{ return utype_for_size(nir_src_bit_size(src)); } - -static inline type_t utype_dst(nir_dest dst) -{ return utype_for_size(nir_dest_bit_size(dst)); } - -/* allocate a n element value array (to be populated by caller) and - * insert in def_ht - */ -static struct ir3_instruction ** -get_dst_ssa(struct ir3_context *ctx, nir_ssa_def *dst, unsigned n) -{ - struct ir3_instruction **value = - ralloc_array(ctx->def_ht, struct ir3_instruction *, n); - _mesa_hash_table_insert(ctx->def_ht, dst, value); - return value; -} - -static struct ir3_instruction ** -get_dst(struct ir3_context *ctx, nir_dest *dst, unsigned n) -{ - struct ir3_instruction **value; - - if (dst->is_ssa) { - value = get_dst_ssa(ctx, &dst->ssa, n); - } else { - value = ralloc_array(ctx, struct ir3_instruction *, n); - } - - /* NOTE: in non-ssa case, we don't really need to store last_dst - * but this helps us catch cases where put_dst() call is forgotten - */ - compile_assert(ctx, !ctx->last_dst); - ctx->last_dst = value; - ctx->last_dst_n = n; - - return value; -} - -static struct ir3_instruction * get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align); - -static struct ir3_instruction * const * -get_src(struct ir3_context *ctx, nir_src *src) -{ - if (src->is_ssa) { - struct hash_entry *entry; - entry = _mesa_hash_table_search(ctx->def_ht, src->ssa); - compile_assert(ctx, entry); - return entry->data; - } else { - nir_register *reg = src->reg.reg; - struct ir3_array *arr = get_array(ctx, reg); - unsigned num_components = arr->r->num_components; - struct ir3_instruction *addr = NULL; - struct ir3_instruction **value = - ralloc_array(ctx, struct 
ir3_instruction *, num_components); - - if (src->reg.indirect) - addr = get_addr(ctx, get_src(ctx, src->reg.indirect)[0], - reg->num_components); - - for (unsigned i = 0; i < num_components; i++) { - unsigned n = src->reg.base_offset * reg->num_components + i; - compile_assert(ctx, n < arr->length); - value[i] = create_array_load(ctx, arr, n, addr); - } - - return value; - } -} - -static void -put_dst(struct ir3_context *ctx, nir_dest *dst) -{ - unsigned bit_size = nir_dest_bit_size(*dst); - - if (bit_size < 32) { - for (unsigned i = 0; i < ctx->last_dst_n; i++) { - struct ir3_instruction *dst = ctx->last_dst[i]; - dst->regs[0]->flags |= IR3_REG_HALF; - if (ctx->last_dst[i]->opc == OPC_META_FO) - dst->regs[1]->instr->regs[0]->flags |= IR3_REG_HALF; - } - } - - if (!dst->is_ssa) { - nir_register *reg = dst->reg.reg; - struct ir3_array *arr = get_array(ctx, reg); - unsigned num_components = ctx->last_dst_n; - struct ir3_instruction *addr = NULL; - - if (dst->reg.indirect) - addr = get_addr(ctx, get_src(ctx, dst->reg.indirect)[0], - reg->num_components); - - for (unsigned i = 0; i < num_components; i++) { - unsigned n = dst->reg.base_offset * reg->num_components + i; - compile_assert(ctx, n < arr->length); - if (!ctx->last_dst[i]) - continue; - create_array_store(ctx, arr, n, ctx->last_dst[i], addr); - } - - ralloc_free(ctx->last_dst); - } - ctx->last_dst = NULL; - ctx->last_dst_n = 0; -} - -static struct ir3_instruction * -create_immed_typed(struct ir3_block *block, uint32_t val, type_t type) -{ - struct ir3_instruction *mov; - unsigned flags = (type_size(type) < 32) ? 
IR3_REG_HALF : 0; - - mov = ir3_instr_create(block, OPC_MOV); - mov->cat1.src_type = type; - mov->cat1.dst_type = type; - ir3_reg_create(mov, 0, flags); - ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val; - - return mov; -} - -static struct ir3_instruction * -create_immed(struct ir3_block *block, uint32_t val) -{ - return create_immed_typed(block, val, TYPE_U32); -} - -static struct ir3_instruction * -create_addr(struct ir3_block *block, struct ir3_instruction *src, int align) -{ - struct ir3_instruction *instr, *immed; - - /* TODO in at least some cases, the backend could probably be - * made clever enough to propagate IR3_REG_HALF.. - */ - instr = ir3_COV(block, src, TYPE_U32, TYPE_S16); - instr->regs[0]->flags |= IR3_REG_HALF; - - switch(align){ - case 1: - /* src *= 1: */ - break; - case 2: - /* src *= 2 => src <<= 1: */ - immed = create_immed(block, 1); - immed->regs[0]->flags |= IR3_REG_HALF; - - instr = ir3_SHL_B(block, instr, 0, immed, 0); - instr->regs[0]->flags |= IR3_REG_HALF; - instr->regs[1]->flags |= IR3_REG_HALF; - break; - case 3: - /* src *= 3: */ - immed = create_immed(block, 3); - immed->regs[0]->flags |= IR3_REG_HALF; - - instr = ir3_MULL_U(block, instr, 0, immed, 0); - instr->regs[0]->flags |= IR3_REG_HALF; - instr->regs[1]->flags |= IR3_REG_HALF; - break; - case 4: - /* src *= 4 => src <<= 2: */ - immed = create_immed(block, 2); - immed->regs[0]->flags |= IR3_REG_HALF; - - instr = ir3_SHL_B(block, instr, 0, immed, 0); - instr->regs[0]->flags |= IR3_REG_HALF; - instr->regs[1]->flags |= IR3_REG_HALF; - break; - default: - unreachable("bad align"); - return NULL; - } - - instr = ir3_MOV(block, instr, TYPE_S16); - instr->regs[0]->num = regid(REG_A0, 0); - instr->regs[0]->flags |= IR3_REG_HALF; - instr->regs[1]->flags |= IR3_REG_HALF; - - return instr; -} - -/* caches addr values to avoid generating multiple cov/shl/mova - * sequences for each use of a given NIR level src as address - */ -static struct ir3_instruction * -get_addr(struct 
ir3_context *ctx, struct ir3_instruction *src, int align) -{ - struct ir3_instruction *addr; - unsigned idx = align - 1; - - compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr_ht)); - - if (!ctx->addr_ht[idx]) { - ctx->addr_ht[idx] = _mesa_hash_table_create(ctx, - _mesa_hash_pointer, _mesa_key_pointer_equal); - } else { - struct hash_entry *entry; - entry = _mesa_hash_table_search(ctx->addr_ht[idx], src); - if (entry) - return entry->data; - } - - addr = create_addr(ctx->block, src, align); - _mesa_hash_table_insert(ctx->addr_ht[idx], src, addr); - - return addr; -} - -static struct ir3_instruction * -get_predicate(struct ir3_context *ctx, struct ir3_instruction *src) -{ - struct ir3_block *b = ctx->block; - struct ir3_instruction *cond; - - /* NOTE: only cmps.*.* can write p0.x: */ - cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0); - cond->cat2.condition = IR3_COND_NE; - - /* condition always goes in predicate register: */ - cond->regs[0]->num = regid(REG_P0, 0); - - return cond; -} - -static struct ir3_instruction * -create_uniform(struct ir3_context *ctx, unsigned n) -{ - struct ir3_instruction *mov; - - mov = ir3_instr_create(ctx->block, OPC_MOV); - /* TODO get types right? 
*/ - mov->cat1.src_type = TYPE_F32; - mov->cat1.dst_type = TYPE_F32; - ir3_reg_create(mov, 0, 0); - ir3_reg_create(mov, n, IR3_REG_CONST); - - return mov; -} - -static struct ir3_instruction * -create_uniform_indirect(struct ir3_context *ctx, int n, - struct ir3_instruction *address) -{ - struct ir3_instruction *mov; - - mov = ir3_instr_create(ctx->block, OPC_MOV); - mov->cat1.src_type = TYPE_U32; - mov->cat1.dst_type = TYPE_U32; - ir3_reg_create(mov, 0, 0); - ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n; - - ir3_instr_set_address(mov, address); - - return mov; -} - -static struct ir3_instruction * -create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr, - unsigned arrsz) -{ - struct ir3_block *block = ctx->block; - struct ir3_instruction *collect; - - if (arrsz == 0) - return NULL; - - unsigned flags = arr[0]->regs[0]->flags & IR3_REG_HALF; - - collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz); - ir3_reg_create(collect, 0, flags); /* dst */ - for (unsigned i = 0; i < arrsz; i++) { - struct ir3_instruction *elem = arr[i]; - - /* Since arrays are pre-colored in RA, we can't assume that - * things will end up in the right place. (Ie. if a collect - * joins elements from two different arrays.) So insert an - * extra mov. - * - * We could possibly skip this if all the collected elements - * are contiguous elements in a single array.. not sure how - * likely that is to happen. - * - * Fixes a problem with glamor shaders, that in effect do - * something like: - * - * if (foo) - * texcoord = .. - * else - * texcoord = .. - * color = texture2D(tex, texcoord); - * - * In this case, texcoord will end up as nir registers (which - * translate to ir3 array's of length 1. And we can't assume - * the two (or more) arrays will get allocated in consecutive - * scalar registers. - * - */ - if (elem->regs[0]->flags & IR3_REG_ARRAY) { - type_t type = (flags & IR3_REG_HALF) ? 
TYPE_U16 : TYPE_U32; - elem = ir3_MOV(block, elem, type); - } - - compile_assert(ctx, (elem->regs[0]->flags & IR3_REG_HALF) == flags); - ir3_reg_create(collect, 0, IR3_REG_SSA | flags)->instr = elem; - } - - return collect; -} - -static struct ir3_instruction * -create_indirect_load(struct ir3_context *ctx, unsigned arrsz, int n, - struct ir3_instruction *address, struct ir3_instruction *collect) -{ - struct ir3_block *block = ctx->block; - struct ir3_instruction *mov; - struct ir3_register *src; - - mov = ir3_instr_create(block, OPC_MOV); - mov->cat1.src_type = TYPE_U32; - mov->cat1.dst_type = TYPE_U32; - ir3_reg_create(mov, 0, 0); - src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV); - src->instr = collect; - src->size = arrsz; - src->array.offset = n; - - ir3_instr_set_address(mov, address); - - return mov; -} - -static struct ir3_instruction * -create_input_compmask(struct ir3_context *ctx, unsigned n, unsigned compmask) -{ - struct ir3_instruction *in; - - in = ir3_instr_create(ctx->in_block, OPC_META_INPUT); - in->inout.block = ctx->in_block; - ir3_reg_create(in, n, 0); - - in->regs[0]->wrmask = compmask; - - return in; -} - -static struct ir3_instruction * -create_input(struct ir3_context *ctx, unsigned n) -{ - return create_input_compmask(ctx, n, 0x1); -} - -static struct ir3_instruction * -create_frag_input(struct ir3_context *ctx, bool use_ldlv) -{ - struct ir3_block *block = ctx->block; - struct ir3_instruction *instr; - /* actual inloc is assigned and fixed up later: */ - struct ir3_instruction *inloc = create_immed(block, 0); - - if (use_ldlv) { - instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0); - instr->cat6.type = TYPE_U32; - instr->cat6.iim_val = 1; - } else { - instr = ir3_BARY_F(block, inloc, 0, ctx->frag_vcoord, 0); - instr->regs[2]->wrmask = 0x3; - } - - return instr; -} - -static struct ir3_instruction * -create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp) -{ - /* first four vec4 sysval's reserved for 
UBOs: */ - /* NOTE: dp is in scalar, but there can be >4 dp components: */ - unsigned n = ctx->so->constbase.driver_param; - unsigned r = regid(n + dp / 4, dp % 4); - return create_uniform(ctx, r); -} - -/* helper for instructions that produce multiple consecutive scalar - * outputs which need to have a split/fanout meta instruction inserted - */ -static void -split_dest(struct ir3_block *block, struct ir3_instruction **dst, - struct ir3_instruction *src, unsigned base, unsigned n) -{ - struct ir3_instruction *prev = NULL; - - if ((n == 1) && (src->regs[0]->wrmask == 0x1)) { - dst[0] = src; - return; - } - - for (int i = 0, j = 0; i < n; i++) { - struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO); - ir3_reg_create(split, 0, IR3_REG_SSA); - ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src; - split->fo.off = i + base; - - if (prev) { - split->cp.left = prev; - split->cp.left_cnt++; - prev->cp.right = split; - prev->cp.right_cnt++; - } - prev = split; - - if (src->regs[0]->wrmask & (1 << (i + base))) - dst[j++] = split; - } -} - -/* - * Adreno uses uint rather than having dedicated bool type, - * which (potentially) requires some conversion, in particular - * when using output of a bool instr to int input, or vice - * versa. - * - * | Adreno | NIR | - * -------+---------+-------+- - * true | 1 | ~0 | - * false | 0 | 0 | - * - * To convert from an adreno bool (uint) to nir, use: - * - * absneg.s dst, (neg)src - * - * To convert back in the other direction: - * - * absneg.s dst, (abs)src - * - * The CP step can clean up the absneg.s that cancel each other - * out, and with a slight bit of extra cleverness (to recognize - * the instructions which produce either a 0 or 1) can eliminate - * the absneg.s's completely when an instruction that wants - * 0/1 consumes the result. For example, when a nir 'bcsel' - * consumes the result of 'feq'. 
So we should be able to get by - * without a boolean resolve step, and without incurring any - * extra penalty in instruction count. - */ - -/* NIR bool -> native (adreno): */ -static struct ir3_instruction * -ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr) -{ - return ir3_ABSNEG_S(block, instr, IR3_REG_SABS); -} - -/* native (adreno) -> NIR bool: */ -static struct ir3_instruction * -ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr) -{ - return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG); -} - -/* - * alu/sfu instructions: - */ - -static struct ir3_instruction * -create_cov(struct ir3_context *ctx, struct ir3_instruction *src, - unsigned src_bitsize, nir_op op) -{ - type_t src_type, dst_type; - - switch (op) { - case nir_op_f2f32: - case nir_op_f2f16_rtne: - case nir_op_f2f16_rtz: - case nir_op_f2f16: - case nir_op_f2i32: - case nir_op_f2i16: - case nir_op_f2i8: - case nir_op_f2u32: - case nir_op_f2u16: - case nir_op_f2u8: - switch (src_bitsize) { - case 32: - src_type = TYPE_F32; - break; - case 16: - src_type = TYPE_F16; - break; - default: - compile_error(ctx, "invalid src bit size: %u", src_bitsize); - } - break; - - case nir_op_i2f32: - case nir_op_i2f16: - case nir_op_i2i32: - case nir_op_i2i16: - case nir_op_i2i8: - switch (src_bitsize) { - case 32: - src_type = TYPE_S32; - break; - case 16: - src_type = TYPE_S16; - break; - case 8: - src_type = TYPE_S8; - break; - default: - compile_error(ctx, "invalid src bit size: %u", src_bitsize); - } - break; - - case nir_op_u2f32: - case nir_op_u2f16: - case nir_op_u2u32: - case nir_op_u2u16: - case nir_op_u2u8: - switch (src_bitsize) { - case 32: - src_type = TYPE_U32; - break; - case 16: - src_type = TYPE_U16; - break; - case 8: - src_type = TYPE_U8; - break; - default: - compile_error(ctx, "invalid src bit size: %u", src_bitsize); - } - break; - - default: - compile_error(ctx, "invalid conversion op: %u", op); - } - - switch (op) { - case nir_op_f2f32: - case nir_op_i2f32: - case 
nir_op_u2f32: - dst_type = TYPE_F32; - break; - - case nir_op_f2f16_rtne: - case nir_op_f2f16_rtz: - case nir_op_f2f16: - /* TODO how to handle rounding mode? */ - case nir_op_i2f16: - case nir_op_u2f16: - dst_type = TYPE_F16; - break; - - case nir_op_f2i32: - case nir_op_i2i32: - dst_type = TYPE_S32; - break; - - case nir_op_f2i16: - case nir_op_i2i16: - dst_type = TYPE_S16; - break; - - case nir_op_f2i8: - case nir_op_i2i8: - dst_type = TYPE_S8; - break; - - case nir_op_f2u32: - case nir_op_u2u32: - dst_type = TYPE_U32; - break; - - case nir_op_f2u16: - case nir_op_u2u16: - dst_type = TYPE_U16; - break; - - case nir_op_f2u8: - case nir_op_u2u8: - dst_type = TYPE_U8; - break; - - default: - compile_error(ctx, "invalid conversion op: %u", op); - } - - return ir3_COV(ctx->block, src, src_type, dst_type); -} - -static void -emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) -{ - const nir_op_info *info = &nir_op_infos[alu->op]; - struct ir3_instruction **dst, *src[info->num_inputs]; - unsigned bs[info->num_inputs]; /* bit size */ - struct ir3_block *b = ctx->block; - unsigned dst_sz, wrmask; - - if (alu->dest.dest.is_ssa) { - dst_sz = alu->dest.dest.ssa.num_components; - wrmask = (1 << dst_sz) - 1; - } else { - dst_sz = alu->dest.dest.reg.reg->num_components; - wrmask = alu->dest.write_mask; - } - - dst = get_dst(ctx, &alu->dest.dest, dst_sz); - - /* Vectors are special in that they have non-scalarized writemasks, - * and just take the first swizzle channel for each argument in - * order into each writemask channel. 
- */ - if ((alu->op == nir_op_vec2) || - (alu->op == nir_op_vec3) || - (alu->op == nir_op_vec4)) { - - for (int i = 0; i < info->num_inputs; i++) { - nir_alu_src *asrc = &alu->src[i]; - - compile_assert(ctx, !asrc->abs); - compile_assert(ctx, !asrc->negate); - - src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[0]]; - if (!src[i]) - src[i] = create_immed(ctx->block, 0); - dst[i] = ir3_MOV(b, src[i], TYPE_U32); - } - - put_dst(ctx, &alu->dest.dest); - return; - } - - /* We also get mov's with more than one component for mov's so - * handle those specially: - */ - if ((alu->op == nir_op_imov) || (alu->op == nir_op_fmov)) { - type_t type = (alu->op == nir_op_imov) ? TYPE_U32 : TYPE_F32; - nir_alu_src *asrc = &alu->src[0]; - struct ir3_instruction *const *src0 = get_src(ctx, &asrc->src); - - for (unsigned i = 0; i < dst_sz; i++) { - if (wrmask & (1 << i)) { - dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], type); - } else { - dst[i] = NULL; - } - } - - put_dst(ctx, &alu->dest.dest); - return; - } - - /* General case: We can just grab the one used channel per src. 
*/ - for (int i = 0; i < info->num_inputs; i++) { - unsigned chan = ffs(alu->dest.write_mask) - 1; - nir_alu_src *asrc = &alu->src[i]; - - compile_assert(ctx, !asrc->abs); - compile_assert(ctx, !asrc->negate); - - src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[chan]]; - bs[i] = nir_src_bit_size(asrc->src); - - compile_assert(ctx, src[i]); - } - - switch (alu->op) { - case nir_op_f2f32: - case nir_op_f2f16_rtne: - case nir_op_f2f16_rtz: - case nir_op_f2f16: - case nir_op_f2i32: - case nir_op_f2i16: - case nir_op_f2i8: - case nir_op_f2u32: - case nir_op_f2u16: - case nir_op_f2u8: - case nir_op_i2f32: - case nir_op_i2f16: - case nir_op_i2i32: - case nir_op_i2i16: - case nir_op_i2i8: - case nir_op_u2f32: - case nir_op_u2f16: - case nir_op_u2u32: - case nir_op_u2u16: - case nir_op_u2u8: - dst[0] = create_cov(ctx, src[0], bs[0], alu->op); - break; - case nir_op_f2b: - dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0); - dst[0]->cat2.condition = IR3_COND_NE; - dst[0] = ir3_n2b(b, dst[0]); - break; - case nir_op_b2f: - dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32); - break; - case nir_op_b2i: - dst[0] = ir3_b2n(b, src[0]); - break; - case nir_op_i2b: - dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0); - dst[0]->cat2.condition = IR3_COND_NE; - dst[0] = ir3_n2b(b, dst[0]); - break; - - case nir_op_fneg: - dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG); - break; - case nir_op_fabs: - dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS); - break; - case nir_op_fmax: - dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0); - break; - case nir_op_fmin: - dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0); - break; - case nir_op_fsat: - /* if there is just a single use of the src, and it supports - * (sat) bit, we can just fold the (sat) flag back to the - * src instruction and create a mov. This is easier for cp - * to eliminate. 
- * - * TODO probably opc_cat==4 is ok too - */ - if (alu->src[0].src.is_ssa && - (list_length(&alu->src[0].src.ssa->uses) == 1) && - ((opc_cat(src[0]->opc) == 2) || (opc_cat(src[0]->opc) == 3))) { - src[0]->flags |= IR3_INSTR_SAT; - dst[0] = ir3_MOV(b, src[0], TYPE_U32); - } else { - /* otherwise generate a max.f that saturates.. blob does - * similar (generating a cat2 mov using max.f) - */ - dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0); - dst[0]->flags |= IR3_INSTR_SAT; - } - break; - case nir_op_fmul: - dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0); - break; - case nir_op_fadd: - dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0); - break; - case nir_op_fsub: - dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG); - break; - case nir_op_ffma: - dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0); - break; - case nir_op_fddx: - dst[0] = ir3_DSX(b, src[0], 0); - dst[0]->cat5.type = TYPE_F32; - break; - case nir_op_fddy: - dst[0] = ir3_DSY(b, src[0], 0); - dst[0]->cat5.type = TYPE_F32; - break; - break; - case nir_op_flt: - dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); - dst[0]->cat2.condition = IR3_COND_LT; - dst[0] = ir3_n2b(b, dst[0]); - break; - case nir_op_fge: - dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); - dst[0]->cat2.condition = IR3_COND_GE; - dst[0] = ir3_n2b(b, dst[0]); - break; - case nir_op_feq: - dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); - dst[0]->cat2.condition = IR3_COND_EQ; - dst[0] = ir3_n2b(b, dst[0]); - break; - case nir_op_fne: - dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); - dst[0]->cat2.condition = IR3_COND_NE; - dst[0] = ir3_n2b(b, dst[0]); - break; - case nir_op_fceil: - dst[0] = ir3_CEIL_F(b, src[0], 0); - break; - case nir_op_ffloor: - dst[0] = ir3_FLOOR_F(b, src[0], 0); - break; - case nir_op_ftrunc: - dst[0] = ir3_TRUNC_F(b, src[0], 0); - break; - case nir_op_fround_even: - dst[0] = ir3_RNDNE_F(b, src[0], 0); - break; - case nir_op_fsign: - dst[0] = ir3_SIGN_F(b, src[0], 0); - break; - - case nir_op_fsin: - dst[0] = ir3_SIN(b, 
src[0], 0); - break; - case nir_op_fcos: - dst[0] = ir3_COS(b, src[0], 0); - break; - case nir_op_frsq: - dst[0] = ir3_RSQ(b, src[0], 0); - break; - case nir_op_frcp: - dst[0] = ir3_RCP(b, src[0], 0); - break; - case nir_op_flog2: - dst[0] = ir3_LOG2(b, src[0], 0); - break; - case nir_op_fexp2: - dst[0] = ir3_EXP2(b, src[0], 0); - break; - case nir_op_fsqrt: - dst[0] = ir3_SQRT(b, src[0], 0); - break; - - case nir_op_iabs: - dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS); - break; - case nir_op_iadd: - dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0); - break; - case nir_op_iand: - dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0); - break; - case nir_op_imax: - dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0); - break; - case nir_op_umax: - dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0); - break; - case nir_op_imin: - dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0); - break; - case nir_op_umin: - dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0); - break; - case nir_op_imul: - /* - * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16) - * mull.u tmp0, a, b ; mul low, i.e. al * bl - * madsh.m16 tmp1, a, b, tmp0 ; mul-add shift high mix, i.e. ah * bl << 16 - * madsh.m16 dst, b, a, tmp1 ; i.e. al * bh << 16 - */ - dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0, - ir3_MADSH_M16(b, src[0], 0, src[1], 0, - ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0); - break; - case nir_op_ineg: - dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG); - break; - case nir_op_inot: - dst[0] = ir3_NOT_B(b, src[0], 0); - break; - case nir_op_ior: - dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0); - break; - case nir_op_ishl: - dst[0] = ir3_SHL_B(b, src[0], 0, src[1], 0); - break; - case nir_op_ishr: - dst[0] = ir3_ASHR_B(b, src[0], 0, src[1], 0); - break; - case nir_op_isign: { - /* maybe this would be sane to lower in nir.. 
*/ - struct ir3_instruction *neg, *pos; - - neg = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0); - neg->cat2.condition = IR3_COND_LT; - - pos = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0); - pos->cat2.condition = IR3_COND_GT; - - dst[0] = ir3_SUB_U(b, pos, 0, neg, 0); - - break; - } - case nir_op_isub: - dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0); - break; - case nir_op_ixor: - dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0); - break; - case nir_op_ushr: - dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0); - break; - case nir_op_ilt: - dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); - dst[0]->cat2.condition = IR3_COND_LT; - dst[0] = ir3_n2b(b, dst[0]); - break; - case nir_op_ige: - dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); - dst[0]->cat2.condition = IR3_COND_GE; - dst[0] = ir3_n2b(b, dst[0]); - break; - case nir_op_ieq: - dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); - dst[0]->cat2.condition = IR3_COND_EQ; - dst[0] = ir3_n2b(b, dst[0]); - break; - case nir_op_ine: - dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); - dst[0]->cat2.condition = IR3_COND_NE; - dst[0] = ir3_n2b(b, dst[0]); - break; - case nir_op_ult: - dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0); - dst[0]->cat2.condition = IR3_COND_LT; - dst[0] = ir3_n2b(b, dst[0]); - break; - case nir_op_uge: - dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0); - dst[0]->cat2.condition = IR3_COND_GE; - dst[0] = ir3_n2b(b, dst[0]); - break; - - case nir_op_bcsel: { - struct ir3_instruction *cond = ir3_b2n(b, src[0]); - compile_assert(ctx, bs[1] == bs[2]); - /* the boolean condition is 32b even if src[1] and src[2] are - * half-precision, but sel.b16 wants all three src's to be the - * same type. 
- */ - if (bs[1] < 32) - cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16); - dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0); - break; - } - case nir_op_bit_count: - dst[0] = ir3_CBITS_B(b, src[0], 0); - break; - case nir_op_ifind_msb: { - struct ir3_instruction *cmp; - dst[0] = ir3_CLZ_S(b, src[0], 0); - cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0); - cmp->cat2.condition = IR3_COND_GE; - dst[0] = ir3_SEL_B32(b, - ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0, - cmp, 0, dst[0], 0); - break; - } - case nir_op_ufind_msb: - dst[0] = ir3_CLZ_B(b, src[0], 0); - dst[0] = ir3_SEL_B32(b, - ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0, - src[0], 0, dst[0], 0); - break; - case nir_op_find_lsb: - dst[0] = ir3_BFREV_B(b, src[0], 0); - dst[0] = ir3_CLZ_B(b, dst[0], 0); - break; - case nir_op_bitfield_reverse: - dst[0] = ir3_BFREV_B(b, src[0], 0); - break; - - default: - compile_error(ctx, "Unhandled ALU op: %s\n", - nir_op_infos[alu->op].name); - break; - } - - put_dst(ctx, &alu->dest.dest); -} - -/* handles direct/indirect UBO reads: */ -static void -emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr, - struct ir3_instruction **dst) -{ - struct ir3_block *b = ctx->block; - struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1; - nir_const_value *const_offset; - /* UBO addresses are the first driver params: */ - unsigned ubo = regid(ctx->so->constbase.ubo, 0); - const unsigned ptrsz = pointer_size(ctx); - - int off = 0; - - /* First src is ubo index, which could either be an immed or not: */ - src0 = get_src(ctx, &intr->src[0])[0]; - if (is_same_type_mov(src0) && - (src0->regs[1]->flags & IR3_REG_IMMED)) { - base_lo = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz)); - base_hi = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz) + 1); - } else { - base_lo = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0, 4)); - base_hi = create_uniform_indirect(ctx, ubo + 1, get_addr(ctx, src0, 4)); - } - - /* note: 
on 32bit gpu's base_hi is ignored and DCE'd */ - addr = base_lo; - - const_offset = nir_src_as_const_value(intr->src[1]); - if (const_offset) { - off += const_offset->u32[0]; - } else { - /* For load_ubo_indirect, second src is indirect offset: */ - src1 = get_src(ctx, &intr->src[1])[0]; - - /* and add offset to addr: */ - addr = ir3_ADD_S(b, addr, 0, src1, 0); - } - - /* if offset is to large to encode in the ldg, split it out: */ - if ((off + (intr->num_components * 4)) > 1024) { - /* split out the minimal amount to improve the odds that - * cp can fit the immediate in the add.s instruction: - */ - unsigned off2 = off + (intr->num_components * 4) - 1024; - addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0); - off -= off2; - } - - if (ptrsz == 2) { - struct ir3_instruction *carry; - - /* handle 32b rollover, ie: - * if (addr < base_lo) - * base_hi++ - */ - carry = ir3_CMPS_U(b, addr, 0, base_lo, 0); - carry->cat2.condition = IR3_COND_LT; - base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0); - - addr = create_collect(ctx, (struct ir3_instruction*[]){ addr, base_hi }, 2); - } - - for (int i = 0; i < intr->num_components; i++) { - struct ir3_instruction *load = - ir3_LDG(b, addr, 0, create_immed(b, 1), 0); - load->cat6.type = TYPE_U32; - load->cat6.src_offset = off + i * 4; /* byte offset */ - dst[i] = load; - } -} - -/* src[] = { buffer_index, offset }. No const_index */ -static void -emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, - struct ir3_instruction **dst) -{ - struct ir3_block *b = ctx->block; - struct ir3_instruction *ldgb, *src0, *src1, *offset; - nir_const_value *const_offset; - - /* can this be non-const buffer_index? how do we handle that? */ - const_offset = nir_src_as_const_value(intr->src[0]); - compile_assert(ctx, const_offset); - - offset = get_src(ctx, &intr->src[1])[0]; - - /* src0 is uvec2(offset*4, 0), src1 is offset.. 
nir already *= 4: */ - src0 = create_collect(ctx, (struct ir3_instruction*[]){ - offset, - create_immed(b, 0), - }, 2); - src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); - - ldgb = ir3_LDGB(b, create_immed(b, const_offset->u32[0]), 0, - src0, 0, src1, 0); - ldgb->regs[0]->wrmask = MASK(intr->num_components); - ldgb->cat6.iim_val = intr->num_components; - ldgb->cat6.d = 4; - ldgb->cat6.type = TYPE_U32; - ldgb->barrier_class = IR3_BARRIER_BUFFER_R; - ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W; - - split_dest(b, dst, ldgb, 0, intr->num_components); -} - -/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */ -static void -emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) -{ - struct ir3_block *b = ctx->block; - struct ir3_instruction *stgb, *src0, *src1, *src2, *offset; - nir_const_value *const_offset; - /* TODO handle wrmask properly, see _store_shared().. but I think - * it is more a PITA than that, since blob ends up loading the - * masked components and writing them back out. - */ - unsigned wrmask = intr->const_index[0]; - unsigned ncomp = ffs(~wrmask) - 1; - - /* can this be non-const buffer_index? how do we handle that? */ - const_offset = nir_src_as_const_value(intr->src[1]); - compile_assert(ctx, const_offset); - - offset = get_src(ctx, &intr->src[2])[0]; - - /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0).. 
- * nir already *= 4: - */ - src0 = create_collect(ctx, get_src(ctx, &intr->src[0]), ncomp); - src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); - src2 = create_collect(ctx, (struct ir3_instruction*[]){ - offset, - create_immed(b, 0), - }, 2); - - stgb = ir3_STGB(b, create_immed(b, const_offset->u32[0]), 0, - src0, 0, src1, 0, src2, 0); - stgb->cat6.iim_val = ncomp; - stgb->cat6.d = 4; - stgb->cat6.type = TYPE_U32; - stgb->barrier_class = IR3_BARRIER_BUFFER_W; - stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; - - array_insert(b, b->keeps, stgb); -} - -/* src[] = { block_index } */ -static void -emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr, - struct ir3_instruction **dst) -{ - /* SSBO size stored as a const starting at ssbo_sizes: */ - unsigned blk_idx = nir_src_as_const_value(intr->src[0])->u32[0]; - unsigned idx = regid(ctx->so->constbase.ssbo_sizes, 0) + - ctx->so->const_layout.ssbo_size.off[blk_idx]; - - debug_assert(ctx->so->const_layout.ssbo_size.mask & (1 << blk_idx)); - - dst[0] = create_uniform(ctx, idx); -} - -/* - * SSBO atomic intrinsics - * - * All of the SSBO atomic memory operations read a value from memory, - * compute a new value using one of the operations below, write the new - * value to memory, and return the original value read. - * - * All operations take 3 sources except CompSwap that takes 4. These - * sources represent: - * - * 0: The SSBO buffer index. - * 1: The offset into the SSBO buffer of the variable that the atomic - * operation will operate on. - * 2: The data parameter to the atomic function (i.e. the value to add - * in ssbo_atomic_add, etc). - * 3: For CompSwap only: the second data parameter. 
- */ -static struct ir3_instruction * -emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) -{ - struct ir3_block *b = ctx->block; - struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *offset; - nir_const_value *const_offset; - type_t type = TYPE_U32; - - /* can this be non-const buffer_index? how do we handle that? */ - const_offset = nir_src_as_const_value(intr->src[0]); - compile_assert(ctx, const_offset); - ssbo = create_immed(b, const_offset->u32[0]); - - offset = get_src(ctx, &intr->src[1])[0]; - - /* src0 is data (or uvec2(data, compare)) - * src1 is offset - * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset) - * - * Note that nir already multiplies the offset by four - */ - src0 = get_src(ctx, &intr->src[2])[0]; - src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); - src2 = create_collect(ctx, (struct ir3_instruction*[]){ - offset, - create_immed(b, 0), - }, 2); - - switch (intr->intrinsic) { - case nir_intrinsic_ssbo_atomic_add: - atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); - break; - case nir_intrinsic_ssbo_atomic_imin: - atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); - type = TYPE_S32; - break; - case nir_intrinsic_ssbo_atomic_umin: - atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); - break; - case nir_intrinsic_ssbo_atomic_imax: - atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); - type = TYPE_S32; - break; - case nir_intrinsic_ssbo_atomic_umax: - atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); - break; - case nir_intrinsic_ssbo_atomic_and: - atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); - break; - case nir_intrinsic_ssbo_atomic_or: - atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); - break; - case nir_intrinsic_ssbo_atomic_xor: - atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); - break; - case nir_intrinsic_ssbo_atomic_exchange: - atomic = ir3_ATOMIC_XCHG_G(b, 
ssbo, 0, src0, 0, src1, 0, src2, 0); - break; - case nir_intrinsic_ssbo_atomic_comp_swap: - /* for cmpxchg, src0 is [ui]vec2(data, compare): */ - src0 = create_collect(ctx, (struct ir3_instruction*[]){ - get_src(ctx, &intr->src[3])[0], - src0, - }, 2); - atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); - break; - default: - unreachable("boo"); - } - - atomic->cat6.iim_val = 1; - atomic->cat6.d = 4; - atomic->cat6.type = type; - atomic->barrier_class = IR3_BARRIER_BUFFER_W; - atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; - - /* even if nothing consume the result, we can't DCE the instruction: */ - array_insert(b, b->keeps, atomic); - - return atomic; -} - -/* src[] = { offset }. const_index[] = { base } */ -static void -emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr, - struct ir3_instruction **dst) -{ - struct ir3_block *b = ctx->block; - struct ir3_instruction *ldl, *offset; - unsigned base; - - offset = get_src(ctx, &intr->src[0])[0]; - base = nir_intrinsic_base(intr); - - ldl = ir3_LDL(b, offset, 0, create_immed(b, intr->num_components), 0); - ldl->cat6.src_offset = base; - ldl->cat6.type = utype_dst(intr->dest); - ldl->regs[0]->wrmask = MASK(intr->num_components); - - ldl->barrier_class = IR3_BARRIER_SHARED_R; - ldl->barrier_conflict = IR3_BARRIER_SHARED_W; - - split_dest(b, dst, ldl, 0, intr->num_components); -} - -/* src[] = { value, offset }. const_index[] = { base, write_mask } */ -static void -emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr) -{ - struct ir3_block *b = ctx->block; - struct ir3_instruction *stl, *offset; - struct ir3_instruction * const *value; - unsigned base, wrmask; - - value = get_src(ctx, &intr->src[0]); - offset = get_src(ctx, &intr->src[1])[0]; - - base = nir_intrinsic_base(intr); - wrmask = nir_intrinsic_write_mask(intr); - - /* Combine groups of consecutive enabled channels in one write - * message. 
We use ffs to find the first enabled channel and then ffs on - * the bit-inverse, down-shifted writemask to determine the length of - * the block of enabled bits. - * - * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic()) - */ - while (wrmask) { - unsigned first_component = ffs(wrmask) - 1; - unsigned length = ffs(~(wrmask >> first_component)) - 1; - - stl = ir3_STL(b, offset, 0, - create_collect(ctx, &value[first_component], length), 0, - create_immed(b, length), 0); - stl->cat6.dst_offset = first_component + base; - stl->cat6.type = utype_src(intr->src[0]); - stl->barrier_class = IR3_BARRIER_SHARED_W; - stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W; - - array_insert(b, b->keeps, stl); - - /* Clear the bits in the writemask that we just wrote, then try - * again to see if more channels are left. - */ - wrmask &= (15 << (first_component + length)); - } -} - -/* - * CS shared variable atomic intrinsics - * - * All of the shared variable atomic memory operations read a value from - * memory, compute a new value using one of the operations below, write the - * new value to memory, and return the original value read. - * - * All operations take 2 sources except CompSwap that takes 3. These - * sources represent: - * - * 0: The offset into the shared variable storage region that the atomic - * operation will operate on. - * 1: The data parameter to the atomic function (i.e. the value to add - * in shared_atomic_add, etc). - * 2: For CompSwap only: the second data parameter. 
- */ -static struct ir3_instruction * -emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr) -{ - struct ir3_block *b = ctx->block; - struct ir3_instruction *atomic, *src0, *src1; - type_t type = TYPE_U32; - - src0 = get_src(ctx, &intr->src[0])[0]; /* offset */ - src1 = get_src(ctx, &intr->src[1])[0]; /* value */ - - switch (intr->intrinsic) { - case nir_intrinsic_shared_atomic_add: - atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0); - break; - case nir_intrinsic_shared_atomic_imin: - atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0); - type = TYPE_S32; - break; - case nir_intrinsic_shared_atomic_umin: - atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0); - break; - case nir_intrinsic_shared_atomic_imax: - atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0); - type = TYPE_S32; - break; - case nir_intrinsic_shared_atomic_umax: - atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0); - break; - case nir_intrinsic_shared_atomic_and: - atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0); - break; - case nir_intrinsic_shared_atomic_or: - atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0); - break; - case nir_intrinsic_shared_atomic_xor: - atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0); - break; - case nir_intrinsic_shared_atomic_exchange: - atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0); - break; - case nir_intrinsic_shared_atomic_comp_swap: - /* for cmpxchg, src1 is [ui]vec2(data, compare): */ - src1 = create_collect(ctx, (struct ir3_instruction*[]){ - get_src(ctx, &intr->src[2])[0], - src1, - }, 2); - atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0); - break; - default: - unreachable("boo"); - } - - atomic->cat6.iim_val = 1; - atomic->cat6.d = 1; - atomic->cat6.type = type; - atomic->barrier_class = IR3_BARRIER_SHARED_W; - atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W; - - /* even if nothing consume the result, we can't DCE the instruction: */ - array_insert(b, b->keeps, atomic); - - return atomic; -} - -/* Images get mapped into SSBO/image state (for 
store/atomic) and texture - * state block (for load). To simplify things, invert the image id and - * map it from end of state block, ie. image 0 becomes num-1, image 1 - * becomes num-2, etc. This potentially avoids needing to re-emit texture - * state when switching shaders. - * - * TODO is max # of samplers and SSBOs the same. This shouldn't be hard- - * coded. Also, since all the gl shader stages (ie. everything but CS) - * share the same SSBO/image state block, this might require some more - * logic if we supported images in anything other than FS.. - */ -static unsigned -get_image_slot(struct ir3_context *ctx, nir_deref_instr *deref) -{ - unsigned int loc = 0; - unsigned inner_size = 1; - - while (deref->deref_type != nir_deref_type_var) { - assert(deref->deref_type == nir_deref_type_array); - nir_const_value *const_index = nir_src_as_const_value(deref->arr.index); - assert(const_index); - - /* Go to the next instruction */ - deref = nir_deref_instr_parent(deref); - - assert(glsl_type_is_array(deref->type)); - const unsigned array_len = glsl_get_length(deref->type); - loc += MIN2(const_index->u32[0], array_len - 1) * inner_size; - - /* Update the inner size */ - inner_size *= array_len; - } - - loc += deref->var->data.driver_location; - - /* TODO figure out real limit per generation, and don't hardcode: */ - const unsigned max_samplers = 16; - return max_samplers - loc - 1; -} - -/* see tex_info() for equiv logic for texture instructions.. it would be - * nice if this could be better unified.. 
- */ -static unsigned -get_image_coords(const nir_variable *var, unsigned *flagsp) -{ - const struct glsl_type *type = glsl_without_array(var->type); - unsigned coords, flags = 0; - - switch (glsl_get_sampler_dim(type)) { - case GLSL_SAMPLER_DIM_1D: - case GLSL_SAMPLER_DIM_BUF: - coords = 1; - break; - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_RECT: - case GLSL_SAMPLER_DIM_EXTERNAL: - case GLSL_SAMPLER_DIM_MS: - coords = 2; - break; - case GLSL_SAMPLER_DIM_3D: - case GLSL_SAMPLER_DIM_CUBE: - flags |= IR3_INSTR_3D; - coords = 3; - break; - default: - unreachable("bad sampler dim"); - return 0; - } - - if (glsl_sampler_type_is_array(type)) { - /* note: unlike tex_info(), adjust # of coords to include array idx: */ - coords++; - flags |= IR3_INSTR_A; - } - - if (flagsp) - *flagsp = flags; - - return coords; -} - -static type_t -get_image_type(const nir_variable *var) -{ - switch (glsl_get_sampler_result_type(glsl_without_array(var->type))) { - case GLSL_TYPE_UINT: - return TYPE_U32; - case GLSL_TYPE_INT: - return TYPE_S32; - case GLSL_TYPE_FLOAT: - return TYPE_F32; - default: - unreachable("bad sampler type."); - return 0; - } -} - -static struct ir3_instruction * -get_image_offset(struct ir3_context *ctx, const nir_variable *var, - struct ir3_instruction * const *coords, bool byteoff) -{ - struct ir3_block *b = ctx->block; - struct ir3_instruction *offset; - unsigned ncoords = get_image_coords(var, NULL); - - /* to calculate the byte offset (yes, uggg) we need (up to) three - * const values to know the bytes per pixel, and y and z stride: - */ - unsigned cb = regid(ctx->so->constbase.image_dims, 0) + - ctx->so->const_layout.image_dims.off[var->data.driver_location]; - - debug_assert(ctx->so->const_layout.image_dims.mask & - (1 << var->data.driver_location)); - - /* offset = coords.x * bytes_per_pixel: */ - offset = ir3_MUL_S(b, coords[0], 0, create_uniform(ctx, cb + 0), 0); - if (ncoords > 1) { - /* offset += coords.y * y_pitch: */ - offset = ir3_MAD_S24(b, 
create_uniform(ctx, cb + 1), 0, - coords[1], 0, offset, 0); - } - if (ncoords > 2) { - /* offset += coords.z * z_pitch: */ - offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 2), 0, - coords[2], 0, offset, 0); - } - - if (!byteoff) { - /* Some cases, like atomics, seem to use dword offset instead - * of byte offsets.. blob just puts an extra shr.b in there - * in those cases: - */ - offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); - } - - return create_collect(ctx, (struct ir3_instruction*[]){ - offset, - create_immed(b, 0), - }, 2); -} - -/* src[] = { deref, coord, sample_index }. const_index[] = {} */ -static void -emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr, - struct ir3_instruction **dst) -{ - struct ir3_block *b = ctx->block; - const nir_variable *var = nir_intrinsic_get_var(intr, 0); - struct ir3_instruction *sam; - struct ir3_instruction * const *src0 = get_src(ctx, &intr->src[1]); - struct ir3_instruction *coords[4]; - unsigned flags, ncoords = get_image_coords(var, &flags); - unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0])); - type_t type = get_image_type(var); - - /* hmm, this seems a bit odd, but it is what blob does and (at least - * a5xx) just faults on bogus addresses otherwise: - */ - if (flags & IR3_INSTR_3D) { - flags &= ~IR3_INSTR_3D; - flags |= IR3_INSTR_A; - } - - for (unsigned i = 0; i < ncoords; i++) - coords[i] = src0[i]; - - if (ncoords == 1) - coords[ncoords++] = create_immed(b, 0); - - sam = ir3_SAM(b, OPC_ISAM, type, 0b1111, flags, - tex_idx, tex_idx, create_collect(ctx, coords, ncoords), NULL); - - sam->barrier_class = IR3_BARRIER_IMAGE_R; - sam->barrier_conflict = IR3_BARRIER_IMAGE_W; - - split_dest(b, dst, sam, 0, 4); -} - -/* src[] = { deref, coord, sample_index, value }. 
const_index[] = {} */ -static void -emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) -{ - struct ir3_block *b = ctx->block; - const nir_variable *var = nir_intrinsic_get_var(intr, 0); - struct ir3_instruction *stib, *offset; - struct ir3_instruction * const *value = get_src(ctx, &intr->src[3]); - struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]); - unsigned ncoords = get_image_coords(var, NULL); - unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0])); - - /* src0 is value - * src1 is coords - * src2 is 64b byte offset - */ - - offset = get_image_offset(ctx, var, coords, true); - - /* NOTE: stib seems to take byte offset, but stgb.typed can be used - * too and takes a dword offset.. not quite sure yet why blob uses - * one over the other in various cases. - */ - - stib = ir3_STIB(b, create_immed(b, tex_idx), 0, - create_collect(ctx, value, 4), 0, - create_collect(ctx, coords, ncoords), 0, - offset, 0); - stib->cat6.iim_val = 4; - stib->cat6.d = ncoords; - stib->cat6.type = get_image_type(var); - stib->cat6.typed = true; - stib->barrier_class = IR3_BARRIER_IMAGE_W; - stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W; - - array_insert(b, b->keeps, stib); -} - -static void -emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr, - struct ir3_instruction **dst) -{ - struct ir3_block *b = ctx->block; - const nir_variable *var = nir_intrinsic_get_var(intr, 0); - unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0])); - struct ir3_instruction *sam, *lod; - unsigned flags, ncoords = get_image_coords(var, &flags); - - lod = create_immed(b, 0); - sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, 0b1111, flags, - tex_idx, tex_idx, lod, NULL); - - /* Array size actually ends up in .w rather than .z. This doesn't - * matter for miplevel 0, but for higher mips the value in z is - * minified whereas w stays. 
Also, the value in TEX_CONST_3_DEPTH is - * returned, which means that we have to add 1 to it for arrays for - * a3xx. - * - * Note use a temporary dst and then copy, since the size of the dst - * array that is passed in is based on nir's understanding of the - * result size, not the hardware's - */ - struct ir3_instruction *tmp[4]; - - split_dest(b, tmp, sam, 0, 4); - - /* get_size instruction returns size in bytes instead of texels - * for imageBuffer, so we need to divide it by the pixel size - * of the image format. - * - * TODO: This is at least true on a5xx. Check other gens. - */ - enum glsl_sampler_dim dim = - glsl_get_sampler_dim(glsl_without_array(var->type)); - if (dim == GLSL_SAMPLER_DIM_BUF) { - /* Since all the possible values the divisor can take are - * power-of-two (4, 8, or 16), the division is implemented - * as a shift-right. - * During shader setup, the log2 of the image format's - * bytes-per-pixel should have been emitted in 2nd slot of - * image_dims. See ir3_shader::emit_image_dims(). - */ - unsigned cb = regid(ctx->so->constbase.image_dims, 0) + - ctx->so->const_layout.image_dims.off[var->data.driver_location]; - struct ir3_instruction *aux = create_uniform(ctx, cb + 1); - - tmp[0] = ir3_SHR_B(b, tmp[0], 0, aux, 0); - } - - for (unsigned i = 0; i < ncoords; i++) - dst[i] = tmp[i]; - - if (flags & IR3_INSTR_A) { - if (ctx->compiler->levels_add_one) { - dst[ncoords-1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0); - } else { - dst[ncoords-1] = ir3_MOV(b, tmp[3], TYPE_U32); - } - } -} - -/* src[] = { deref, coord, sample_index, value, compare }. 
const_index[] = {} */ -static struct ir3_instruction * -emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) -{ - struct ir3_block *b = ctx->block; - const nir_variable *var = nir_intrinsic_get_var(intr, 0); - struct ir3_instruction *atomic, *image, *src0, *src1, *src2; - struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]); - unsigned ncoords = get_image_coords(var, NULL); - - image = create_immed(b, get_image_slot(ctx, nir_src_as_deref(intr->src[0]))); - - /* src0 is value (or uvec2(value, compare)) - * src1 is coords - * src2 is 64b byte offset - */ - src0 = get_src(ctx, &intr->src[3])[0]; - src1 = create_collect(ctx, coords, ncoords); - src2 = get_image_offset(ctx, var, coords, false); - - switch (intr->intrinsic) { - case nir_intrinsic_image_deref_atomic_add: - atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0); - break; - case nir_intrinsic_image_deref_atomic_min: - atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0); - break; - case nir_intrinsic_image_deref_atomic_max: - atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0); - break; - case nir_intrinsic_image_deref_atomic_and: - atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0); - break; - case nir_intrinsic_image_deref_atomic_or: - atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0); - break; - case nir_intrinsic_image_deref_atomic_xor: - atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0); - break; - case nir_intrinsic_image_deref_atomic_exchange: - atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0); - break; - case nir_intrinsic_image_deref_atomic_comp_swap: - /* for cmpxchg, src0 is [ui]vec2(data, compare): */ - src0 = create_collect(ctx, (struct ir3_instruction*[]){ - get_src(ctx, &intr->src[4])[0], - src0, - }, 2); - atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0); - break; - default: - unreachable("boo"); - } - - atomic->cat6.iim_val = 
1; - atomic->cat6.d = ncoords; - atomic->cat6.type = get_image_type(var); - atomic->cat6.typed = true; - atomic->barrier_class = IR3_BARRIER_IMAGE_W; - atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W; - - /* even if nothing consume the result, we can't DCE the instruction: */ - array_insert(b, b->keeps, atomic); - - return atomic; -} - -static void -emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr) -{ - struct ir3_block *b = ctx->block; - struct ir3_instruction *barrier; - - switch (intr->intrinsic) { - case nir_intrinsic_barrier: - barrier = ir3_BAR(b); - barrier->cat7.g = true; - barrier->cat7.l = true; - barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY; - barrier->barrier_class = IR3_BARRIER_EVERYTHING; - break; - case nir_intrinsic_memory_barrier: - barrier = ir3_FENCE(b); - barrier->cat7.g = true; - barrier->cat7.r = true; - barrier->cat7.w = true; - barrier->barrier_class = IR3_BARRIER_IMAGE_W | - IR3_BARRIER_BUFFER_W; - barrier->barrier_conflict = - IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W | - IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; - break; - case nir_intrinsic_memory_barrier_atomic_counter: - case nir_intrinsic_memory_barrier_buffer: - barrier = ir3_FENCE(b); - barrier->cat7.g = true; - barrier->cat7.r = true; - barrier->cat7.w = true; - barrier->barrier_class = IR3_BARRIER_BUFFER_W; - barrier->barrier_conflict = IR3_BARRIER_BUFFER_R | - IR3_BARRIER_BUFFER_W; - break; - case nir_intrinsic_memory_barrier_image: - // TODO double check if this should have .g set - barrier = ir3_FENCE(b); - barrier->cat7.g = true; - barrier->cat7.r = true; - barrier->cat7.w = true; - barrier->barrier_class = IR3_BARRIER_IMAGE_W; - barrier->barrier_conflict = IR3_BARRIER_IMAGE_R | - IR3_BARRIER_IMAGE_W; - break; - case nir_intrinsic_memory_barrier_shared: - barrier = ir3_FENCE(b); - barrier->cat7.g = true; - barrier->cat7.l = true; - barrier->cat7.r = true; - barrier->cat7.w = true; - barrier->barrier_class = IR3_BARRIER_SHARED_W; 
- barrier->barrier_conflict = IR3_BARRIER_SHARED_R | - IR3_BARRIER_SHARED_W; - break; - case nir_intrinsic_group_memory_barrier: - barrier = ir3_FENCE(b); - barrier->cat7.g = true; - barrier->cat7.l = true; - barrier->cat7.r = true; - barrier->cat7.w = true; - barrier->barrier_class = IR3_BARRIER_SHARED_W | - IR3_BARRIER_IMAGE_W | - IR3_BARRIER_BUFFER_W; - barrier->barrier_conflict = - IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W | - IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W | - IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; - break; - default: - unreachable("boo"); - } - - /* make sure barrier doesn't get DCE'd */ - array_insert(b, b->keeps, barrier); -} - -static void add_sysval_input_compmask(struct ir3_context *ctx, - gl_system_value slot, unsigned compmask, - struct ir3_instruction *instr) -{ - struct ir3_shader_variant *so = ctx->so; - unsigned r = regid(so->inputs_count, 0); - unsigned n = so->inputs_count++; - - so->inputs[n].sysval = true; - so->inputs[n].slot = slot; - so->inputs[n].compmask = compmask; - so->inputs[n].regid = r; - so->inputs[n].interpolate = INTERP_MODE_FLAT; - so->total_in++; - - ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1); - ctx->ir->inputs[r] = instr; -} - -static void add_sysval_input(struct ir3_context *ctx, gl_system_value slot, - struct ir3_instruction *instr) -{ - add_sysval_input_compmask(ctx, slot, 0x1, instr); -} - -static void -emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) -{ - const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic]; - struct ir3_instruction **dst; - struct ir3_instruction * const *src; - struct ir3_block *b = ctx->block; - nir_const_value *const_offset; - int idx, comp; - - if (info->has_dest) { - unsigned n = nir_intrinsic_dest_components(intr); - dst = get_dst(ctx, &intr->dest, n); - } else { - dst = NULL; - } - - switch (intr->intrinsic) { - case nir_intrinsic_load_uniform: - idx = nir_intrinsic_base(intr); - const_offset = nir_src_as_const_value(intr->src[0]); - if 
(const_offset) { - idx += const_offset->u32[0]; - for (int i = 0; i < intr->num_components; i++) { - unsigned n = idx * 4 + i; - dst[i] = create_uniform(ctx, n); - } - } else { - src = get_src(ctx, &intr->src[0]); - for (int i = 0; i < intr->num_components; i++) { - int n = idx * 4 + i; - dst[i] = create_uniform_indirect(ctx, n, - get_addr(ctx, src[0], 4)); - } - /* NOTE: if relative addressing is used, we set - * constlen in the compiler (to worst-case value) - * since we don't know in the assembler what the max - * addr reg value can be: - */ - ctx->so->constlen = ctx->s->num_uniforms; - } - break; - case nir_intrinsic_load_ubo: - emit_intrinsic_load_ubo(ctx, intr, dst); - break; - case nir_intrinsic_load_input: - idx = nir_intrinsic_base(intr); - comp = nir_intrinsic_component(intr); - const_offset = nir_src_as_const_value(intr->src[0]); - if (const_offset) { - idx += const_offset->u32[0]; - for (int i = 0; i < intr->num_components; i++) { - unsigned n = idx * 4 + i + comp; - dst[i] = ctx->ir->inputs[n]; - } - } else { - src = get_src(ctx, &intr->src[0]); - struct ir3_instruction *collect = - create_collect(ctx, ctx->ir->inputs, ctx->ir->ninputs); - struct ir3_instruction *addr = get_addr(ctx, src[0], 4); - for (int i = 0; i < intr->num_components; i++) { - unsigned n = idx * 4 + i + comp; - dst[i] = create_indirect_load(ctx, ctx->ir->ninputs, - n, addr, collect); - } - } - break; - case nir_intrinsic_load_ssbo: - emit_intrinsic_load_ssbo(ctx, intr, dst); - break; - case nir_intrinsic_store_ssbo: - emit_intrinsic_store_ssbo(ctx, intr); - break; - case nir_intrinsic_get_buffer_size: - emit_intrinsic_ssbo_size(ctx, intr, dst); - break; - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case 
nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: - dst[0] = emit_intrinsic_atomic_ssbo(ctx, intr); - break; - case nir_intrinsic_load_shared: - emit_intrinsic_load_shared(ctx, intr, dst); - break; - case nir_intrinsic_store_shared: - emit_intrinsic_store_shared(ctx, intr); - break; - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_shared_atomic_comp_swap: - dst[0] = emit_intrinsic_atomic_shared(ctx, intr); - break; - case nir_intrinsic_image_deref_load: - emit_intrinsic_load_image(ctx, intr, dst); - break; - case nir_intrinsic_image_deref_store: - emit_intrinsic_store_image(ctx, intr); - break; - case nir_intrinsic_image_deref_size: - emit_intrinsic_image_size(ctx, intr, dst); - break; - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: - dst[0] = emit_intrinsic_atomic_image(ctx, intr); - break; - case nir_intrinsic_barrier: - case nir_intrinsic_memory_barrier: - case nir_intrinsic_group_memory_barrier: - case nir_intrinsic_memory_barrier_atomic_counter: - case nir_intrinsic_memory_barrier_buffer: - case nir_intrinsic_memory_barrier_image: - case nir_intrinsic_memory_barrier_shared: - emit_intrinsic_barrier(ctx, intr); - /* note that blk ptr no longer valid, make that obvious: */ - b = NULL; - break; - case nir_intrinsic_store_output: - idx = nir_intrinsic_base(intr); - comp = 
nir_intrinsic_component(intr); - const_offset = nir_src_as_const_value(intr->src[1]); - compile_assert(ctx, const_offset != NULL); - idx += const_offset->u32[0]; - - src = get_src(ctx, &intr->src[0]); - for (int i = 0; i < intr->num_components; i++) { - unsigned n = idx * 4 + i + comp; - ctx->ir->outputs[n] = src[i]; - } - break; - case nir_intrinsic_load_base_vertex: - case nir_intrinsic_load_first_vertex: - if (!ctx->basevertex) { - ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE); - add_sysval_input(ctx, SYSTEM_VALUE_FIRST_VERTEX, ctx->basevertex); - } - dst[0] = ctx->basevertex; - break; - case nir_intrinsic_load_vertex_id_zero_base: - case nir_intrinsic_load_vertex_id: - if (!ctx->vertex_id) { - gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id) ? - SYSTEM_VALUE_VERTEX_ID : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE; - ctx->vertex_id = create_input(ctx, 0); - add_sysval_input(ctx, sv, ctx->vertex_id); - } - dst[0] = ctx->vertex_id; - break; - case nir_intrinsic_load_instance_id: - if (!ctx->instance_id) { - ctx->instance_id = create_input(ctx, 0); - add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, - ctx->instance_id); - } - dst[0] = ctx->instance_id; - break; - case nir_intrinsic_load_sample_id: - case nir_intrinsic_load_sample_id_no_per_sample: - if (!ctx->samp_id) { - ctx->samp_id = create_input(ctx, 0); - ctx->samp_id->regs[0]->flags |= IR3_REG_HALF; - add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID, - ctx->samp_id); - } - dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32); - break; - case nir_intrinsic_load_sample_mask_in: - if (!ctx->samp_mask_in) { - ctx->samp_mask_in = create_input(ctx, 0); - add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, - ctx->samp_mask_in); - } - dst[0] = ctx->samp_mask_in; - break; - case nir_intrinsic_load_user_clip_plane: - idx = nir_intrinsic_ucp_id(intr); - for (int i = 0; i < intr->num_components; i++) { - unsigned n = idx * 4 + i; - dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n); - } - break; - 
case nir_intrinsic_load_front_face: - if (!ctx->frag_face) { - ctx->so->frag_face = true; - ctx->frag_face = create_input(ctx, 0); - add_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, ctx->frag_face); - ctx->frag_face->regs[0]->flags |= IR3_REG_HALF; - } - /* for fragface, we get -1 for back and 0 for front. However this is - * the inverse of what nir expects (where ~0 is true). - */ - dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32); - dst[0] = ir3_NOT_B(b, dst[0], 0); - break; - case nir_intrinsic_load_local_invocation_id: - if (!ctx->local_invocation_id) { - ctx->local_invocation_id = create_input_compmask(ctx, 0, 0x7); - add_sysval_input_compmask(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID, - 0x7, ctx->local_invocation_id); - } - split_dest(b, dst, ctx->local_invocation_id, 0, 3); - break; - case nir_intrinsic_load_work_group_id: - if (!ctx->work_group_id) { - ctx->work_group_id = create_input_compmask(ctx, 0, 0x7); - add_sysval_input_compmask(ctx, SYSTEM_VALUE_WORK_GROUP_ID, - 0x7, ctx->work_group_id); - ctx->work_group_id->regs[0]->flags |= IR3_REG_HIGH; - } - split_dest(b, dst, ctx->work_group_id, 0, 3); - break; - case nir_intrinsic_load_num_work_groups: - for (int i = 0; i < intr->num_components; i++) { - dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i); - } - break; - case nir_intrinsic_load_local_group_size: - for (int i = 0; i < intr->num_components; i++) { - dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i); - } - break; - case nir_intrinsic_discard_if: - case nir_intrinsic_discard: { - struct ir3_instruction *cond, *kill; - - if (intr->intrinsic == nir_intrinsic_discard_if) { - /* conditional discard: */ - src = get_src(ctx, &intr->src[0]); - cond = ir3_b2n(b, src[0]); - } else { - /* unconditional discard: */ - cond = create_immed(b, 1); - } - - /* NOTE: only cmps.*.* can write p0.x: */ - cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0); - cond->cat2.condition = IR3_COND_NE; - - /* condition always goes in predicate 
register: */ - cond->regs[0]->num = regid(REG_P0, 0); - - kill = ir3_KILL(b, cond, 0); - array_insert(ctx->ir, ctx->ir->predicates, kill); - - array_insert(b, b->keeps, kill); - ctx->so->has_kill = true; - - break; - } - default: - compile_error(ctx, "Unhandled intrinsic type: %s\n", - nir_intrinsic_infos[intr->intrinsic].name); - break; - } - - if (info->has_dest) - put_dst(ctx, &intr->dest); -} - -static void -emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr) -{ - struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def, - instr->def.num_components); - type_t type = (instr->def.bit_size < 32) ? TYPE_U16 : TYPE_U32; - - for (int i = 0; i < instr->def.num_components; i++) - dst[i] = create_immed_typed(ctx->block, instr->value.u32[i], type); -} - -static void -emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef) -{ - struct ir3_instruction **dst = get_dst_ssa(ctx, &undef->def, - undef->def.num_components); - type_t type = (undef->def.bit_size < 32) ? TYPE_U16 : TYPE_U32; - - /* backend doesn't want undefined instructions, so just plug - * in 0.0.. - */ - for (int i = 0; i < undef->def.num_components; i++) - dst[i] = create_immed_typed(ctx->block, fui(0.0), type); -} - -/* - * texture fetch/sample instructions: - */ - -static void -tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp) -{ - unsigned coords, flags = 0; - - /* note: would use tex->coord_components.. except txs.. 
also, - * since array index goes after shadow ref, we don't want to - * count it: - */ - switch (tex->sampler_dim) { - case GLSL_SAMPLER_DIM_1D: - case GLSL_SAMPLER_DIM_BUF: - coords = 1; - break; - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_RECT: - case GLSL_SAMPLER_DIM_EXTERNAL: - case GLSL_SAMPLER_DIM_MS: - coords = 2; - break; - case GLSL_SAMPLER_DIM_3D: - case GLSL_SAMPLER_DIM_CUBE: - coords = 3; - flags |= IR3_INSTR_3D; - break; - default: - unreachable("bad sampler_dim"); - } - - if (tex->is_shadow && tex->op != nir_texop_lod) - flags |= IR3_INSTR_S; - - if (tex->is_array && tex->op != nir_texop_lod) - flags |= IR3_INSTR_A; - - *flagsp = flags; - *coordsp = coords; -} - -static void -emit_tex(struct ir3_context *ctx, nir_tex_instr *tex) -{ - struct ir3_block *b = ctx->block; - struct ir3_instruction **dst, *sam, *src0[12], *src1[4]; - struct ir3_instruction * const *coord, * const *off, * const *ddx, * const *ddy; - struct ir3_instruction *lod, *compare, *proj, *sample_index; - bool has_bias = false, has_lod = false, has_proj = false, has_off = false; - unsigned i, coords, flags; - unsigned nsrc0 = 0, nsrc1 = 0; - type_t type; - opc_t opc = 0; - - coord = off = ddx = ddy = NULL; - lod = proj = compare = sample_index = NULL; - - /* TODO: might just be one component for gathers? 
*/ - dst = get_dst(ctx, &tex->dest, 4); - - for (unsigned i = 0; i < tex->num_srcs; i++) { - switch (tex->src[i].src_type) { - case nir_tex_src_coord: - coord = get_src(ctx, &tex->src[i].src); - break; - case nir_tex_src_bias: - lod = get_src(ctx, &tex->src[i].src)[0]; - has_bias = true; - break; - case nir_tex_src_lod: - lod = get_src(ctx, &tex->src[i].src)[0]; - has_lod = true; - break; - case nir_tex_src_comparator: /* shadow comparator */ - compare = get_src(ctx, &tex->src[i].src)[0]; - break; - case nir_tex_src_projector: - proj = get_src(ctx, &tex->src[i].src)[0]; - has_proj = true; - break; - case nir_tex_src_offset: - off = get_src(ctx, &tex->src[i].src); - has_off = true; - break; - case nir_tex_src_ddx: - ddx = get_src(ctx, &tex->src[i].src); - break; - case nir_tex_src_ddy: - ddy = get_src(ctx, &tex->src[i].src); - break; - case nir_tex_src_ms_index: - sample_index = get_src(ctx, &tex->src[i].src)[0]; - break; - default: - compile_error(ctx, "Unhandled NIR tex src type: %d\n", - tex->src[i].src_type); - return; - } - } - - switch (tex->op) { - case nir_texop_tex: opc = has_lod ? OPC_SAML : OPC_SAM; break; - case nir_texop_txb: opc = OPC_SAMB; break; - case nir_texop_txl: opc = OPC_SAML; break; - case nir_texop_txd: opc = OPC_SAMGQ; break; - case nir_texop_txf: opc = OPC_ISAML; break; - case nir_texop_lod: opc = OPC_GETLOD; break; - case nir_texop_tg4: - /* NOTE: a4xx might need to emulate gather w/ txf (this is - * what blob does, seems gather is broken?), and a3xx did - * not support it (but probably could also emulate). 
- */ - switch (tex->component) { - case 0: opc = OPC_GATHER4R; break; - case 1: opc = OPC_GATHER4G; break; - case 2: opc = OPC_GATHER4B; break; - case 3: opc = OPC_GATHER4A; break; - } - break; - case nir_texop_txf_ms: opc = OPC_ISAMM; break; - case nir_texop_txs: - case nir_texop_query_levels: - case nir_texop_texture_samples: - case nir_texop_samples_identical: - case nir_texop_txf_ms_mcs: - compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op); - return; - } - - tex_info(tex, &flags, &coords); - - /* - * lay out the first argument in the proper order: - * - actual coordinates first - * - shadow reference - * - array index - * - projection w - * - starting at offset 4, dpdx.xy, dpdy.xy - * - * bias/lod go into the second arg - */ - - /* insert tex coords: */ - for (i = 0; i < coords; i++) - src0[i] = coord[i]; - - nsrc0 = i; - - /* NOTE a3xx (and possibly a4xx?) might be different, using isaml - * with scaled x coord according to requested sample: - */ - if (tex->op == nir_texop_txf_ms) { - if (ctx->compiler->txf_ms_with_isaml) { - /* the samples are laid out in x dimension as - * 0 1 2 3 - * x_ms = (x << ms) + sample_index; - */ - struct ir3_instruction *ms; - ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3); - - src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0); - src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0); - - opc = OPC_ISAML; - } else { - src0[nsrc0++] = sample_index; - } - } - - /* scale up integer coords for TXF based on the LOD */ - if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) { - assert(has_lod); - for (i = 0; i < coords; i++) - src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0); - } - - if (coords == 1) { - /* hw doesn't do 1d, so we treat it as 2d with - * height of 1, and patch up the y coord. - * TODO: y coord should be (int)0 in some cases.. 
- */ - src0[nsrc0++] = create_immed(b, fui(0.5)); - } - - if (tex->is_shadow && tex->op != nir_texop_lod) - src0[nsrc0++] = compare; - - if (tex->is_array && tex->op != nir_texop_lod) { - struct ir3_instruction *idx = coord[coords]; - - /* the array coord for cube arrays needs 0.5 added to it */ - if (ctx->compiler->array_index_add_half && (opc != OPC_ISAML)) - idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0); - - src0[nsrc0++] = idx; - } - - if (has_proj) { - src0[nsrc0++] = proj; - flags |= IR3_INSTR_P; - } - - /* pad to 4, then ddx/ddy: */ - if (tex->op == nir_texop_txd) { - while (nsrc0 < 4) - src0[nsrc0++] = create_immed(b, fui(0.0)); - for (i = 0; i < coords; i++) - src0[nsrc0++] = ddx[i]; - if (coords < 2) - src0[nsrc0++] = create_immed(b, fui(0.0)); - for (i = 0; i < coords; i++) - src0[nsrc0++] = ddy[i]; - if (coords < 2) - src0[nsrc0++] = create_immed(b, fui(0.0)); - } - - /* - * second argument (if applicable): - * - offsets - * - lod - * - bias - */ - if (has_off | has_lod | has_bias) { - if (has_off) { - unsigned off_coords = coords; - if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) - off_coords--; - for (i = 0; i < off_coords; i++) - src1[nsrc1++] = off[i]; - if (off_coords < 2) - src1[nsrc1++] = create_immed(b, fui(0.0)); - flags |= IR3_INSTR_O; - } - - if (has_lod | has_bias) - src1[nsrc1++] = lod; - } - - switch (tex->dest_type) { - case nir_type_invalid: - case nir_type_float: - type = TYPE_F32; - break; - case nir_type_int: - type = TYPE_S32; - break; - case nir_type_uint: - case nir_type_bool: - type = TYPE_U32; - break; - default: - unreachable("bad dest_type"); - } - - if (opc == OPC_GETLOD) - type = TYPE_U32; - - unsigned tex_idx = tex->texture_index; - - ctx->max_texture_index = MAX2(ctx->max_texture_index, tex_idx); - - struct ir3_instruction *col0 = create_collect(ctx, src0, nsrc0); - struct ir3_instruction *col1 = create_collect(ctx, src1, nsrc1); - - sam = ir3_SAM(b, opc, type, 0b1111, flags, - tex_idx, tex_idx, col0, col1); - - 
if ((ctx->astc_srgb & (1 << tex_idx)) && !nir_tex_instr_is_query(tex)) { - /* only need first 3 components: */ - sam->regs[0]->wrmask = 0x7; - split_dest(b, dst, sam, 0, 3); - - /* we need to sample the alpha separately with a non-ASTC - * texture state: - */ - sam = ir3_SAM(b, opc, type, 0b1000, flags, - tex_idx, tex_idx, col0, col1); - - array_insert(ctx->ir, ctx->ir->astc_srgb, sam); - - /* fixup .w component: */ - split_dest(b, &dst[3], sam, 3, 1); - } else { - /* normal (non-workaround) case: */ - split_dest(b, dst, sam, 0, 4); - } - - /* GETLOD returns results in 4.8 fixed point */ - if (opc == OPC_GETLOD) { - struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256)); - - compile_assert(ctx, tex->dest_type == nir_type_float); - for (i = 0; i < 2; i++) { - dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_U32, TYPE_F32), 0, - factor, 0); - } - } - - put_dst(ctx, &tex->dest); -} - -static void -emit_tex_query_levels(struct ir3_context *ctx, nir_tex_instr *tex) -{ - struct ir3_block *b = ctx->block; - struct ir3_instruction **dst, *sam; - - dst = get_dst(ctx, &tex->dest, 1); - - sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, 0b0100, 0, - tex->texture_index, tex->texture_index, NULL, NULL); - - /* even though there is only one component, since it ends - * up in .z rather than .x, we need a split_dest() - */ - split_dest(b, dst, sam, 0, 3); - - /* The # of levels comes from getinfo.z. We need to add 1 to it, since - * the value in TEX_CONST_0 is zero-based. - */ - if (ctx->compiler->levels_add_one) - dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0); - - put_dst(ctx, &tex->dest); -} - -static void -emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex) -{ - struct ir3_block *b = ctx->block; - struct ir3_instruction **dst, *sam; - struct ir3_instruction *lod; - unsigned flags, coords; - - tex_info(tex, &flags, &coords); - - /* Actually we want the number of dimensions, not coordinates. This - * distinction only matters for cubes. 
- */ - if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) - coords = 2; - - dst = get_dst(ctx, &tex->dest, 4); - - compile_assert(ctx, tex->num_srcs == 1); - compile_assert(ctx, tex->src[0].src_type == nir_tex_src_lod); - - lod = get_src(ctx, &tex->src[0].src)[0]; - - sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, 0b1111, flags, - tex->texture_index, tex->texture_index, lod, NULL); - - split_dest(b, dst, sam, 0, 4); - - /* Array size actually ends up in .w rather than .z. This doesn't - * matter for miplevel 0, but for higher mips the value in z is - * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is - * returned, which means that we have to add 1 to it for arrays. - */ - if (tex->is_array) { - if (ctx->compiler->levels_add_one) { - dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0); - } else { - dst[coords] = ir3_MOV(b, dst[3], TYPE_U32); - } - } - - put_dst(ctx, &tex->dest); -} - -static void -emit_jump(struct ir3_context *ctx, nir_jump_instr *jump) -{ - switch (jump->type) { - case nir_jump_break: - case nir_jump_continue: - case nir_jump_return: - /* I *think* we can simply just ignore this, and use the - * successor block link to figure out where we need to - * jump to for break/continue - */ - break; - default: - compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type); - break; - } -} - -static void -emit_instr(struct ir3_context *ctx, nir_instr *instr) -{ - switch (instr->type) { - case nir_instr_type_alu: - emit_alu(ctx, nir_instr_as_alu(instr)); - break; - case nir_instr_type_deref: - /* ignored, handled as part of the intrinsic they are src to */ - break; - case nir_instr_type_intrinsic: - emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); - break; - case nir_instr_type_load_const: - emit_load_const(ctx, nir_instr_as_load_const(instr)); - break; - case nir_instr_type_ssa_undef: - emit_undef(ctx, nir_instr_as_ssa_undef(instr)); - break; - case nir_instr_type_tex: { - nir_tex_instr *tex = nir_instr_as_tex(instr); - /* couple tex 
instructions get special-cased: - */ - switch (tex->op) { - case nir_texop_txs: - emit_tex_txs(ctx, tex); - break; - case nir_texop_query_levels: - emit_tex_query_levels(ctx, tex); - break; - default: - emit_tex(ctx, tex); - break; - } - break; - } - case nir_instr_type_jump: - emit_jump(ctx, nir_instr_as_jump(instr)); - break; - case nir_instr_type_phi: - /* we have converted phi webs to regs in NIR by now */ - compile_error(ctx, "Unexpected NIR instruction type: %d\n", instr->type); - break; - case nir_instr_type_call: - case nir_instr_type_parallel_copy: - compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type); - break; - } -} - -static struct ir3_block * -get_block(struct ir3_context *ctx, const nir_block *nblock) -{ - struct ir3_block *block; - struct hash_entry *hentry; - unsigned i; - - hentry = _mesa_hash_table_search(ctx->block_ht, nblock); - if (hentry) - return hentry->data; - - block = ir3_block_create(ctx->ir); - block->nblock = nblock; - _mesa_hash_table_insert(ctx->block_ht, nblock, block); - - block->predecessors_count = nblock->predecessors->entries; - block->predecessors = ralloc_array_size(block, - sizeof(block->predecessors[0]), block->predecessors_count); - i = 0; - set_foreach(nblock->predecessors, sentry) { - block->predecessors[i++] = get_block(ctx, sentry->key); - } - - return block; -} - -static void -emit_block(struct ir3_context *ctx, nir_block *nblock) -{ - struct ir3_block *block = get_block(ctx, nblock); - - for (int i = 0; i < ARRAY_SIZE(block->successors); i++) { - if (nblock->successors[i]) { - block->successors[i] = - get_block(ctx, nblock->successors[i]); - } - } - - ctx->block = block; - list_addtail(&block->node, &ctx->ir->block_list); - - /* re-emit addr register in each block if needed: */ - for (int i = 0; i < ARRAY_SIZE(ctx->addr_ht); i++) { - _mesa_hash_table_destroy(ctx->addr_ht[i], NULL); - ctx->addr_ht[i] = NULL; - } - - nir_foreach_instr(instr, nblock) { - ctx->cur_instr = instr; - emit_instr(ctx, 
instr); - ctx->cur_instr = NULL; - if (ctx->error) - return; - } -} - -static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list); - -static void -emit_if(struct ir3_context *ctx, nir_if *nif) -{ - struct ir3_instruction *condition = get_src(ctx, &nif->condition)[0]; - - ctx->block->condition = - get_predicate(ctx, ir3_b2n(condition->block, condition)); - - emit_cf_list(ctx, &nif->then_list); - emit_cf_list(ctx, &nif->else_list); -} - -static void -emit_loop(struct ir3_context *ctx, nir_loop *nloop) -{ - emit_cf_list(ctx, &nloop->body); -} - -static void -emit_cf_list(struct ir3_context *ctx, struct exec_list *list) -{ - foreach_list_typed(nir_cf_node, node, node, list) { - switch (node->type) { - case nir_cf_node_block: - emit_block(ctx, nir_cf_node_as_block(node)); - break; - case nir_cf_node_if: - emit_if(ctx, nir_cf_node_as_if(node)); - break; - case nir_cf_node_loop: - emit_loop(ctx, nir_cf_node_as_loop(node)); - break; - case nir_cf_node_function: - compile_error(ctx, "TODO\n"); - break; - } - } -} - -/* emit stream-out code. At this point, the current block is the original - * (nir) end block, and nir ensures that all flow control paths terminate - * into the end block. We re-purpose the original end block to generate - * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional - * block holding stream-out write instructions, followed by the new end - * block: - * - * blockOrigEnd { - * p0.x = (vtxcnt < maxvtxcnt) - * // succs: blockStreamOut, blockNewEnd - * } - * blockStreamOut { - * ... stream-out instructions ... 
- * // succs: blockNewEnd - * } - * blockNewEnd { - * } - */ -static void -emit_stream_out(struct ir3_context *ctx) -{ - struct ir3_shader_variant *v = ctx->so; - struct ir3 *ir = ctx->ir; - struct ir3_stream_output_info *strmout = - &ctx->so->shader->stream_output; - struct ir3_block *orig_end_block, *stream_out_block, *new_end_block; - struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond; - struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS]; - - /* create vtxcnt input in input block at top of shader, - * so that it is seen as live over the entire duration - * of the shader: - */ - vtxcnt = create_input(ctx, 0); - add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, vtxcnt); - - maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX); - - /* at this point, we are at the original 'end' block, - * re-purpose this block to stream-out condition, then - * append stream-out block and new-end block - */ - orig_end_block = ctx->block; - -// TODO these blocks need to update predecessors.. -// maybe w/ store_global intrinsic, we could do this -// stuff in nir->nir pass - - stream_out_block = ir3_block_create(ir); - list_addtail(&stream_out_block->node, &ir->block_list); - - new_end_block = ir3_block_create(ir); - list_addtail(&new_end_block->node, &ir->block_list); - - orig_end_block->successors[0] = stream_out_block; - orig_end_block->successors[1] = new_end_block; - stream_out_block->successors[0] = new_end_block; - - /* setup 'if (vtxcnt < maxvtxcnt)' condition: */ - cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0); - cond->regs[0]->num = regid(REG_P0, 0); - cond->cat2.condition = IR3_COND_LT; - - /* condition goes on previous block to the conditional, - * since it is used to pick which of the two successor - * paths to take: - */ - orig_end_block->condition = cond; - - /* switch to stream_out_block to generate the stream-out - * instructions: - */ - ctx->block = stream_out_block; - - /* Calculate base addresses based on vtxcnt. 
Instructions - * generated for bases not used in following loop will be - * stripped out in the backend. - */ - for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) { - unsigned stride = strmout->stride[i]; - struct ir3_instruction *base, *off; - - base = create_uniform(ctx, regid(v->constbase.tfbo, i)); - - /* 24-bit should be enough: */ - off = ir3_MUL_U(ctx->block, vtxcnt, 0, - create_immed(ctx->block, stride * 4), 0); - - bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0); - } - - /* Generate the per-output store instructions: */ - for (unsigned i = 0; i < strmout->num_outputs; i++) { - for (unsigned j = 0; j < strmout->output[i].num_components; j++) { - unsigned c = j + strmout->output[i].start_component; - struct ir3_instruction *base, *out, *stg; - - base = bases[strmout->output[i].output_buffer]; - out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)]; - - stg = ir3_STG(ctx->block, base, 0, out, 0, - create_immed(ctx->block, 1), 0); - stg->cat6.type = TYPE_U32; - stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4; - - array_insert(ctx->block, ctx->block->keeps, stg); - } - } - - /* and finally switch to the new_end_block: */ - ctx->block = new_end_block; -} - -static void -emit_function(struct ir3_context *ctx, nir_function_impl *impl) -{ - nir_metadata_require(impl, nir_metadata_block_index); - - emit_cf_list(ctx, &impl->body); - emit_block(ctx, impl->end_block); - - /* at this point, we should have a single empty block, - * into which we emit the 'end' instruction. - */ - compile_assert(ctx, list_empty(&ctx->block->instr_list)); - - /* If stream-out (aka transform-feedback) enabled, emit the - * stream-out instructions, followed by a new empty block (into - * which the 'end' instruction lands). - * - * NOTE: it is done in this order, rather than inserting before - * we emit end_block, because NIR guarantees that all blocks - * flow into end_block, and that end_block has no successors. 
- * So by re-purposing end_block as the first block of stream- - * out, we guarantee that all exit paths flow into the stream- - * out instructions. - */ - if ((ctx->compiler->gpu_id < 500) && - (ctx->so->shader->stream_output.num_outputs > 0) && - !ctx->so->binning_pass) { - debug_assert(ctx->so->type == MESA_SHADER_VERTEX); - emit_stream_out(ctx); - } - - ir3_END(ctx->block); -} - -static struct ir3_instruction * -create_frag_coord(struct ir3_context *ctx, unsigned comp) -{ - struct ir3_block *block = ctx->block; - struct ir3_instruction *instr; - - if (!ctx->frag_coord) { - ctx->frag_coord = create_input_compmask(ctx, 0, 0xf); - /* defer add_sysval_input() until after all inputs created */ - } - - split_dest(block, &instr, ctx->frag_coord, comp, 1); - - switch (comp) { - case 0: /* .x */ - case 1: /* .y */ - /* for frag_coord, we get unsigned values.. we need - * to subtract (integer) 8 and divide by 16 (right- - * shift by 4) then convert to float: - * - * sub.s tmp, src, 8 - * shr.b tmp, tmp, 4 - * mov.u32f32 dst, tmp - * - */ - instr = ir3_SUB_S(block, instr, 0, - create_immed(block, 8), 0); - instr = ir3_SHR_B(block, instr, 0, - create_immed(block, 4), 0); - instr = ir3_COV(block, instr, TYPE_U32, TYPE_F32); - - return instr; - case 2: /* .z */ - case 3: /* .w */ - default: - /* seems that we can use these as-is: */ - return instr; - } -} - -static void -setup_input(struct ir3_context *ctx, nir_variable *in) -{ - struct ir3_shader_variant *so = ctx->so; - unsigned ncomp = glsl_get_components(in->type); - unsigned n = in->data.driver_location; - unsigned slot = in->data.location; - - /* let's pretend things other than vec4 don't exist: */ - ncomp = MAX2(ncomp, 4); - - /* skip unread inputs, we could end up with (for example), unsplit - * matrix/etc inputs in the case they are not read, so just silently - * skip these. 
- */ - if (ncomp > 4) - return; - - compile_assert(ctx, ncomp == 4); - - so->inputs[n].slot = slot; - so->inputs[n].compmask = (1 << ncomp) - 1; - so->inputs_count = MAX2(so->inputs_count, n + 1); - so->inputs[n].interpolate = in->data.interpolation; - - if (ctx->so->type == MESA_SHADER_FRAGMENT) { - for (int i = 0; i < ncomp; i++) { - struct ir3_instruction *instr = NULL; - unsigned idx = (n * 4) + i; - - if (slot == VARYING_SLOT_POS) { - so->inputs[n].bary = false; - so->frag_coord = true; - instr = create_frag_coord(ctx, i); - } else if (slot == VARYING_SLOT_PNTC) { - /* see for example st_nir_fixup_varying_slots().. this is - * maybe a bit mesa/st specific. But we need things to line - * up for this in fdN_program: - * unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0); - * if (emit->sprite_coord_enable & texmask) { - * ... - * } - */ - so->inputs[n].slot = VARYING_SLOT_VAR8; - so->inputs[n].bary = true; - instr = create_frag_input(ctx, false); - } else { - bool use_ldlv = false; - - /* detect the special case for front/back colors where - * we need to do flat vs smooth shading depending on - * rast state: - */ - if (in->data.interpolation == INTERP_MODE_NONE) { - switch (slot) { - case VARYING_SLOT_COL0: - case VARYING_SLOT_COL1: - case VARYING_SLOT_BFC0: - case VARYING_SLOT_BFC1: - so->inputs[n].rasterflat = true; - break; - default: - break; - } - } - - if (ctx->compiler->flat_bypass) { - if ((so->inputs[n].interpolate == INTERP_MODE_FLAT) || - (so->inputs[n].rasterflat && ctx->so->key.rasterflat)) - use_ldlv = true; - } - - so->inputs[n].bary = true; - - instr = create_frag_input(ctx, use_ldlv); - } - - compile_assert(ctx, idx < ctx->ir->ninputs); - - ctx->ir->inputs[idx] = instr; - } - } else if (ctx->so->type == MESA_SHADER_VERTEX) { - for (int i = 0; i < ncomp; i++) { - unsigned idx = (n * 4) + i; - compile_assert(ctx, idx < ctx->ir->ninputs); - ctx->ir->inputs[idx] = create_input(ctx, idx); - } - } else { - compile_error(ctx, "unknown shader type: 
%d\n", ctx->so->type); - } - - if (so->inputs[n].bary || (ctx->so->type == MESA_SHADER_VERTEX)) { - so->total_in += ncomp; - } -} - -static void -setup_output(struct ir3_context *ctx, nir_variable *out) -{ - struct ir3_shader_variant *so = ctx->so; - unsigned ncomp = glsl_get_components(out->type); - unsigned n = out->data.driver_location; - unsigned slot = out->data.location; - unsigned comp = 0; - - /* let's pretend things other than vec4 don't exist: */ - ncomp = MAX2(ncomp, 4); - compile_assert(ctx, ncomp == 4); - - if (ctx->so->type == MESA_SHADER_FRAGMENT) { - switch (slot) { - case FRAG_RESULT_DEPTH: - comp = 2; /* tgsi will write to .z component */ - so->writes_pos = true; - break; - case FRAG_RESULT_COLOR: - so->color0_mrt = 1; - break; - default: - if (slot >= FRAG_RESULT_DATA0) - break; - compile_error(ctx, "unknown FS output name: %s\n", - gl_frag_result_name(slot)); - } - } else if (ctx->so->type == MESA_SHADER_VERTEX) { - switch (slot) { - case VARYING_SLOT_POS: - so->writes_pos = true; - break; - case VARYING_SLOT_PSIZ: - so->writes_psize = true; - break; - case VARYING_SLOT_COL0: - case VARYING_SLOT_COL1: - case VARYING_SLOT_BFC0: - case VARYING_SLOT_BFC1: - case VARYING_SLOT_FOGC: - case VARYING_SLOT_CLIP_DIST0: - case VARYING_SLOT_CLIP_DIST1: - case VARYING_SLOT_CLIP_VERTEX: - break; - default: - if (slot >= VARYING_SLOT_VAR0) - break; - if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7)) - break; - compile_error(ctx, "unknown VS output name: %s\n", - gl_varying_slot_name(slot)); - } - } else { - compile_error(ctx, "unknown shader type: %d\n", ctx->so->type); - } - - compile_assert(ctx, n < ARRAY_SIZE(so->outputs)); - - so->outputs[n].slot = slot; - so->outputs[n].regid = regid(n, comp); - so->outputs_count = MAX2(so->outputs_count, n + 1); - - for (int i = 0; i < ncomp; i++) { - unsigned idx = (n * 4) + i; - compile_assert(ctx, idx < ctx->ir->noutputs); - ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0)); - } -} - -static int 
-max_drvloc(struct exec_list *vars) -{ - int drvloc = -1; - nir_foreach_variable(var, vars) { - drvloc = MAX2(drvloc, (int)var->data.driver_location); - } - return drvloc; -} - -static const unsigned max_sysvals[] = { - [MESA_SHADER_FRAGMENT] = 24, // TODO - [MESA_SHADER_VERTEX] = 16, - [MESA_SHADER_COMPUTE] = 16, // TODO how many do we actually need? -}; - -static void -emit_instructions(struct ir3_context *ctx) -{ - unsigned ninputs, noutputs; - nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s); - - ninputs = (max_drvloc(&ctx->s->inputs) + 1) * 4; - noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4; - - /* we need to leave room for sysvals: - */ - ninputs += max_sysvals[ctx->so->type]; - - ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs); - - /* Create inputs in first block: */ - ctx->block = get_block(ctx, nir_start_block(fxn)); - ctx->in_block = ctx->block; - list_addtail(&ctx->block->node, &ctx->ir->block_list); - - ninputs -= max_sysvals[ctx->so->type]; - - /* for fragment shader, the vcoord input register is used as the - * base for bary.f varying fetch instrs: - */ - struct ir3_instruction *vcoord = NULL; - if (ctx->so->type == MESA_SHADER_FRAGMENT) { - struct ir3_instruction *xy[2]; - - vcoord = create_input_compmask(ctx, 0, 0x3); - split_dest(ctx->block, xy, vcoord, 0, 2); - - ctx->frag_vcoord = create_collect(ctx, xy, 2); - } - - /* Setup inputs: */ - nir_foreach_variable(var, &ctx->s->inputs) { - setup_input(ctx, var); - } - - /* Defer add_sysval_input() stuff until after setup_inputs(), - * because sysvals need to be appended after varyings: - */ - if (vcoord) { - add_sysval_input_compmask(ctx, SYSTEM_VALUE_VARYING_COORD, - 0x3, vcoord); - } - - if (ctx->frag_coord) { - add_sysval_input_compmask(ctx, SYSTEM_VALUE_FRAG_COORD, - 0xf, ctx->frag_coord); - } - - /* Setup outputs: */ - nir_foreach_variable(var, &ctx->s->outputs) { - setup_output(ctx, var); - } - - /* Setup registers (which should only be arrays): */ - nir_foreach_register(reg, 
&ctx->s->registers) { - declare_array(ctx, reg); - } - - /* NOTE: need to do something more clever when we support >1 fxn */ - nir_foreach_register(reg, &fxn->registers) { - declare_array(ctx, reg); - } - /* And emit the body: */ - ctx->impl = fxn; - emit_function(ctx, fxn); -} - -/* from NIR perspective, we actually have varying inputs. But the varying - * inputs, from an IR standpoint, are just bary.f/ldlv instructions. The - * only actual inputs are the sysvals. - */ -static void -fixup_frag_inputs(struct ir3_context *ctx) -{ - struct ir3_shader_variant *so = ctx->so; - struct ir3 *ir = ctx->ir; - unsigned i = 0; - - /* sysvals should appear at the end of the inputs, drop everything else: */ - while ((i < so->inputs_count) && !so->inputs[i].sysval) - i++; - - /* at IR level, inputs are always blocks of 4 scalars: */ - i *= 4; - - ir->inputs = &ir->inputs[i]; - ir->ninputs -= i; -} - -/* Fixup tex sampler state for astc/srgb workaround instructions. We - * need to assign the tex state indexes for these after we know the - * max tex index. - */ -static void -fixup_astc_srgb(struct ir3_context *ctx) -{ - struct ir3_shader_variant *so = ctx->so; - /* indexed by original tex idx, value is newly assigned alpha sampler - * state tex idx. Zero is invalid since there is at least one sampler - * if we get here. 
- */ - unsigned alt_tex_state[16] = {0}; - unsigned tex_idx = ctx->max_texture_index + 1; - unsigned idx = 0; - - so->astc_srgb.base = tex_idx; - - for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) { - struct ir3_instruction *sam = ctx->ir->astc_srgb[i]; - - compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state)); - - if (alt_tex_state[sam->cat5.tex] == 0) { - /* assign new alternate/alpha tex state slot: */ - alt_tex_state[sam->cat5.tex] = tex_idx++; - so->astc_srgb.orig_idx[idx++] = sam->cat5.tex; - so->astc_srgb.count++; - } - - sam->cat5.tex = alt_tex_state[sam->cat5.tex]; - } -} - -static void -fixup_binning_pass(struct ir3_context *ctx) -{ - struct ir3_shader_variant *so = ctx->so; - struct ir3 *ir = ctx->ir; - unsigned i, j; - - for (i = 0, j = 0; i < so->outputs_count; i++) { - unsigned slot = so->outputs[i].slot; - - /* throw away everything but first position/psize */ - if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) { - if (i != j) { - so->outputs[j] = so->outputs[i]; - ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0]; - ir->outputs[(j*4)+1] = ir->outputs[(i*4)+1]; - ir->outputs[(j*4)+2] = ir->outputs[(i*4)+2]; - ir->outputs[(j*4)+3] = ir->outputs[(i*4)+3]; - } - j++; - } - } - so->outputs_count = j; - ir->noutputs = j * 4; -} - -int -ir3_compile_shader_nir(struct ir3_compiler *compiler, - struct ir3_shader_variant *so) -{ - struct ir3_context *ctx; - struct ir3 *ir; - struct ir3_instruction **inputs; - unsigned i, actual_in, inloc; - int ret = 0, max_bary; - - assert(!so->ir); - - ctx = compile_init(compiler, so); - if (!ctx) { - DBG("INIT failed!"); - ret = -1; - goto out; - } - - emit_instructions(ctx); - - if (ctx->error) { - DBG("EMIT failed!"); - ret = -1; - goto out; - } - - ir = so->ir = ctx->ir; - - /* keep track of the inputs from TGSI perspective.. 
*/ - inputs = ir->inputs; - - /* but fixup actual inputs for frag shader: */ - if (so->type == MESA_SHADER_FRAGMENT) - fixup_frag_inputs(ctx); - - /* at this point, for binning pass, throw away unneeded outputs: */ - if (so->binning_pass && (ctx->compiler->gpu_id < 600)) - fixup_binning_pass(ctx); - - /* if we want half-precision outputs, mark the output registers - * as half: - */ - if (so->key.half_precision) { - for (i = 0; i < ir->noutputs; i++) { - struct ir3_instruction *out = ir->outputs[i]; - - if (!out) - continue; - - /* if frag shader writes z, that needs to be full precision: */ - if (so->outputs[i/4].slot == FRAG_RESULT_DEPTH) - continue; - - out->regs[0]->flags |= IR3_REG_HALF; - /* output could be a fanout (ie. texture fetch output) - * in which case we need to propagate the half-reg flag - * up to the definer so that RA sees it: - */ - if (out->opc == OPC_META_FO) { - out = out->regs[1]->instr; - out->regs[0]->flags |= IR3_REG_HALF; - } - - if (out->opc == OPC_MOV) { - out->cat1.dst_type = half_type(out->cat1.dst_type); - } - } - } - - if (ir3_shader_debug & IR3_DBG_OPTMSGS) { - printf("BEFORE CP:\n"); - ir3_print(ir); - } - - ir3_cp(ir, so); - - /* at this point, for binning pass, throw away unneeded outputs: - * Note that for a6xx and later, we do this after ir3_cp to ensure - * that the uniform/constant layout for BS and VS matches, so that - * we can re-use same VS_CONST state group. - */ - if (so->binning_pass && (ctx->compiler->gpu_id >= 600)) - fixup_binning_pass(ctx); - - /* Insert mov if there's same instruction for each output. - * eg. 
dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_expression.vertex.sampler2dshadow - */ - for (int i = ir->noutputs - 1; i >= 0; i--) { - if (!ir->outputs[i]) - continue; - for (unsigned j = 0; j < i; j++) { - if (ir->outputs[i] == ir->outputs[j]) { - ir->outputs[i] = - ir3_MOV(ir->outputs[i]->block, ir->outputs[i], TYPE_F32); - } - } - } - - if (ir3_shader_debug & IR3_DBG_OPTMSGS) { - printf("BEFORE GROUPING:\n"); - ir3_print(ir); - } - - ir3_sched_add_deps(ir); - - /* Group left/right neighbors, inserting mov's where needed to - * solve conflicts: - */ - ir3_group(ir); - - if (ir3_shader_debug & IR3_DBG_OPTMSGS) { - printf("AFTER GROUPING:\n"); - ir3_print(ir); - } - - ir3_depth(ir); - - if (ir3_shader_debug & IR3_DBG_OPTMSGS) { - printf("AFTER DEPTH:\n"); - ir3_print(ir); - } - - ret = ir3_sched(ir); - if (ret) { - DBG("SCHED failed!"); - goto out; - } - - if (ir3_shader_debug & IR3_DBG_OPTMSGS) { - printf("AFTER SCHED:\n"); - ir3_print(ir); - } - - ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face); - if (ret) { - DBG("RA failed!"); - goto out; - } - - if (ir3_shader_debug & IR3_DBG_OPTMSGS) { - printf("AFTER RA:\n"); - ir3_print(ir); - } - - /* fixup input/outputs: */ - for (i = 0; i < so->outputs_count; i++) { - so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num; - } - - /* Note that some or all channels of an input may be unused: */ - actual_in = 0; - inloc = 0; - for (i = 0; i < so->inputs_count; i++) { - unsigned j, reg = regid(63,0), compmask = 0, maxcomp = 0; - so->inputs[i].ncomp = 0; - so->inputs[i].inloc = inloc; - for (j = 0; j < 4; j++) { - struct ir3_instruction *in = inputs[(i*4) + j]; - if (in && !(in->flags & IR3_INSTR_UNUSED)) { - compmask |= (1 << j); - reg = in->regs[0]->num - j; - actual_in++; - so->inputs[i].ncomp++; - if ((so->type == MESA_SHADER_FRAGMENT) && so->inputs[i].bary) { - /* assign inloc: */ - assert(in->regs[1]->flags & IR3_REG_IMMED); - in->regs[1]->iim_val = inloc + j; - maxcomp = j + 1; - } - } - } 
- if ((so->type == MESA_SHADER_FRAGMENT) && compmask && so->inputs[i].bary) { - so->varying_in++; - so->inputs[i].compmask = (1 << maxcomp) - 1; - inloc += maxcomp; - } else if (!so->inputs[i].sysval) { - so->inputs[i].compmask = compmask; - } - so->inputs[i].regid = reg; - } - - if (ctx->astc_srgb) - fixup_astc_srgb(ctx); - - /* We need to do legalize after (for frag shader's) the "bary.f" - * offsets (inloc) have been assigned. - */ - ir3_legalize(ir, &so->num_samp, &so->has_ssbo, &max_bary); - - if (ir3_shader_debug & IR3_DBG_OPTMSGS) { - printf("AFTER LEGALIZE:\n"); - ir3_print(ir); - } - - /* Note that actual_in counts inputs that are not bary.f'd for FS: */ - if (so->type == MESA_SHADER_VERTEX) - so->total_in = actual_in; - else - so->total_in = max_bary + 1; - -out: - if (ret) { - if (so->ir) - ir3_destroy(so->ir); - so->ir = NULL; - } - compile_free(ctx); - - return ret; -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c deleted file mode 100644 index e8e8cc311e3..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ /dev/null @@ -1,653 +0,0 @@ -/* - * Copyright (C) 2014 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark <[email protected]> - */ - -#include <math.h> - -#include "ir3.h" -#include "ir3_shader.h" - -/* - * Copy Propagate: - */ - -struct ir3_cp_ctx { - struct ir3 *shader; - struct ir3_shader_variant *so; - unsigned immediate_idx; -}; - -/* is it a type preserving mov, with ok flags? */ -static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags) -{ - if (is_same_type_mov(instr)) { - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src = instr->regs[1]; - struct ir3_instruction *src_instr = ssa(src); - - /* only if mov src is SSA (not const/immed): */ - if (!src_instr) - return false; - - /* no indirect: */ - if (dst->flags & IR3_REG_RELATIV) - return false; - if (src->flags & IR3_REG_RELATIV) - return false; - - if (src->flags & IR3_REG_ARRAY) - return false; - - if (!allow_flags) - if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG | - IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT)) - return false; - - /* TODO: remove this hack: */ - if (src_instr->opc == OPC_META_FO) - return false; - - return true; - } - return false; -} - -static unsigned cp_flags(unsigned flags) -{ - /* only considering these flags (at least for now): */ - flags &= (IR3_REG_CONST | IR3_REG_IMMED | - IR3_REG_FNEG | IR3_REG_FABS | - IR3_REG_SNEG | IR3_REG_SABS | - IR3_REG_BNOT | IR3_REG_RELATIV); - return flags; -} - -static bool valid_flags(struct ir3_instruction *instr, unsigned n, - unsigned flags) -{ - unsigned valid_flags; - flags = cp_flags(flags); - - 
/* If destination is indirect, then source cannot be.. at least - * I don't think so.. - */ - if ((instr->regs[0]->flags & IR3_REG_RELATIV) && - (flags & IR3_REG_RELATIV)) - return false; - - /* TODO it seems to *mostly* work to cp RELATIV, except we get some - * intermittent piglit variable-indexing fails. Newer blob driver - * doesn't seem to cp these. Possibly this is hw workaround? Not - * sure, but until that is understood better, lets just switch off - * cp for indirect src's: - */ - if (flags & IR3_REG_RELATIV) - return false; - - switch (opc_cat(instr->opc)) { - case 1: - valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV; - if (flags & ~valid_flags) - return false; - break; - case 2: - valid_flags = ir3_cat2_absneg(instr->opc) | - IR3_REG_CONST | IR3_REG_RELATIV; - - if (ir3_cat2_int(instr->opc)) - valid_flags |= IR3_REG_IMMED; - - if (flags & ~valid_flags) - return false; - - if (flags & (IR3_REG_CONST | IR3_REG_IMMED)) { - unsigned m = (n ^ 1) + 1; - /* cannot deal w/ const in both srcs: - * (note that some cat2 actually only have a single src) - */ - if (m < instr->regs_count) { - struct ir3_register *reg = instr->regs[m]; - if ((flags & IR3_REG_CONST) && (reg->flags & IR3_REG_CONST)) - return false; - if ((flags & IR3_REG_IMMED) && (reg->flags & IR3_REG_IMMED)) - return false; - } - /* cannot be const + ABS|NEG: */ - if (flags & (IR3_REG_FABS | IR3_REG_FNEG | - IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT)) - return false; - } - break; - case 3: - valid_flags = ir3_cat3_absneg(instr->opc) | - IR3_REG_CONST | IR3_REG_RELATIV; - - if (flags & ~valid_flags) - return false; - - if (flags & (IR3_REG_CONST | IR3_REG_RELATIV)) { - /* cannot deal w/ const/relativ in 2nd src: */ - if (n == 1) - return false; - } - - if (flags & IR3_REG_CONST) { - /* cannot be const + ABS|NEG: */ - if (flags & (IR3_REG_FABS | IR3_REG_FNEG | - IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT)) - return false; - } - break; - case 4: - /* seems like blob compiler avoids const as 
src.. */ - /* TODO double check if this is still the case on a4xx */ - if (flags & (IR3_REG_CONST | IR3_REG_IMMED)) - return false; - if (flags & (IR3_REG_SABS | IR3_REG_SNEG)) - return false; - break; - case 5: - /* no flags allowed */ - if (flags) - return false; - break; - case 6: - valid_flags = IR3_REG_IMMED; - if (flags & ~valid_flags) - return false; - - if (flags & IR3_REG_IMMED) { - /* doesn't seem like we can have immediate src for store - * instructions: - * - * TODO this restriction could also apply to load instructions, - * but for load instructions this arg is the address (and not - * really sure any good way to test a hard-coded immed addr src) - */ - if (is_store(instr) && (n == 1)) - return false; - - if ((instr->opc == OPC_LDL) && (n != 1)) - return false; - - if ((instr->opc == OPC_STL) && (n != 2)) - return false; - - /* disallow CP into anything but the SSBO slot argument for - * atomics: - */ - if (is_atomic(instr->opc) && (n != 0)) - return false; - - if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G)) - return false; - } - - break; - } - - return true; -} - -/* propagate register flags from src to dst.. negates need special - * handling to cancel each other out. 
- */ -static void combine_flags(unsigned *dstflags, struct ir3_instruction *src) -{ - unsigned srcflags = src->regs[1]->flags; - - /* if what we are combining into already has (abs) flags, - * we can drop (neg) from src: - */ - if (*dstflags & IR3_REG_FABS) - srcflags &= ~IR3_REG_FNEG; - if (*dstflags & IR3_REG_SABS) - srcflags &= ~IR3_REG_SNEG; - - if (srcflags & IR3_REG_FABS) - *dstflags |= IR3_REG_FABS; - if (srcflags & IR3_REG_SABS) - *dstflags |= IR3_REG_SABS; - if (srcflags & IR3_REG_FNEG) - *dstflags ^= IR3_REG_FNEG; - if (srcflags & IR3_REG_SNEG) - *dstflags ^= IR3_REG_SNEG; - if (srcflags & IR3_REG_BNOT) - *dstflags ^= IR3_REG_BNOT; - - *dstflags &= ~IR3_REG_SSA; - *dstflags |= srcflags & IR3_REG_SSA; - *dstflags |= srcflags & IR3_REG_CONST; - *dstflags |= srcflags & IR3_REG_IMMED; - *dstflags |= srcflags & IR3_REG_RELATIV; - *dstflags |= srcflags & IR3_REG_ARRAY; - - /* if src of the src is boolean we can drop the (abs) since we know - * the source value is already a postitive integer. This cleans - * up the absnegs that get inserted when converting between nir and - * native boolean (see ir3_b2n/n2b) - */ - struct ir3_instruction *srcsrc = ssa(src->regs[1]); - if (srcsrc && is_bool(srcsrc)) - *dstflags &= ~IR3_REG_SABS; -} - -static struct ir3_register * -lower_immed(struct ir3_cp_ctx *ctx, struct ir3_register *reg, unsigned new_flags) -{ - unsigned swiz, idx, i; - - reg = ir3_reg_clone(ctx->shader, reg); - - /* in some cases, there are restrictions on (abs)/(neg) plus const.. 
- * so just evaluate those and clear the flags: - */ - if (new_flags & IR3_REG_SABS) { - reg->iim_val = abs(reg->iim_val); - new_flags &= ~IR3_REG_SABS; - } - - if (new_flags & IR3_REG_FABS) { - reg->fim_val = fabs(reg->fim_val); - new_flags &= ~IR3_REG_FABS; - } - - if (new_flags & IR3_REG_SNEG) { - reg->iim_val = -reg->iim_val; - new_flags &= ~IR3_REG_SNEG; - } - - if (new_flags & IR3_REG_FNEG) { - reg->fim_val = -reg->fim_val; - new_flags &= ~IR3_REG_FNEG; - } - - /* Reallocate for 4 more elements whenever it's necessary */ - if (ctx->immediate_idx == ctx->so->immediates_size * 4) { - ctx->so->immediates_size += 4; - ctx->so->immediates = realloc (ctx->so->immediates, - ctx->so->immediates_size * sizeof (ctx->so->immediates[0])); - } - - for (i = 0; i < ctx->immediate_idx; i++) { - swiz = i % 4; - idx = i / 4; - - if (ctx->so->immediates[idx].val[swiz] == reg->uim_val) { - break; - } - } - - if (i == ctx->immediate_idx) { - /* need to generate a new immediate: */ - swiz = i % 4; - idx = i / 4; - ctx->so->immediates[idx].val[swiz] = reg->uim_val; - ctx->so->immediates_count = idx + 1; - ctx->immediate_idx++; - } - - new_flags &= ~IR3_REG_IMMED; - new_flags |= IR3_REG_CONST; - reg->flags = new_flags; - reg->num = i + (4 * ctx->so->constbase.immediate); - - return reg; -} - -static void -unuse(struct ir3_instruction *instr) -{ - debug_assert(instr->use_count > 0); - - if (--instr->use_count == 0) { - struct ir3_block *block = instr->block; - - instr->barrier_class = 0; - instr->barrier_conflict = 0; - - /* we don't want to remove anything in keeps (which could - * be things like array store's) - */ - for (unsigned i = 0; i < block->keeps_count; i++) { - debug_assert(block->keeps[i] != instr); - } - } -} - -/** - * Handle cp for a given src register. 
This additionally handles - * the cases of collapsing immedate/const (which replace the src - * register with a non-ssa src) or collapsing mov's from relative - * src (which needs to also fixup the address src reference by the - * instruction). - */ -static void -reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr, - struct ir3_register *reg, unsigned n) -{ - struct ir3_instruction *src = ssa(reg); - - if (is_eligible_mov(src, true)) { - /* simple case, no immed/const/relativ, only mov's w/ ssa src: */ - struct ir3_register *src_reg = src->regs[1]; - unsigned new_flags = reg->flags; - - combine_flags(&new_flags, src); - - if (valid_flags(instr, n, new_flags)) { - if (new_flags & IR3_REG_ARRAY) { - debug_assert(!(reg->flags & IR3_REG_ARRAY)); - reg->array = src_reg->array; - } - reg->flags = new_flags; - reg->instr = ssa(src_reg); - - instr->barrier_class |= src->barrier_class; - instr->barrier_conflict |= src->barrier_conflict; - - unuse(src); - reg->instr->use_count++; - } - - } else if (is_same_type_mov(src) && - /* cannot collapse const/immed/etc into meta instrs: */ - !is_meta(instr)) { - /* immed/const/etc cases, which require some special handling: */ - struct ir3_register *src_reg = src->regs[1]; - unsigned new_flags = reg->flags; - - combine_flags(&new_flags, src); - - if (!valid_flags(instr, n, new_flags)) { - /* See if lowering an immediate to const would help. */ - if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) { - debug_assert(new_flags & IR3_REG_IMMED); - instr->regs[n + 1] = lower_immed(ctx, src_reg, new_flags); - return; - } - - /* special case for "normal" mad instructions, we can - * try swapping the first two args if that fits better. - * - * the "plain" MAD's (ie. 
the ones that don't shift first - * src prior to multiply) can swap their first two srcs if - * src[0] is !CONST and src[1] is CONST: - */ - if ((n == 1) && is_mad(instr->opc) && - !(instr->regs[0 + 1]->flags & (IR3_REG_CONST | IR3_REG_RELATIV)) && - valid_flags(instr, 0, new_flags & ~IR3_REG_IMMED)) { - /* swap src[0] and src[1]: */ - struct ir3_register *tmp; - tmp = instr->regs[0 + 1]; - instr->regs[0 + 1] = instr->regs[1 + 1]; - instr->regs[1 + 1] = tmp; - - n = 0; - } else { - return; - } - } - - /* Here we handle the special case of mov from - * CONST and/or RELATIV. These need to be handled - * specially, because in the case of move from CONST - * there is no src ir3_instruction so we need to - * replace the ir3_register. And in the case of - * RELATIV we need to handle the address register - * dependency. - */ - if (src_reg->flags & IR3_REG_CONST) { - /* an instruction cannot reference two different - * address registers: - */ - if ((src_reg->flags & IR3_REG_RELATIV) && - conflicts(instr->address, reg->instr->address)) - return; - - /* This seems to be a hw bug, or something where the timings - * just somehow don't work out. This restriction may only - * apply if the first src is also CONST. - */ - if ((opc_cat(instr->opc) == 3) && (n == 2) && - (src_reg->flags & IR3_REG_RELATIV) && - (src_reg->array.offset == 0)) - return; - - src_reg = ir3_reg_clone(instr->block->shader, src_reg); - src_reg->flags = new_flags; - instr->regs[n+1] = src_reg; - - if (src_reg->flags & IR3_REG_RELATIV) - ir3_instr_set_address(instr, reg->instr->address); - - return; - } - - if ((src_reg->flags & IR3_REG_RELATIV) && - !conflicts(instr->address, reg->instr->address)) { - src_reg = ir3_reg_clone(instr->block->shader, src_reg); - src_reg->flags = new_flags; - instr->regs[n+1] = src_reg; - ir3_instr_set_address(instr, reg->instr->address); - - return; - } - - /* NOTE: seems we can only do immed integers, so don't - * need to care about float. 
But we do need to handle - * abs/neg *before* checking that the immediate requires - * few enough bits to encode: - * - * TODO: do we need to do something to avoid accidentally - * catching a float immed? - */ - if (src_reg->flags & IR3_REG_IMMED) { - int32_t iim_val = src_reg->iim_val; - - debug_assert((opc_cat(instr->opc) == 1) || - (opc_cat(instr->opc) == 6) || - ir3_cat2_int(instr->opc) || - (is_mad(instr->opc) && (n == 0))); - - if (new_flags & IR3_REG_SABS) - iim_val = abs(iim_val); - - if (new_flags & IR3_REG_SNEG) - iim_val = -iim_val; - - if (new_flags & IR3_REG_BNOT) - iim_val = ~iim_val; - - /* other than category 1 (mov) we can only encode up to 10 bits: */ - if ((instr->opc == OPC_MOV) || - !((iim_val & ~0x3ff) && (-iim_val & ~0x3ff))) { - new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT); - src_reg = ir3_reg_clone(instr->block->shader, src_reg); - src_reg->flags = new_flags; - src_reg->iim_val = iim_val; - instr->regs[n+1] = src_reg; - } else if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) { - /* See if lowering an immediate to const would help. */ - instr->regs[n+1] = lower_immed(ctx, src_reg, new_flags); - } - - return; - } - } -} - -/* Handle special case of eliminating output mov, and similar cases where - * there isn't a normal "consuming" instruction. In this case we cannot - * collapse flags (ie. 
output mov from const, or w/ abs/neg flags, cannot - * be eliminated) - */ -static struct ir3_instruction * -eliminate_output_mov(struct ir3_instruction *instr) -{ - if (is_eligible_mov(instr, false)) { - struct ir3_register *reg = instr->regs[1]; - if (!(reg->flags & IR3_REG_ARRAY)) { - struct ir3_instruction *src_instr = ssa(reg); - debug_assert(src_instr); - return src_instr; - } - } - return instr; -} - -/** - * Find instruction src's which are mov's that can be collapsed, replacing - * the mov dst with the mov src - */ -static void -instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr) -{ - struct ir3_register *reg; - - if (instr->regs_count == 0) - return; - - if (ir3_instr_check_mark(instr)) - return; - - /* walk down the graph from each src: */ - foreach_src_n(reg, n, instr) { - struct ir3_instruction *src = ssa(reg); - - if (!src) - continue; - - instr_cp(ctx, src); - - /* TODO non-indirect access we could figure out which register - * we actually want and allow cp.. - */ - if (reg->flags & IR3_REG_ARRAY) - continue; - - /* Don't CP absneg into meta instructions, that won't end well: */ - if (is_meta(instr) && (src->opc != OPC_MOV)) - continue; - - reg_cp(ctx, instr, reg, n); - } - - if (instr->regs[0]->flags & IR3_REG_ARRAY) { - struct ir3_instruction *src = ssa(instr->regs[0]); - if (src) - instr_cp(ctx, src); - } - - if (instr->address) { - instr_cp(ctx, instr->address); - ir3_instr_set_address(instr, eliminate_output_mov(instr->address)); - } - - /* we can end up with extra cmps.s from frontend, which uses a - * - * cmps.s p0.x, cond, 0 - * - * as a way to mov into the predicate register. But frequently 'cond' - * is itself a cmps.s/cmps.f/cmps.u. So detect this special case and - * just re-write the instruction writing predicate register to get rid - * of the double cmps. 
- */ - if ((instr->opc == OPC_CMPS_S) && - (instr->regs[0]->num == regid(REG_P0, 0)) && - ssa(instr->regs[1]) && - (instr->regs[2]->flags & IR3_REG_IMMED) && - (instr->regs[2]->iim_val == 0)) { - struct ir3_instruction *cond = ssa(instr->regs[1]); - switch (cond->opc) { - case OPC_CMPS_S: - case OPC_CMPS_F: - case OPC_CMPS_U: - instr->opc = cond->opc; - instr->flags = cond->flags; - instr->cat2 = cond->cat2; - instr->address = cond->address; - instr->regs[1] = cond->regs[1]; - instr->regs[2] = cond->regs[2]; - instr->barrier_class |= cond->barrier_class; - instr->barrier_conflict |= cond->barrier_conflict; - unuse(cond); - break; - default: - break; - } - } -} - -void -ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so) -{ - struct ir3_cp_ctx ctx = { - .shader = ir, - .so = so, - }; - - /* This is a bit annoying, and probably wouldn't be necessary if we - * tracked a reverse link from producing instruction to consumer. - * But we need to know when we've eliminated the last consumer of - * a mov, so we need to do a pass to first count consumers of a - * mov. 
- */ - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - struct ir3_instruction *src; - - /* by the way, we don't account for false-dep's, so the CP - * pass should always happen before false-dep's are inserted - */ - debug_assert(instr->deps_count == 0); - - foreach_ssa_src(src, instr) { - src->use_count++; - } - } - } - - ir3_clear_mark(ir); - - for (unsigned i = 0; i < ir->noutputs; i++) { - if (ir->outputs[i]) { - instr_cp(&ctx, ir->outputs[i]); - ir->outputs[i] = eliminate_output_mov(ir->outputs[i]); - } - } - - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - if (block->condition) { - instr_cp(&ctx, block->condition); - block->condition = eliminate_output_mov(block->condition); - } - - for (unsigned i = 0; i < block->keeps_count; i++) { - instr_cp(&ctx, block->keeps[i]); - block->keeps[i] = eliminate_output_mov(block->keeps[i]); - } - } -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c deleted file mode 100644 index 73bf5e19926..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright (C) 2014 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark <[email protected]> - */ - -#include "util/u_math.h" - -#include "ir3.h" - -/* - * Instruction Depth: - * - * Calculates weighted instruction depth, ie. the sum of # of needed - * instructions plus delay slots back to original input (ie INPUT or - * CONST). That is to say, an instructions depth is: - * - * depth(instr) { - * d = 0; - * // for each src register: - * foreach (src in instr->regs[1..n]) - * d = max(d, delayslots(src->instr, n) + depth(src->instr)); - * return d + 1; - * } - * - * After an instruction's depth is calculated, it is inserted into the - * blocks depth sorted list, which is used by the scheduling pass. - */ - -/* generally don't count false dependencies, since this can just be - * something like a barrier, or SSBO store. The exception is array - * dependencies if the assigner is an array write and the consumer - * reads the same array. 
- */ -static bool -ignore_dep(struct ir3_instruction *assigner, - struct ir3_instruction *consumer, unsigned n) -{ - if (!__is_false_dep(consumer, n)) - return false; - - if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) { - struct ir3_register *dst = assigner->regs[0]; - struct ir3_register *src; - - debug_assert(dst->flags & IR3_REG_ARRAY); - - foreach_src(src, consumer) { - if ((src->flags & IR3_REG_ARRAY) && - (dst->array.id == src->array.id)) { - return false; - } - } - } - - return true; -} - -/* calculate required # of delay slots between the instruction that - * assigns a value and the one that consumes - */ -int ir3_delayslots(struct ir3_instruction *assigner, - struct ir3_instruction *consumer, unsigned n) -{ - if (ignore_dep(assigner, consumer, n)) - return 0; - - /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal - * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch - * handled with sync bits - */ - - if (is_meta(assigner)) - return 0; - - if (writes_addr(assigner)) - return 6; - - /* handled via sync flags: */ - if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner)) - return 0; - - /* assigner must be alu: */ - if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) || - is_mem(consumer)) { - return 6; - } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && - (n == 3)) { - /* special case, 3rd src to cat3 not required on first cycle */ - return 1; - } else { - return 3; - } -} - -void -ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list) -{ - /* remove from existing spot in list: */ - list_delinit(&instr->node); - - /* find where to re-insert instruction: */ - list_for_each_entry (struct ir3_instruction, pos, list, node) { - if (pos->depth > instr->depth) { - list_add(&instr->node, &pos->node); - return; - } - } - /* if we get here, we didn't find an insertion spot: */ - list_addtail(&instr->node, list); -} - -static void -ir3_instr_depth(struct ir3_instruction *instr, unsigned boost, 
bool falsedep) -{ - struct ir3_instruction *src; - - /* don't mark falsedep's as used, but otherwise process them normally: */ - if (!falsedep) - instr->flags &= ~IR3_INSTR_UNUSED; - - if (ir3_instr_check_mark(instr)) - return; - - instr->depth = 0; - - foreach_ssa_src_n(src, i, instr) { - unsigned sd; - - /* visit child to compute it's depth: */ - ir3_instr_depth(src, boost, __is_false_dep(instr, i)); - - /* for array writes, no need to delay on previous write: */ - if (i == 0) - continue; - - sd = ir3_delayslots(src, instr, i) + src->depth; - sd += boost; - - instr->depth = MAX2(instr->depth, sd); - } - - if (!is_meta(instr)) - instr->depth++; - - ir3_insert_by_depth(instr, &instr->block->instr_list); -} - -static bool -remove_unused_by_block(struct ir3_block *block) -{ - bool progress = false; - list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) { - if (instr->opc == OPC_END) - continue; - if (instr->flags & IR3_INSTR_UNUSED) { - list_delinit(&instr->node); - progress = true; - } - } - return progress; -} - -static bool -compute_depth_and_remove_unused(struct ir3 *ir) -{ - unsigned i; - bool progress = false; - - ir3_clear_mark(ir); - - /* initially mark everything as unused, we'll clear the flag as we - * visit the instructions: - */ - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - instr->flags |= IR3_INSTR_UNUSED; - } - } - - for (i = 0; i < ir->noutputs; i++) - if (ir->outputs[i]) - ir3_instr_depth(ir->outputs[i], 0, false); - - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - for (i = 0; i < block->keeps_count; i++) - ir3_instr_depth(block->keeps[i], 0, false); - - /* We also need to account for if-condition: */ - if (block->condition) - ir3_instr_depth(block->condition, 6, false); - } - - /* mark un-used instructions: */ - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - 
progress |= remove_unused_by_block(block); - } - - /* note that we can end up with unused indirects, but we should - * not end up with unused predicates. - */ - for (i = 0; i < ir->indirects_count; i++) { - struct ir3_instruction *instr = ir->indirects[i]; - if (instr && (instr->flags & IR3_INSTR_UNUSED)) - ir->indirects[i] = NULL; - } - - /* cleanup unused inputs: */ - for (i = 0; i < ir->ninputs; i++) { - struct ir3_instruction *in = ir->inputs[i]; - if (in && (in->flags & IR3_INSTR_UNUSED)) - ir->inputs[i] = NULL; - } - - return progress; -} - -void -ir3_depth(struct ir3 *ir) -{ - bool progress; - do { - progress = compute_depth_and_remove_unused(ir); - } while (progress); -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c index 3a1b857e010..cc6efa1ca17 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c @@ -37,10 +37,10 @@ #include "freedreno_context.h" #include "freedreno_util.h" -#include "ir3_shader.h" -#include "ir3_gallium.h" -#include "ir3_compiler.h" -#include "ir3_nir.h" +#include "ir3/ir3_shader.h" +#include "ir3/ir3_gallium.h" +#include "ir3/ir3_compiler.h" +#include "ir3/ir3_nir.h" static void dump_shader_info(struct ir3_shader_variant *v, struct pipe_debug_callback *debug) diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.h b/src/gallium/drivers/freedreno/ir3/ir3_gallium.h index cf1d48d97ba..5fb74596781 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.h @@ -28,7 +28,7 @@ #define IR3_GALLIUM_H_ #include "pipe/p_state.h" -#include "ir3_shader.h" +#include "ir3/ir3_shader.h" struct ir3_shader * ir3_shader_create(struct ir3_compiler *compiler, const struct pipe_shader_state *cso, gl_shader_stage type, diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c deleted file mode 100644 index 
570055973e8..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_group.c +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright (C) 2014 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark <[email protected]> - */ - -#include "ir3.h" - -/* - * Find/group instruction neighbors: - */ - -/* bleh.. we need to do the same group_n() thing for both inputs/outputs - * (where we have a simple instr[] array), and fanin nodes (where we have - * an extra indirection via reg->instr). 
- */ -struct group_ops { - struct ir3_instruction *(*get)(void *arr, int idx); - void (*insert_mov)(void *arr, int idx, struct ir3_instruction *instr); -}; - -static struct ir3_instruction *arr_get(void *arr, int idx) -{ - return ((struct ir3_instruction **)arr)[idx]; -} -static void arr_insert_mov_out(void *arr, int idx, struct ir3_instruction *instr) -{ - ((struct ir3_instruction **)arr)[idx] = - ir3_MOV(instr->block, instr, TYPE_F32); -} -static void arr_insert_mov_in(void *arr, int idx, struct ir3_instruction *instr) -{ - /* so, we can't insert a mov in front of a meta:in.. and the downstream - * instruction already has a pointer to 'instr'. So we cheat a bit and - * morph the meta:in instruction into a mov and insert a new meta:in - * in front. - */ - struct ir3_instruction *in; - - debug_assert(instr->regs_count == 1); - - in = ir3_instr_create(instr->block, OPC_META_INPUT); - in->inout.block = instr->block; - ir3_reg_create(in, instr->regs[0]->num, 0); - - /* create src reg for meta:in and fixup to now be a mov: */ - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = in; - instr->opc = OPC_MOV; - instr->cat1.src_type = TYPE_F32; - instr->cat1.dst_type = TYPE_F32; - - ((struct ir3_instruction **)arr)[idx] = in; -} -static struct group_ops arr_ops_out = { arr_get, arr_insert_mov_out }; -static struct group_ops arr_ops_in = { arr_get, arr_insert_mov_in }; - -static struct ir3_instruction *instr_get(void *arr, int idx) -{ - return ssa(((struct ir3_instruction *)arr)->regs[idx+1]); -} -static void -instr_insert_mov(void *arr, int idx, struct ir3_instruction *instr) -{ - ((struct ir3_instruction *)arr)->regs[idx+1]->instr = - ir3_MOV(instr->block, instr, TYPE_F32); -} -static struct group_ops instr_ops = { instr_get, instr_insert_mov }; - -/* verify that cur != instr, but cur is also not in instr's neighbor-list: */ -static bool -in_neighbor_list(struct ir3_instruction *instr, struct ir3_instruction *cur, int pos) -{ - int idx = 0; - - if (!instr) - return false; - - 
if (instr == cur) - return true; - - for (instr = ir3_neighbor_first(instr); instr; instr = instr->cp.right) - if ((idx++ != pos) && (instr == cur)) - return true; - - return false; -} - -static void -group_n(struct group_ops *ops, void *arr, unsigned n) -{ - unsigned i, j; - - /* first pass, figure out what has conflicts and needs a mov - * inserted. Do this up front, before starting to setup - * left/right neighbor pointers. Trying to do it in a single - * pass could result in a situation where we can't even setup - * the mov's right neighbor ptr if the next instr also needs - * a mov. - */ -restart: - for (i = 0; i < n; i++) { - struct ir3_instruction *instr = ops->get(arr, i); - if (instr) { - struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL; - struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL; - bool conflict; - - /* check for left/right neighbor conflicts: */ - conflict = conflicts(instr->cp.left, left) || - conflicts(instr->cp.right, right); - - /* Mixing array elements and higher register classes - * (ie. groups) doesn't really work out in RA. See: - * - * https://trello.com/c/DqeDkeVf/156-bug-with-stk-70frag - */ - if (instr->regs[0]->flags & IR3_REG_ARRAY) - conflict = true; - - /* we also can't have an instr twice in the group: */ - for (j = i + 1; (j < n) && !conflict; j++) - if (in_neighbor_list(ops->get(arr, j), instr, i)) - conflict = true; - - if (conflict) { - ops->insert_mov(arr, i, instr); - /* inserting the mov may have caused a conflict - * against the previous: - */ - goto restart; - } - } - } - - /* second pass, now that we've inserted mov's, fixup left/right - * neighbors. This is guaranteed to succeed, since by definition - * the newly inserted mov's cannot conflict with anything. - */ - for (i = 0; i < n; i++) { - struct ir3_instruction *instr = ops->get(arr, i); - if (instr) { - struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL; - struct ir3_instruction *right = (i < (n-1)) ? 
ops->get(arr, i + 1) : NULL; - - debug_assert(!conflicts(instr->cp.left, left)); - if (left) { - instr->cp.left_cnt++; - instr->cp.left = left; - } - - debug_assert(!conflicts(instr->cp.right, right)); - if (right) { - instr->cp.right_cnt++; - instr->cp.right = right; - } - } - } -} - -static void -instr_find_neighbors(struct ir3_instruction *instr) -{ - struct ir3_instruction *src; - - if (ir3_instr_check_mark(instr)) - return; - - if (instr->opc == OPC_META_FI) - group_n(&instr_ops, instr, instr->regs_count - 1); - - foreach_ssa_src(src, instr) - instr_find_neighbors(src); -} - -/* a bit of sadness.. we can't have "holes" in inputs from PoV of - * register assignment, they still need to be grouped together. So - * we need to insert dummy/padding instruction for grouping, and - * then take it back out again before anyone notices. - */ -static void -pad_and_group_input(struct ir3_instruction **input, unsigned n) -{ - int i, mask = 0; - struct ir3_block *block = NULL; - - for (i = n - 1; i >= 0; i--) { - struct ir3_instruction *instr = input[i]; - if (instr) { - block = instr->block; - } else if (block) { - instr = ir3_NOP(block); - ir3_reg_create(instr, 0, IR3_REG_SSA); /* dummy dst */ - input[i] = instr; - mask |= (1 << i); - } - } - - group_n(&arr_ops_in, input, n); - - for (i = 0; i < n; i++) { - if (mask & (1 << i)) - input[i] = NULL; - } -} - -static void -find_neighbors(struct ir3 *ir) -{ - unsigned i; - - /* shader inputs/outputs themselves must be contiguous as well: - * - * NOTE: group inputs first, since we only insert mov's - * *before* the conflicted instr (and that would go badly - * for inputs). By doing inputs first, we should never - * have a conflict on inputs.. pushing any conflict to - * resolve to the outputs, for stuff like: - * - * MOV OUT[n], IN[m].wzyx - * - * NOTE: we assume here inputs/outputs are grouped in vec4. 
- * This logic won't quite cut it if we don't align smaller - * on vec4 boundaries - */ - for (i = 0; i < ir->ninputs; i += 4) - pad_and_group_input(&ir->inputs[i], 4); - for (i = 0; i < ir->noutputs; i += 4) - group_n(&arr_ops_out, &ir->outputs[i], 4); - - for (i = 0; i < ir->noutputs; i++) { - if (ir->outputs[i]) { - struct ir3_instruction *instr = ir->outputs[i]; - instr_find_neighbors(instr); - } - } - - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - for (i = 0; i < block->keeps_count; i++) { - struct ir3_instruction *instr = block->keeps[i]; - instr_find_neighbors(instr); - } - - /* We also need to account for if-condition: */ - if (block->condition) - instr_find_neighbors(block->condition); - } -} - -void -ir3_group(struct ir3 *ir) -{ - ir3_clear_mark(ir); - find_neighbors(ir); -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c deleted file mode 100644 index ff4c644eab5..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c +++ /dev/null @@ -1,496 +0,0 @@ -/* - * Copyright (C) 2014 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark <[email protected]> - */ - -#include "util/ralloc.h" -#include "util/u_math.h" - -#include "ir3.h" - -/* - * Legalize: - * - * We currently require that scheduling ensures that we have enough nop's - * in all the right places. The legalize step mostly handles fixing up - * instruction flags ((ss)/(sy)/(ei)), and collapses sequences of nop's - * into fewer nop's w/ rpt flag. - */ - -struct ir3_legalize_ctx { - int num_samp; - bool has_ssbo; - int max_bary; -}; - -struct ir3_legalize_state { - regmask_t needs_ss; - regmask_t needs_ss_war; /* write after read */ - regmask_t needs_sy; -}; - -struct ir3_legalize_block_data { - bool valid; - struct ir3_legalize_state state; -}; - -/* We want to evaluate each block from the position of any other - * predecessor block, in order that the flags set are the union of - * all possible program paths. - * - * To do this, we need to know the output state (needs_ss/ss_war/sy) - * of all predecessor blocks. The tricky thing is loops, which mean - * that we can't simply recursively process each predecessor block - * before legalizing the current block. - * - * How we handle that is by looping over all the blocks until the - * results converge. If the output state of a given block changes - * in a given pass, this means that all successor blocks are not - * yet fully legalized. 
- */ - -static bool -legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) -{ - struct ir3_legalize_block_data *bd = block->data; - - if (bd->valid) - return false; - - struct ir3_instruction *last_input = NULL; - struct ir3_instruction *last_rel = NULL; - struct ir3_instruction *last_n = NULL; - struct list_head instr_list; - struct ir3_legalize_state prev_state = bd->state; - struct ir3_legalize_state *state = &bd->state; - - /* our input state is the OR of all predecessor blocks' state: */ - for (unsigned i = 0; i < block->predecessors_count; i++) { - struct ir3_legalize_block_data *pbd = block->predecessors[i]->data; - struct ir3_legalize_state *pstate = &pbd->state; - - /* Our input (ss)/(sy) state is based on OR'ing the output - * state of all our predecessor blocks - */ - regmask_or(&state->needs_ss, - &state->needs_ss, &pstate->needs_ss); - regmask_or(&state->needs_ss_war, - &state->needs_ss_war, &pstate->needs_ss_war); - regmask_or(&state->needs_sy, - &state->needs_sy, &pstate->needs_sy); - } - - /* remove all the instructions from the list, we'll be adding - * them back in as we go - */ - list_replace(&block->instr_list, &instr_list); - list_inithead(&block->instr_list); - - list_for_each_entry_safe (struct ir3_instruction, n, &instr_list, node) { - struct ir3_register *reg; - unsigned i; - - n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY); - - if (is_meta(n)) - continue; - - if (is_input(n)) { - struct ir3_register *inloc = n->regs[1]; - assert(inloc->flags & IR3_REG_IMMED); - ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val); - } - - if (last_n && is_barrier(last_n)) - n->flags |= IR3_INSTR_SS | IR3_INSTR_SY; - - /* NOTE: consider dst register too.. it could happen that - * texture sample instruction (for example) writes some - * components which are unused. 
A subsequent instruction - * that writes the same register can race w/ the sam instr - * resulting in undefined results: - */ - for (i = 0; i < n->regs_count; i++) { - reg = n->regs[i]; - - if (reg_gpr(reg)) { - - /* TODO: we probably only need (ss) for alu - * instr consuming sfu result.. need to make - * some tests for both this and (sy).. - */ - if (regmask_get(&state->needs_ss, reg)) { - n->flags |= IR3_INSTR_SS; - regmask_init(&state->needs_ss_war); - regmask_init(&state->needs_ss); - } - - if (regmask_get(&state->needs_sy, reg)) { - n->flags |= IR3_INSTR_SY; - regmask_init(&state->needs_sy); - } - } - - /* TODO: is it valid to have address reg loaded from a - * relative src (ie. mova a0, c<a0.x+4>)? If so, the - * last_rel check below should be moved ahead of this: - */ - if (reg->flags & IR3_REG_RELATIV) - last_rel = n; - } - - if (n->regs_count > 0) { - reg = n->regs[0]; - if (regmask_get(&state->needs_ss_war, reg)) { - n->flags |= IR3_INSTR_SS; - regmask_init(&state->needs_ss_war); - regmask_init(&state->needs_ss); - } - - if (last_rel && (reg->num == regid(REG_A0, 0))) { - last_rel->flags |= IR3_INSTR_UL; - last_rel = NULL; - } - } - - /* cat5+ does not have an (ss) bit, if needed we need to - * insert a nop to carry the sync flag. 
Would be kinda - * clever if we were aware of this during scheduling, but - * this should be a pretty rare case: - */ - if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) { - struct ir3_instruction *nop; - nop = ir3_NOP(block); - nop->flags |= IR3_INSTR_SS; - n->flags &= ~IR3_INSTR_SS; - } - - /* need to be able to set (ss) on first instruction: */ - if (list_empty(&block->instr_list) && (opc_cat(n->opc) >= 5)) - ir3_NOP(block); - - if (is_nop(n) && !list_empty(&block->instr_list)) { - struct ir3_instruction *last = list_last_entry(&block->instr_list, - struct ir3_instruction, node); - if (is_nop(last) && (last->repeat < 5)) { - last->repeat++; - last->flags |= n->flags; - continue; - } - } - - list_addtail(&n->node, &block->instr_list); - - if (is_sfu(n)) - regmask_set(&state->needs_ss, n->regs[0]); - - if (is_tex(n)) { - /* this ends up being the # of samp instructions.. but that - * is ok, everything else only cares whether it is zero or - * not. We do this here, rather than when we encounter a - * SAMP decl, because (especially in binning pass shader) - * the samp instruction(s) could get eliminated if the - * result is not used. - */ - ctx->num_samp = MAX2(ctx->num_samp, n->cat5.samp + 1); - regmask_set(&state->needs_sy, n->regs[0]); - } else if (n->opc == OPC_RESINFO) { - regmask_set(&state->needs_ss, n->regs[0]); - ir3_NOP(block)->flags |= IR3_INSTR_SS; - } else if (is_load(n)) { - /* seems like ldlv needs (ss) bit instead?? which is odd but - * makes a bunch of flat-varying tests start working on a4xx. 
- */ - if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL)) - regmask_set(&state->needs_ss, n->regs[0]); - else - regmask_set(&state->needs_sy, n->regs[0]); - } else if (is_atomic(n->opc)) { - if (n->flags & IR3_INSTR_G) - regmask_set(&state->needs_sy, n->regs[0]); - else - regmask_set(&state->needs_ss, n->regs[0]); - } - - if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G))) - ctx->has_ssbo = true; - - /* both tex/sfu appear to not always immediately consume - * their src register(s): - */ - if (is_tex(n) || is_sfu(n) || is_mem(n)) { - foreach_src(reg, n) { - if (reg_gpr(reg)) - regmask_set(&state->needs_ss_war, reg); - } - } - - if (is_input(n)) - last_input = n; - - last_n = n; - } - - if (last_input) { - /* special hack.. if using ldlv to bypass interpolation, - * we need to insert a dummy bary.f on which we can set - * the (ei) flag: - */ - if (is_mem(last_input) && (last_input->opc == OPC_LDLV)) { - struct ir3_instruction *baryf; - - /* (ss)bary.f (ei)r63.x, 0, r0.x */ - baryf = ir3_instr_create(block, OPC_BARY_F); - baryf->flags |= IR3_INSTR_SS; - ir3_reg_create(baryf, regid(63, 0), 0); - ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0; - ir3_reg_create(baryf, regid(0, 0), 0); - - /* insert the dummy bary.f after last_input: */ - list_delinit(&baryf->node); - list_add(&baryf->node, &last_input->node); - - last_input = baryf; - } - last_input->regs[0]->flags |= IR3_REG_EI; - } - - if (last_rel) - last_rel->flags |= IR3_INSTR_UL; - - bd->valid = true; - - if (memcmp(&prev_state, state, sizeof(*state))) { - /* our output state changed, this invalidates all of our - * successors: - */ - for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) { - if (!block->successors[i]) - break; - struct ir3_legalize_block_data *pbd = block->successors[i]->data; - pbd->valid = false; - } - } - - return true; -} - -/* NOTE: branch instructions are always the last instruction(s) - * in the block. 
We take advantage of this as we resolve the - * branches, since "if (foo) break;" constructs turn into - * something like: - * - * block3 { - * ... - * 0029:021: mov.s32s32 r62.x, r1.y - * 0082:022: br !p0.x, target=block5 - * 0083:023: br p0.x, target=block4 - * // succs: if _[0029:021: mov.s32s32] block4; else block5; - * } - * block4 { - * 0084:024: jump, target=block6 - * // succs: block6; - * } - * block5 { - * 0085:025: jump, target=block7 - * // succs: block7; - * } - * - * ie. only instruction in block4/block5 is a jump, so when - * resolving branches we can easily detect this by checking - * that the first instruction in the target block is itself - * a jump, and setup the br directly to the jump's target - * (and strip back out the now unreached jump) - * - * TODO sometimes we end up with things like: - * - * br !p0.x, #2 - * br p0.x, #12 - * add.u r0.y, r0.y, 1 - * - * If we swapped the order of the branches, we could drop one. - */ -static struct ir3_block * -resolve_dest_block(struct ir3_block *block) -{ - /* special case for last block: */ - if (!block->successors[0]) - return block; - - /* NOTE that we may or may not have inserted the jump - * in the target block yet, so conditions to resolve - * the dest to the dest block's successor are: - * - * (1) successor[1] == NULL && - * (2) (block-is-empty || only-instr-is-jump) - */ - if (block->successors[1] == NULL) { - if (list_empty(&block->instr_list)) { - return block->successors[0]; - } else if (list_length(&block->instr_list) == 1) { - struct ir3_instruction *instr = list_first_entry( - &block->instr_list, struct ir3_instruction, node); - if (instr->opc == OPC_JUMP) - return block->successors[0]; - } - } - return block; -} - -static bool -resolve_jump(struct ir3_instruction *instr) -{ - struct ir3_block *tblock = - resolve_dest_block(instr->cat0.target); - struct ir3_instruction *target; - - if (tblock != instr->cat0.target) { - list_delinit(&instr->cat0.target->node); - instr->cat0.target = tblock; 
- return true; - } - - target = list_first_entry(&tblock->instr_list, - struct ir3_instruction, node); - - /* TODO maybe a less fragile way to do this. But we are expecting - * a pattern from sched_block() that looks like: - * - * br !p0.x, #else-block - * br p0.x, #if-block - * - * if the first branch target is +2, or if 2nd branch target is +1 - * then we can just drop the jump. - */ - unsigned next_block; - if (instr->cat0.inv == true) - next_block = 2; - else - next_block = 1; - - if ((!target) || (target->ip == (instr->ip + next_block))) { - list_delinit(&instr->node); - return true; - } else { - instr->cat0.immed = - (int)target->ip - (int)instr->ip; - } - return false; -} - -/* resolve jumps, removing jumps/branches to immediately following - * instruction which we end up with from earlier stages. Since - * removing an instruction can invalidate earlier instruction's - * branch offsets, we need to do this iteratively until no more - * branches are removed. - */ -static bool -resolve_jumps(struct ir3 *ir) -{ - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) - if (is_flow(instr) && instr->cat0.target) - if (resolve_jump(instr)) - return true; - - return false; -} - -/* we want to mark points where divergent flow control re-converges - * with (jp) flags. For now, since we don't do any optimization for - * things that start out as a 'do {} while()', re-convergence points - * will always be a branch or jump target. Note that this is overly - * conservative, since unconditional jump targets are not convergence - * points, we are just assuming that the other path to reach the jump - * target was divergent. If we were clever enough to optimize the - * jump at end of a loop back to a conditional branch into a single - * conditional branch, ie. 
like: - * - * add.f r1.w, r0.x, (neg)(r)c2.x <= loop start - * mul.f r1.z, r1.z, r0.x - * mul.f r1.y, r1.y, r0.x - * mul.f r0.z, r1.x, r0.x - * mul.f r0.w, r0.y, r0.x - * cmps.f.ge r0.x, (r)c2.y, (r)r1.w - * add.s r0.x, (r)r0.x, (r)-1 - * sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x - * cmps.f.eq p0.x, r0.x, c3.y - * mov.f32f32 r0.x, r1.w - * mov.f32f32 r0.y, r0.w - * mov.f32f32 r1.x, r0.z - * (rpt2)nop - * br !p0.x, #-13 - * (jp)mul.f r0.x, c263.y, r1.y - * - * Then we'd have to be more clever, as the convergence point is no - * longer a branch or jump target. - */ -static void -mark_convergence_points(struct ir3 *ir) -{ - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - if (is_flow(instr) && instr->cat0.target) { - struct ir3_instruction *target = - list_first_entry(&instr->cat0.target->instr_list, - struct ir3_instruction, node); - target->flags |= IR3_INSTR_JP; - } - } - } -} - -void -ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary) -{ - struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx); - bool progress; - - ctx->max_bary = -1; - - /* allocate per-block data: */ - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - block->data = rzalloc(ctx, struct ir3_legalize_block_data); - } - - /* process each block: */ - do { - progress = false; - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - progress |= legalize_block(ctx, block); - } - } while (progress); - - *num_samp = ctx->num_samp; - *has_ssbo = ctx->has_ssbo; - *max_bary = ctx->max_bary; - - do { - ir3_count_instructions(ir); - } while(resolve_jumps(ir)); - - mark_convergence_points(ir); - - ralloc_free(ctx); -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c deleted file mode 100644 index 70c01ee0593..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_nir.c +++ 
/dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (C) 2015 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Authors: - * Rob Clark <[email protected]> - */ - - -#include "util/debug.h" - -#include "ir3_nir.h" -#include "ir3_compiler.h" -#include "ir3_shader.h" - -static const nir_shader_compiler_options options = { - .lower_fpow = true, - .lower_scmp = true, - .lower_flrp32 = true, - .lower_flrp64 = true, - .lower_ffract = true, - .lower_fmod32 = true, - .lower_fmod64 = true, - .lower_fdiv = true, - .lower_ldexp = true, - .fuse_ffma = true, - .native_integers = true, - .vertex_id_zero_based = true, - .lower_extract_byte = true, - .lower_extract_word = true, - .lower_all_io_to_temps = true, - .lower_helper_invocation = true, -}; - -const nir_shader_compiler_options * -ir3_get_compiler_options(struct ir3_compiler *compiler) -{ - return &options; -} - -/* for given shader key, are any steps handled in nir? */ -bool -ir3_key_lowers_nir(const struct ir3_shader_key *key) -{ - return key->fsaturate_s | key->fsaturate_t | key->fsaturate_r | - key->vsaturate_s | key->vsaturate_t | key->vsaturate_r | - key->ucp_enables | key->color_two_side | - key->fclamp_color | key->vclamp_color; -} - -#define OPT(nir, pass, ...) ({ \ - bool this_progress = false; \ - NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ - this_progress; \ -}) - -#define OPT_V(nir, pass, ...) 
NIR_PASS_V(nir, pass, ##__VA_ARGS__) - -static void -ir3_optimize_loop(nir_shader *s) -{ - bool progress; - do { - progress = false; - - OPT_V(s, nir_lower_vars_to_ssa); - progress |= OPT(s, nir_opt_copy_prop_vars); - progress |= OPT(s, nir_opt_dead_write_vars); - progress |= OPT(s, nir_lower_alu_to_scalar); - progress |= OPT(s, nir_lower_phis_to_scalar); - - progress |= OPT(s, nir_copy_prop); - progress |= OPT(s, nir_opt_dce); - progress |= OPT(s, nir_opt_cse); - static int gcm = -1; - if (gcm == -1) - gcm = env_var_as_unsigned("GCM", 0); - if (gcm == 1) - progress |= OPT(s, nir_opt_gcm, true); - else if (gcm == 2) - progress |= OPT(s, nir_opt_gcm, false); - progress |= OPT(s, nir_opt_peephole_select, 16); - progress |= OPT(s, nir_opt_intrinsics); - progress |= OPT(s, nir_opt_algebraic); - progress |= OPT(s, nir_opt_constant_folding); - progress |= OPT(s, nir_opt_dead_cf); - if (OPT(s, nir_opt_trivial_continues)) { - progress |= true; - /* If nir_opt_trivial_continues makes progress, then we need to clean - * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll - * to make progress. 
- */ - OPT(s, nir_copy_prop); - OPT(s, nir_opt_dce); - } - progress |= OPT(s, nir_opt_if); - progress |= OPT(s, nir_opt_remove_phis); - progress |= OPT(s, nir_opt_undef); - - } while (progress); -} - -struct nir_shader * -ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s, - const struct ir3_shader_key *key) -{ - struct nir_lower_tex_options tex_options = { - .lower_rect = 0, - }; - - if (key) { - switch (shader->type) { - case MESA_SHADER_FRAGMENT: - tex_options.saturate_s = key->fsaturate_s; - tex_options.saturate_t = key->fsaturate_t; - tex_options.saturate_r = key->fsaturate_r; - break; - case MESA_SHADER_VERTEX: - tex_options.saturate_s = key->vsaturate_s; - tex_options.saturate_t = key->vsaturate_t; - tex_options.saturate_r = key->vsaturate_r; - break; - default: - /* TODO */ - break; - } - } - - if (shader->compiler->gpu_id >= 400) { - /* a4xx seems to have *no* sam.p */ - tex_options.lower_txp = ~0; /* lower all txp */ - } else { - /* a3xx just needs to avoid sam.p for 3d tex */ - tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D); - } - - if (ir3_shader_debug & IR3_DBG_DISASM) { - debug_printf("----------------------\n"); - nir_print_shader(s, stdout); - debug_printf("----------------------\n"); - } - - OPT_V(s, nir_opt_global_to_local); - OPT_V(s, nir_lower_regs_to_ssa); - - if (key) { - if (s->info.stage == MESA_SHADER_VERTEX) { - OPT_V(s, nir_lower_clip_vs, key->ucp_enables, false); - if (key->vclamp_color) - OPT_V(s, nir_lower_clamp_color_outputs); - } else if (s->info.stage == MESA_SHADER_FRAGMENT) { - OPT_V(s, nir_lower_clip_fs, key->ucp_enables); - if (key->fclamp_color) - OPT_V(s, nir_lower_clamp_color_outputs); - } - if (key->color_two_side) { - OPT_V(s, nir_lower_two_sided_color); - } - } else { - /* only want to do this the first time (when key is null) - * and not again on any potential 2nd variant lowering pass: - */ - OPT_V(s, ir3_nir_apply_trig_workarounds); - } - - OPT_V(s, nir_lower_tex, &tex_options); - OPT_V(s, 
nir_lower_load_const_to_scalar); - if (shader->compiler->gpu_id < 500) - OPT_V(s, ir3_nir_lower_tg4_to_tex); - - ir3_optimize_loop(s); - - /* do idiv lowering after first opt loop to give a chance for - * divide by immed power-of-two to be caught first: - */ - if (OPT(s, nir_lower_idiv)) - ir3_optimize_loop(s); - - OPT_V(s, nir_remove_dead_variables, nir_var_local); - - OPT_V(s, nir_move_load_const); - - if (ir3_shader_debug & IR3_DBG_DISASM) { - debug_printf("----------------------\n"); - nir_print_shader(s, stdout); - debug_printf("----------------------\n"); - } - - nir_sweep(s); - - return s; -} - -void -ir3_nir_scan_driver_consts(nir_shader *shader, - struct ir3_driver_const_layout *layout) -{ - nir_foreach_function(function, shader) { - if (!function->impl) - continue; - - nir_foreach_block(block, function->impl) { - nir_foreach_instr(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = - nir_instr_as_intrinsic(instr); - unsigned idx; - - switch (intr->intrinsic) { - case nir_intrinsic_get_buffer_size: - idx = nir_src_as_const_value(intr->src[0])->u32[0]; - if (layout->ssbo_size.mask & (1 << idx)) - break; - layout->ssbo_size.mask |= (1 << idx); - layout->ssbo_size.off[idx] = - layout->ssbo_size.count; - layout->ssbo_size.count += 1; /* one const per */ - break; - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: - case nir_intrinsic_image_deref_store: - case nir_intrinsic_image_deref_size: - idx = nir_intrinsic_get_var(intr, 0)->data.driver_location; - if (layout->image_dims.mask & (1 << idx)) - break; - layout->image_dims.mask |= (1 << idx); - layout->image_dims.off[idx] = - 
layout->image_dims.count; - layout->image_dims.count += 3; /* three const per */ - break; - default: - break; - } - } - } - } -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.h b/src/gallium/drivers/freedreno/ir3/ir3_nir.h deleted file mode 100644 index 74201d34160..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_nir.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (C) 2015 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Authors: - * Rob Clark <[email protected]> - */ - -#ifndef IR3_NIR_H_ -#define IR3_NIR_H_ - -#include "compiler/nir/nir.h" -#include "compiler/shader_enums.h" - -#include "ir3_shader.h" - -void ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_driver_const_layout *layout); - -bool ir3_nir_apply_trig_workarounds(nir_shader *shader); -bool ir3_nir_lower_tg4_to_tex(nir_shader *shader); - -const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler); -bool ir3_key_lowers_nir(const struct ir3_shader_key *key); -struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s, - const struct ir3_shader_key *key); - -#endif /* IR3_NIR_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c deleted file mode 100644 index 37a3dcb26f8..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright © 2017 Ilia Mirkin - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "ir3_nir.h" -#include "compiler/nir/nir_builder.h" - -/* A4XX has a broken GATHER4 operation. It performs the texture swizzle on the - * gather results, rather than before. As a result, it must be emulated with - * direct texture calls. - */ - -static bool -lower_tg4(nir_block *block, nir_builder *b, void *mem_ctx) -{ - bool progress = false; - - static const int offsets[3][2] = { {0, 1}, {1, 1}, {1, 0} }; - - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_tex) - continue; - - nir_tex_instr *tg4 = (nir_tex_instr *)instr; - - if (tg4->op != nir_texop_tg4) - continue; - - b->cursor = nir_before_instr(&tg4->instr); - - nir_ssa_def *results[4]; - int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset); - for (int i = 0; i < 4; i++) { - int num_srcs = tg4->num_srcs + 1 /* lod */; - if (offset_index < 0 && i < 3) - num_srcs++; - - nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs); - tex->op = nir_texop_txl; - tex->sampler_dim = tg4->sampler_dim; - tex->coord_components = tg4->coord_components; - tex->is_array = tg4->is_array; - tex->is_shadow = tg4->is_shadow; - tex->is_new_style_shadow = tg4->is_new_style_shadow; - tex->texture_index = tg4->texture_index; - tex->sampler_index = tg4->sampler_index; - tex->dest_type = tg4->dest_type; - - for (int j = 0; j < tg4->num_srcs; j++) { - nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex); - tex->src[j].src_type = tg4->src[j].src_type; - } - if (i != 3) { - nir_ssa_def *offset = - nir_vec2(b, nir_imm_int(b, offsets[i][0]), - nir_imm_int(b, offsets[i][1])); - if (offset_index < 0) { - tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset); - tex->src[tg4->num_srcs].src_type = 
nir_tex_src_offset; - } else { - assert(nir_tex_instr_src_size(tex, offset_index) == 2); - nir_ssa_def *orig = nir_ssa_for_src( - b, tex->src[offset_index].src, 2); - tex->src[offset_index].src = - nir_src_for_ssa(nir_iadd(b, orig, offset)); - } - } - tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0)); - tex->src[num_srcs - 1].src_type = nir_tex_src_lod; - - nir_ssa_dest_init(&tex->instr, &tex->dest, - nir_tex_instr_dest_size(tex), 32, NULL); - nir_builder_instr_insert(b, &tex->instr); - - results[i] = nir_channel(b, &tex->dest.ssa, tg4->component); - } - - nir_ssa_def *result = nir_vec4(b, results[0], results[1], results[2], results[3]); - nir_ssa_def_rewrite_uses(&tg4->dest.ssa, nir_src_for_ssa(result)); - - nir_instr_remove(&tg4->instr); - - progress = true; - } - - return progress; -} - -static bool -lower_tg4_func(nir_function_impl *impl) -{ - void *mem_ctx = ralloc_parent(impl); - nir_builder b; - nir_builder_init(&b, impl); - - bool progress = false; - nir_foreach_block_safe(block, impl) { - progress |= lower_tg4(block, &b, mem_ctx); - } - - if (progress) - nir_metadata_preserve(impl, nir_metadata_block_index | - nir_metadata_dominance); - - return progress; -} - -bool -ir3_nir_lower_tg4_to_tex(nir_shader *shader) -{ - bool progress = false; - - nir_foreach_function(function, shader) { - if (function->impl) - progress |= lower_tg4_func(function->impl); - } - - return progress; -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_trig.py b/src/gallium/drivers/freedreno/ir3/ir3_nir_trig.py deleted file mode 100644 index 3968aea543c..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_nir_trig.py +++ /dev/null @@ -1,51 +0,0 @@ -# -# Copyright (C) 2016 Intel Corporation -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, 
merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice (including the next -# paragraph) shall be included in all copies or substantial portions of the -# Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. - -from __future__ import print_function - -import argparse -import sys - -trig_workarounds = [ - (('fsin', 'x'), ('fsin', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))), - (('fcos', 'x'), ('fcos', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))), -] - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('-p', '--import-path', required=True) - args = parser.parse_args() - sys.path.insert(0, args.import_path) - run() - - -def run(): - import nir_algebraic # pylint: disable=import-error - - print('#include "ir3_nir.h"') - print(nir_algebraic.AlgebraicPass("ir3_nir_apply_trig_workarounds", - trig_workarounds).render()) - - -if __name__ == '__main__': - main() diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c deleted file mode 100644 index b6ef6e4b5a7..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_print.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (C) 2014 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a 
- * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark <[email protected]> - */ - -#include <stdarg.h> -#include <stdio.h> - -#include "ir3.h" - -#define PTRID(x) ((unsigned long)(x)) - -static void print_instr_name(struct ir3_instruction *instr) -{ - if (!instr) - return; -#ifdef DEBUG - printf("%04u:", instr->serialno); -#endif - printf("%04u:", instr->name); - printf("%04u:", instr->ip); - printf("%03u: ", instr->depth); - - if (instr->flags & IR3_INSTR_SY) - printf("(sy)"); - if (instr->flags & IR3_INSTR_SS) - printf("(ss)"); - - if (is_meta(instr)) { - switch (instr->opc) { - case OPC_META_INPUT: printf("_meta:in"); break; - case OPC_META_FO: printf("_meta:fo"); break; - case OPC_META_FI: printf("_meta:fi"); break; - - /* shouldn't hit here.. 
just for debugging: */ - default: printf("_meta:%d", instr->opc); break; - } - } else if (instr->opc == OPC_MOV) { - static const char *type[] = { - [TYPE_F16] = "f16", - [TYPE_F32] = "f32", - [TYPE_U16] = "u16", - [TYPE_U32] = "u32", - [TYPE_S16] = "s16", - [TYPE_S32] = "s32", - [TYPE_U8] = "u8", - [TYPE_S8] = "s8", - }; - if (instr->cat1.src_type == instr->cat1.dst_type) - printf("mov"); - else - printf("cov"); - printf(".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]); - } else { - printf("%s", ir3_instr_name(instr)); - if (instr->flags & IR3_INSTR_3D) - printf(".3d"); - if (instr->flags & IR3_INSTR_A) - printf(".a"); - if (instr->flags & IR3_INSTR_O) - printf(".o"); - if (instr->flags & IR3_INSTR_P) - printf(".p"); - if (instr->flags & IR3_INSTR_S) - printf(".s"); - if (instr->flags & IR3_INSTR_S2EN) - printf(".s2en"); - } -} - -static void print_reg_name(struct ir3_register *reg) -{ - if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) && - (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))) - printf("(absneg)"); - else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)) - printf("(neg)"); - else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) - printf("(abs)"); - - if (reg->flags & IR3_REG_IMMED) { - printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); - } else if (reg->flags & IR3_REG_ARRAY) { - printf("arr[id=%u, offset=%d, size=%u", reg->array.id, - reg->array.offset, reg->size); - /* for ARRAY we could have null src, for example first write - * instruction.. 
- */ - if (reg->instr) { - printf(", _["); - print_instr_name(reg->instr); - printf("]"); - } - printf("]"); - } else if (reg->flags & IR3_REG_SSA) { - printf("_["); - print_instr_name(reg->instr); - printf("]"); - } else if (reg->flags & IR3_REG_RELATIV) { - if (reg->flags & IR3_REG_HALF) - printf("h"); - if (reg->flags & IR3_REG_CONST) - printf("c<a0.x + %d>", reg->array.offset); - else - printf("\x1b[0;31mr<a0.x + %d>\x1b[0m (%u)", reg->array.offset, reg->size); - } else { - if (reg->flags & IR3_REG_HALF) - printf("h"); - if (reg->flags & IR3_REG_CONST) - printf("c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]); - else - printf("\x1b[0;31mr%u.%c\x1b[0m", reg_num(reg), "xyzw"[reg_comp(reg)]); - } -} - -static void -tab(int lvl) -{ - for (int i = 0; i < lvl; i++) - printf("\t"); -} - -static void -print_instr(struct ir3_instruction *instr, int lvl) -{ - unsigned i; - - tab(lvl); - - print_instr_name(instr); - for (i = 0; i < instr->regs_count; i++) { - struct ir3_register *reg = instr->regs[i]; - printf(i ? ", " : " "); - print_reg_name(reg); - } - - if (instr->address) { - printf(", address=_"); - printf("["); - print_instr_name(instr->address); - printf("]"); - } - - if (instr->cp.left) { - printf(", left=_"); - printf("["); - print_instr_name(instr->cp.left); - printf("]"); - } - - if (instr->cp.right) { - printf(", right=_"); - printf("["); - print_instr_name(instr->cp.right); - printf("]"); - } - - if (instr->opc == OPC_META_FO) { - printf(", off=%d", instr->fo.off); - } - - if (is_flow(instr) && instr->cat0.target) { - /* the predicate register src is implied: */ - if (instr->opc == OPC_BR) { - printf(" %sp0.x", instr->cat0.inv ? "!" 
: ""); - } - printf(", target=block%u", block_id(instr->cat0.target)); - } - - if (instr->deps_count) { - printf(", false-deps:"); - for (unsigned i = 0; i < instr->deps_count; i++) { - if (i > 0) - printf(", "); - printf("_["); - print_instr_name(instr->deps[i]); - printf("]"); - } - } - - printf("\n"); -} - -void ir3_print_instr(struct ir3_instruction *instr) -{ - print_instr(instr, 0); -} - -static void -print_block(struct ir3_block *block, int lvl) -{ - tab(lvl); printf("block%u {\n", block_id(block)); - - if (block->predecessors_count > 0) { - tab(lvl+1); - printf("pred: "); - for (unsigned i = 0; i < block->predecessors_count; i++) { - if (i) - printf(", "); - printf("block%u", block_id(block->predecessors[i])); - } - printf("\n"); - } - - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - print_instr(instr, lvl+1); - } - - tab(lvl+1); printf("/* keeps:\n"); - for (unsigned i = 0; i < block->keeps_count; i++) { - print_instr(block->keeps[i], lvl+2); - } - tab(lvl+1); printf(" */\n"); - - if (block->successors[1]) { - /* leading into if/else: */ - tab(lvl+1); - printf("/* succs: if _["); - print_instr_name(block->condition); - printf("] block%u; else block%u; */\n", - block_id(block->successors[0]), - block_id(block->successors[1])); - } else if (block->successors[0]) { - tab(lvl+1); - printf("/* succs: block%u; */\n", - block_id(block->successors[0])); - } - tab(lvl); printf("}\n"); -} - -void -ir3_print(struct ir3 *ir) -{ - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) - print_block(block, 0); - - for (unsigned i = 0; i < ir->noutputs; i++) { - if (!ir->outputs[i]) - continue; - printf("out%d: ", i); - print_instr(ir->outputs[i], 0); - } -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c deleted file mode 100644 index ad09c4018d3..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c +++ /dev/null @@ -1,1124 +0,0 @@ -/* - * Copyright (C) 2014 Rob 
Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark <[email protected]> - */ - -#include "util/u_math.h" -#include "util/register_allocate.h" -#include "util/ralloc.h" -#include "util/bitset.h" - -#include "ir3.h" -#include "ir3_compiler.h" - -/* - * Register Assignment: - * - * Uses the register_allocate util, which implements graph coloring - * algo with interference classes. To handle the cases where we need - * consecutive registers (for example, texture sample instructions), - * we model these as larger (double/quad/etc) registers which conflict - * with the corresponding registers in other classes. - * - * Additionally we create additional classes for half-regs, which - * do not conflict with the full-reg classes. We do need at least - * sizes 1-4 (to deal w/ texture sample instructions output to half- - * reg). 
At the moment we don't create the higher order half-reg - * classes as half-reg frequently does not have enough precision - * for texture coords at higher resolutions. - * - * There are some additional cases that we need to handle specially, - * as the graph coloring algo doesn't understand "partial writes". - * For example, a sequence like: - * - * add r0.z, ... - * sam (f32)(xy)r0.x, ... - * ... - * sam (f32)(xyzw)r0.w, r0.x, ... ; 3d texture, so r0.xyz are coord - * - * In this scenario, we treat r0.xyz as class size 3, which is written - * (from a use/def perspective) at the 'add' instruction and ignore the - * subsequent partial writes to r0.xy. So the 'add r0.z, ...' is the - * defining instruction, as it is the first to partially write r0.xyz. - * - * Note i965 has a similar scenario, which they solve with a virtual - * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after - * register assignment. But for us that is horrible from a scheduling - * standpoint. Instead what we do is use idea of 'definer' instruction. - * Ie. the first instruction (lowest ip) to write to the variable is the - * one we consider from use/def perspective when building interference - * graph. (Other instructions which write other variable components - * just define the variable some more.) - * - * Arrays of arbitrary size are handled via pre-coloring a consecutive - * sequence of registers. Additional scalar (single component) reg - * names are allocated starting at ctx->class_base[total_class_count] - * (see arr->base), which are pre-colored. In the use/def graph direct - * access is treated as a single element use/def, and indirect access - * is treated as use or def of all array elements. (Only the first - * def is tracked, in case of multiple indirect writes, etc.) - * - * TODO arrays that fit in one of the pre-defined class sizes should - * not need to be pre-colored, but instead could be given a normal - * vreg name. 
(Ignoring this for now since it is a good way to work - * out the kinks with arbitrary sized arrays.) - * - * TODO might be easier for debugging to split this into two passes, - * the first assigning vreg names in a way that we could ir3_print() - * the result. - */ - -static const unsigned class_sizes[] = { - 1, 2, 3, 4, - 4 + 4, /* txd + 1d/2d */ - 4 + 6, /* txd + 3d */ -}; -#define class_count ARRAY_SIZE(class_sizes) - -static const unsigned half_class_sizes[] = { - 1, 2, 3, 4, -}; -#define half_class_count ARRAY_SIZE(half_class_sizes) - -/* seems to just be used for compute shaders? Seems like vec1 and vec3 - * are sufficient (for now?) - */ -static const unsigned high_class_sizes[] = { - 1, 3, -}; -#define high_class_count ARRAY_SIZE(high_class_sizes) - -#define total_class_count (class_count + half_class_count + high_class_count) - -/* Below a0.x are normal regs. RA doesn't need to assign a0.x/p0.x. */ -#define NUM_REGS (4 * 48) /* r0 to r47 */ -#define NUM_HIGH_REGS (4 * 8) /* r48 to r55 */ -#define FIRST_HIGH_REG (4 * 48) -/* Number of virtual regs in a given class: */ -#define CLASS_REGS(i) (NUM_REGS - (class_sizes[i] - 1)) -#define HALF_CLASS_REGS(i) (NUM_REGS - (half_class_sizes[i] - 1)) -#define HIGH_CLASS_REGS(i) (NUM_HIGH_REGS - (high_class_sizes[i] - 1)) - -#define HALF_OFFSET (class_count) -#define HIGH_OFFSET (class_count + half_class_count) - -/* register-set, created one time, used for all shaders: */ -struct ir3_ra_reg_set { - struct ra_regs *regs; - unsigned int classes[class_count]; - unsigned int half_classes[half_class_count]; - unsigned int high_classes[high_class_count]; - /* maps flat virtual register space to base gpr: */ - uint16_t *ra_reg_to_gpr; - /* maps cls,gpr to flat virtual register space: */ - uint16_t **gpr_to_ra_reg; -}; - -static void -build_q_values(unsigned int **q_values, unsigned off, - const unsigned *sizes, unsigned count) -{ - for (unsigned i = 0; i < count; i++) { - q_values[i + off] = rzalloc_array(q_values, 
unsigned, total_class_count); - - /* From register_allocate.c: - * - * q(B,C) (indexed by C, B is this register class) in - * Runeson/Nyström paper. This is "how many registers of B could - * the worst choice register from C conflict with". - * - * If we just let the register allocation algorithm compute these - * values, is extremely expensive. However, since all of our - * registers are laid out, we can very easily compute them - * ourselves. View the register from C as fixed starting at GRF n - * somewhere in the middle, and the register from B as sliding back - * and forth. Then the first register to conflict from B is the - * one starting at n - class_size[B] + 1 and the last register to - * conflict will start at n + class_size[B] - 1. Therefore, the - * number of conflicts from B is class_size[B] + class_size[C] - 1. - * - * +-+-+-+-+-+-+ +-+-+-+-+-+-+ - * B | | | | | |n| --> | | | | | | | - * +-+-+-+-+-+-+ +-+-+-+-+-+-+ - * +-+-+-+-+-+ - * C |n| | | | | - * +-+-+-+-+-+ - * - * (Idea copied from brw_fs_reg_allocate.cpp) - */ - for (unsigned j = 0; j < count; j++) - q_values[i + off][j + off] = sizes[i] + sizes[j] - 1; - } -} - -/* One-time setup of RA register-set, which describes all the possible - * "virtual" registers and their interferences. Ie. double register - * occupies (and conflicts with) two single registers, and so forth. - * Since registers do not need to be aligned to their class size, they - * can conflict with other registers in the same class too. Ie: - * - * Single (base) | Double - * --------------+--------------- - * R0 | D0 - * R1 | D0 D1 - * R2 | D1 D2 - * R3 | D2 - * .. and so on.. - * - * (NOTE the disassembler uses notation like r0.x/y/z/w but those are - * really just four scalar registers. Don't let that confuse you.) 
- */ -struct ir3_ra_reg_set * -ir3_ra_alloc_reg_set(struct ir3_compiler *compiler) -{ - struct ir3_ra_reg_set *set = rzalloc(compiler, struct ir3_ra_reg_set); - unsigned ra_reg_count, reg, first_half_reg, first_high_reg, base; - unsigned int **q_values; - - /* calculate # of regs across all classes: */ - ra_reg_count = 0; - for (unsigned i = 0; i < class_count; i++) - ra_reg_count += CLASS_REGS(i); - for (unsigned i = 0; i < half_class_count; i++) - ra_reg_count += HALF_CLASS_REGS(i); - for (unsigned i = 0; i < high_class_count; i++) - ra_reg_count += HIGH_CLASS_REGS(i); - - /* allocate and populate q_values: */ - q_values = ralloc_array(set, unsigned *, total_class_count); - - build_q_values(q_values, 0, class_sizes, class_count); - build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count); - build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count); - - /* allocate the reg-set.. */ - set->regs = ra_alloc_reg_set(set, ra_reg_count, true); - set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count); - set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count); - - /* .. 
and classes */ - reg = 0; - for (unsigned i = 0; i < class_count; i++) { - set->classes[i] = ra_alloc_reg_class(set->regs); - - set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i)); - - for (unsigned j = 0; j < CLASS_REGS(i); j++) { - ra_class_add_reg(set->regs, set->classes[i], reg); - - set->ra_reg_to_gpr[reg] = j; - set->gpr_to_ra_reg[i][j] = reg; - - for (unsigned br = j; br < j + class_sizes[i]; br++) - ra_add_transitive_reg_conflict(set->regs, br, reg); - - reg++; - } - } - - first_half_reg = reg; - base = HALF_OFFSET; - - for (unsigned i = 0; i < half_class_count; i++) { - set->half_classes[i] = ra_alloc_reg_class(set->regs); - - set->gpr_to_ra_reg[base + i] = - ralloc_array(set, uint16_t, HALF_CLASS_REGS(i)); - - for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) { - ra_class_add_reg(set->regs, set->half_classes[i], reg); - - set->ra_reg_to_gpr[reg] = j; - set->gpr_to_ra_reg[base + i][j] = reg; - - for (unsigned br = j; br < j + half_class_sizes[i]; br++) - ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg); - - reg++; - } - } - - first_high_reg = reg; - base = HIGH_OFFSET; - - for (unsigned i = 0; i < high_class_count; i++) { - set->high_classes[i] = ra_alloc_reg_class(set->regs); - - set->gpr_to_ra_reg[base + i] = - ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i)); - - for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) { - ra_class_add_reg(set->regs, set->high_classes[i], reg); - - set->ra_reg_to_gpr[reg] = j; - set->gpr_to_ra_reg[base + i][j] = reg; - - for (unsigned br = j; br < j + high_class_sizes[i]; br++) - ra_add_transitive_reg_conflict(set->regs, br + first_high_reg, reg); - - reg++; - } - } - - /* starting a6xx, half precision regs conflict w/ full precision regs: */ - if (compiler->gpu_id >= 600) { - /* because of transitivity, we can get away with just setting up - * conflicts between the first class of full and half regs: - */ - for (unsigned j = 0; j < CLASS_REGS(0) / 2; j++) { - unsigned freg = 
set->gpr_to_ra_reg[0][j]; - unsigned hreg0 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 0]; - unsigned hreg1 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 1]; - - ra_add_transitive_reg_conflict(set->regs, freg, hreg0); - ra_add_transitive_reg_conflict(set->regs, freg, hreg1); - } - - // TODO also need to update q_values, but for now: - ra_set_finalize(set->regs, NULL); - } else { - ra_set_finalize(set->regs, q_values); - } - - ralloc_free(q_values); - - return set; -} - -/* additional block-data (per-block) */ -struct ir3_ra_block_data { - BITSET_WORD *def; /* variables defined before used in block */ - BITSET_WORD *use; /* variables used before defined in block */ - BITSET_WORD *livein; /* which defs reach entry point of block */ - BITSET_WORD *liveout; /* which defs reach exit point of block */ -}; - -/* additional instruction-data (per-instruction) */ -struct ir3_ra_instr_data { - /* cached instruction 'definer' info: */ - struct ir3_instruction *defn; - int off, sz, cls; -}; - -/* register-assign context, per-shader */ -struct ir3_ra_ctx { - struct ir3 *ir; - gl_shader_stage type; - bool frag_face; - - struct ir3_ra_reg_set *set; - struct ra_graph *g; - unsigned alloc_count; - /* one per class, plus one slot for arrays: */ - unsigned class_alloc_count[total_class_count + 1]; - unsigned class_base[total_class_count + 1]; - unsigned instr_cnt; - unsigned *def, *use; /* def/use table */ - struct ir3_ra_instr_data *instrd; -}; - -/* does it conflict? 
*/ -static inline bool -intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end) -{ - return !((a_start >= b_end) || (b_start >= a_end)); -} - -static bool -is_half(struct ir3_instruction *instr) -{ - return !!(instr->regs[0]->flags & IR3_REG_HALF); -} - -static bool -is_high(struct ir3_instruction *instr) -{ - return !!(instr->regs[0]->flags & IR3_REG_HIGH); -} - -static int -size_to_class(unsigned sz, bool half, bool high) -{ - if (high) { - for (unsigned i = 0; i < high_class_count; i++) - if (high_class_sizes[i] >= sz) - return i + HIGH_OFFSET; - } else if (half) { - for (unsigned i = 0; i < half_class_count; i++) - if (half_class_sizes[i] >= sz) - return i + HALF_OFFSET; - } else { - for (unsigned i = 0; i < class_count; i++) - if (class_sizes[i] >= sz) - return i; - } - debug_assert(0); - return -1; -} - -static bool -writes_gpr(struct ir3_instruction *instr) -{ - if (is_store(instr)) - return false; - /* is dest a normal temp register: */ - struct ir3_register *reg = instr->regs[0]; - if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) - return false; - if ((reg->num == regid(REG_A0, 0)) || - (reg->num == regid(REG_P0, 0))) - return false; - return true; -} - -static bool -instr_before(struct ir3_instruction *a, struct ir3_instruction *b) -{ - if (a->flags & IR3_INSTR_UNUSED) - return false; - return (a->ip < b->ip); -} - -static struct ir3_instruction * -get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, - int *sz, int *off) -{ - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - struct ir3_instruction *d = NULL; - - if (id->defn) { - *sz = id->sz; - *off = id->off; - return id->defn; - } - - if (instr->opc == OPC_META_FI) { - /* What about the case where collect is subset of array, we - * need to find the distance between where actual array starts - * and fanin.. that probably doesn't happen currently. 
- */ - struct ir3_register *src; - int dsz, doff; - - /* note: don't use foreach_ssa_src as this gets called once - * while assigning regs (which clears SSA flag) - */ - foreach_src_n(src, n, instr) { - struct ir3_instruction *dd; - if (!src->instr) - continue; - - dd = get_definer(ctx, src->instr, &dsz, &doff); - - if ((!d) || instr_before(dd, d)) { - d = dd; - *sz = dsz; - *off = doff - n; - } - } - - } else if (instr->cp.right || instr->cp.left) { - /* covers also the meta:fo case, which ends up w/ single - * scalar instructions for each component: - */ - struct ir3_instruction *f = ir3_neighbor_first(instr); - - /* by definition, the entire sequence forms one linked list - * of single scalar register nodes (even if some of them may - * be fanouts from a texture sample (for example) instr. We - * just need to walk the list finding the first element of - * the group defined (lowest ip) - */ - int cnt = 0; - - /* need to skip over unused in the group: */ - while (f && (f->flags & IR3_INSTR_UNUSED)) { - f = f->cp.right; - cnt++; - } - - while (f) { - if ((!d) || instr_before(f, d)) - d = f; - if (f == instr) - *off = cnt; - f = f->cp.right; - cnt++; - } - - *sz = cnt; - - } else { - /* second case is looking directly at the instruction which - * produces multiple values (eg, texture sample), rather - * than the fanout nodes that point back to that instruction. - * This isn't quite right, because it may be part of a larger - * group, such as: - * - * sam (f32)(xyzw)r0.x, ... - * add r1.x, ... - * add r1.y, ... - * sam (f32)(xyzw)r2.x, r0.w <-- (r0.w, r1.x, r1.y) - * - * need to come up with a better way to handle that case. 
- */ - if (instr->address) { - *sz = instr->regs[0]->size; - } else { - *sz = util_last_bit(instr->regs[0]->wrmask); - } - *off = 0; - d = instr; - } - - if (d->opc == OPC_META_FO) { - struct ir3_instruction *dd; - int dsz, doff; - - dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff); - - /* by definition, should come before: */ - debug_assert(instr_before(dd, d)); - - *sz = MAX2(*sz, dsz); - - debug_assert(instr->opc == OPC_META_FO); - *off = MAX2(*off, instr->fo.off); - - d = dd; - } - - id->defn = d; - id->sz = *sz; - id->off = *off; - - return d; -} - -static void -ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block) -{ - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - if (instr->regs_count == 0) - continue; - /* couple special cases: */ - if (writes_addr(instr) || writes_pred(instr)) { - id->cls = -1; - } else if (instr->regs[0]->flags & IR3_REG_ARRAY) { - id->cls = total_class_count; - } else { - id->defn = get_definer(ctx, instr, &id->sz, &id->off); - id->cls = size_to_class(id->sz, is_half(id->defn), is_high(id->defn)); - } - } -} - -/* give each instruction a name (and ip), and count up the # of names - * of each class - */ -static void -ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) -{ - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - -#ifdef DEBUG - instr->name = ~0; -#endif - - ctx->instr_cnt++; - - if (instr->regs_count == 0) - continue; - - if (!writes_gpr(instr)) - continue; - - if (id->defn != instr) - continue; - - /* arrays which don't fit in one of the pre-defined class - * sizes are pre-colored: - */ - if ((id->cls >= 0) && (id->cls < total_class_count)) { - instr->name = ctx->class_alloc_count[id->cls]++; - ctx->alloc_count++; - } - } -} - -static void -ra_init(struct ir3_ra_ctx *ctx) -{ - unsigned n, base; - - 
ir3_clear_mark(ctx->ir); - n = ir3_count_instructions(ctx->ir); - - ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n); - - list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { - ra_block_find_definers(ctx, block); - } - - list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { - ra_block_name_instructions(ctx, block); - } - - /* figure out the base register name for each class. The - * actual ra name is class_base[cls] + instr->name; - */ - ctx->class_base[0] = 0; - for (unsigned i = 1; i <= total_class_count; i++) { - ctx->class_base[i] = ctx->class_base[i-1] + - ctx->class_alloc_count[i-1]; - } - - /* and vreg names for array elements: */ - base = ctx->class_base[total_class_count]; - list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { - arr->base = base; - ctx->class_alloc_count[total_class_count] += arr->length; - base += arr->length; - } - ctx->alloc_count += ctx->class_alloc_count[total_class_count]; - - ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count); - ralloc_steal(ctx->g, ctx->instrd); - ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); - ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); -} - -static unsigned -__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn) -{ - unsigned name; - debug_assert(cls >= 0); - debug_assert(cls < total_class_count); /* we shouldn't get arrays here.. 
*/ - name = ctx->class_base[cls] + defn->name; - debug_assert(name < ctx->alloc_count); - return name; -} - -static int -ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id) -{ - /* TODO handle name mapping for arrays */ - return __ra_name(ctx, id->cls, id->defn); -} - -static void -ra_destroy(struct ir3_ra_ctx *ctx) -{ - ralloc_free(ctx->g); -} - -static void -ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) -{ - struct ir3_ra_block_data *bd; - unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); - -#define def(name, instr) \ - do { \ - /* defined on first write: */ \ - if (!ctx->def[name]) \ - ctx->def[name] = instr->ip; \ - ctx->use[name] = instr->ip; \ - BITSET_SET(bd->def, name); \ - } while(0); - -#define use(name, instr) \ - do { \ - ctx->use[name] = MAX2(ctx->use[name], instr->ip); \ - if (!BITSET_TEST(bd->def, name)) \ - BITSET_SET(bd->use, name); \ - } while(0); - - bd = rzalloc(ctx->g, struct ir3_ra_block_data); - - bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words); - bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words); - bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words); - bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words); - - block->data = bd; - - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - struct ir3_instruction *src; - struct ir3_register *reg; - - if (instr->regs_count == 0) - continue; - - /* There are a couple special cases to deal with here: - * - * fanout: used to split values from a higher class to a lower - * class, for example split the results of a texture fetch - * into individual scalar values; We skip over these from - * a 'def' perspective, and for a 'use' we walk the chain - * up to the defining instruction. - * - * fanin: used to collect values from lower class and assemble - * them together into a higher class, for example arguments - * to texture sample instructions; We consider these to be - * defined at the earliest fanin source. 
- * - * Most of this is handled in the get_definer() helper. - * - * In either case, we trace the instruction back to the original - * definer and consider that as the def/use ip. - */ - - if (writes_gpr(instr)) { - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - struct ir3_register *dst = instr->regs[0]; - - if (dst->flags & IR3_REG_ARRAY) { - struct ir3_array *arr = - ir3_lookup_array(ctx->ir, dst->array.id); - unsigned i; - - arr->start_ip = MIN2(arr->start_ip, instr->ip); - arr->end_ip = MAX2(arr->end_ip, instr->ip); - - /* set the node class now.. in case we don't encounter - * this array dst again. From register_alloc algo's - * perspective, these are all single/scalar regs: - */ - for (i = 0; i < arr->length; i++) { - unsigned name = arr->base + i; - ra_set_node_class(ctx->g, name, ctx->set->classes[0]); - } - - /* indirect write is treated like a write to all array - * elements, since we don't know which one is actually - * written: - */ - if (dst->flags & IR3_REG_RELATIV) { - for (i = 0; i < arr->length; i++) { - unsigned name = arr->base + i; - def(name, instr); - } - } else { - unsigned name = arr->base + dst->array.offset; - def(name, instr); - } - - } else if (id->defn == instr) { - unsigned name = ra_name(ctx, id); - - /* since we are in SSA at this point: */ - debug_assert(!BITSET_TEST(bd->use, name)); - - def(name, id->defn); - - if (is_high(id->defn)) { - ra_set_node_class(ctx->g, name, - ctx->set->high_classes[id->cls - HIGH_OFFSET]); - } else if (is_half(id->defn)) { - ra_set_node_class(ctx->g, name, - ctx->set->half_classes[id->cls - HALF_OFFSET]); - } else { - ra_set_node_class(ctx->g, name, - ctx->set->classes[id->cls]); - } - } - } - - foreach_src(reg, instr) { - if (reg->flags & IR3_REG_ARRAY) { - struct ir3_array *arr = - ir3_lookup_array(ctx->ir, reg->array.id); - arr->start_ip = MIN2(arr->start_ip, instr->ip); - arr->end_ip = MAX2(arr->end_ip, instr->ip); - - /* indirect read is treated like a read fromall array - * elements, 
since we don't know which one is actually - * read: - */ - if (reg->flags & IR3_REG_RELATIV) { - unsigned i; - for (i = 0; i < arr->length; i++) { - unsigned name = arr->base + i; - use(name, instr); - } - } else { - unsigned name = arr->base + reg->array.offset; - use(name, instr); - /* NOTE: arrays are not SSA so unconditionally - * set use bit: - */ - BITSET_SET(bd->use, name); - debug_assert(reg->array.offset < arr->length); - } - } else if ((src = ssa(reg)) && writes_gpr(src)) { - unsigned name = ra_name(ctx, &ctx->instrd[src->ip]); - use(name, instr); - } - } - } -} - -static bool -ra_compute_livein_liveout(struct ir3_ra_ctx *ctx) -{ - unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); - bool progress = false; - - list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { - struct ir3_ra_block_data *bd = block->data; - - /* update livein: */ - for (unsigned i = 0; i < bitset_words; i++) { - BITSET_WORD new_livein = - (bd->use[i] | (bd->liveout[i] & ~bd->def[i])); - - if (new_livein & ~bd->livein[i]) { - bd->livein[i] |= new_livein; - progress = true; - } - } - - /* update liveout: */ - for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) { - struct ir3_block *succ = block->successors[j]; - struct ir3_ra_block_data *succ_bd; - - if (!succ) - continue; - - succ_bd = succ->data; - - for (unsigned i = 0; i < bitset_words; i++) { - BITSET_WORD new_liveout = - (succ_bd->livein[i] & ~bd->liveout[i]); - - if (new_liveout) { - bd->liveout[i] |= new_liveout; - progress = true; - } - } - } - } - - return progress; -} - -static void -print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt) -{ - bool first = true; - debug_printf(" %s:", name); - for (unsigned i = 0; i < cnt; i++) { - if (BITSET_TEST(bs, i)) { - if (!first) - debug_printf(","); - debug_printf(" %04u", i); - first = false; - } - } - debug_printf("\n"); -} - -static void -ra_add_interference(struct ir3_ra_ctx *ctx) -{ - struct ir3 *ir = ctx->ir; - - /* initialize array live 
ranges: */ - list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) { - arr->start_ip = ~0; - arr->end_ip = 0; - } - - /* compute live ranges (use/def) on a block level, also updating - * block's def/use bitmasks (used below to calculate per-block - * livein/liveout): - */ - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - ra_block_compute_live_ranges(ctx, block); - } - - /* update per-block livein/liveout: */ - while (ra_compute_livein_liveout(ctx)) {} - - if (ir3_shader_debug & IR3_DBG_OPTMSGS) { - debug_printf("AFTER LIVEIN/OUT:\n"); - ir3_print(ir); - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - struct ir3_ra_block_data *bd = block->data; - debug_printf("block%u:\n", block_id(block)); - print_bitset(" def", bd->def, ctx->alloc_count); - print_bitset(" use", bd->use, ctx->alloc_count); - print_bitset(" l/i", bd->livein, ctx->alloc_count); - print_bitset(" l/o", bd->liveout, ctx->alloc_count); - } - list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) { - debug_printf("array%u:\n", arr->id); - debug_printf(" length: %u\n", arr->length); - debug_printf(" start_ip: %u\n", arr->start_ip); - debug_printf(" end_ip: %u\n", arr->end_ip); - } - } - - /* extend start/end ranges based on livein/liveout info from cfg: */ - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - struct ir3_ra_block_data *bd = block->data; - - for (unsigned i = 0; i < ctx->alloc_count; i++) { - if (BITSET_TEST(bd->livein, i)) { - ctx->def[i] = MIN2(ctx->def[i], block->start_ip); - ctx->use[i] = MAX2(ctx->use[i], block->start_ip); - } - - if (BITSET_TEST(bd->liveout, i)) { - ctx->def[i] = MIN2(ctx->def[i], block->end_ip); - ctx->use[i] = MAX2(ctx->use[i], block->end_ip); - } - } - - list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { - for (unsigned i = 0; i < arr->length; i++) { - if (BITSET_TEST(bd->livein, i + arr->base)) { - arr->start_ip = MIN2(arr->start_ip, 
block->start_ip); - } - if (BITSET_TEST(bd->livein, i + arr->base)) { - arr->end_ip = MAX2(arr->end_ip, block->end_ip); - } - } - } - } - - /* need to fix things up to keep outputs live: */ - for (unsigned i = 0; i < ir->noutputs; i++) { - struct ir3_instruction *instr = ir->outputs[i]; - unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]); - ctx->use[name] = ctx->instr_cnt; - } - - for (unsigned i = 0; i < ctx->alloc_count; i++) { - for (unsigned j = 0; j < ctx->alloc_count; j++) { - if (intersects(ctx->def[i], ctx->use[i], - ctx->def[j], ctx->use[j])) { - ra_add_node_interference(ctx->g, i, j); - } - } - } -} - -/* some instructions need fix-up if dst register is half precision: */ -static void fixup_half_instr_dst(struct ir3_instruction *instr) -{ - switch (opc_cat(instr->opc)) { - case 1: /* move instructions */ - instr->cat1.dst_type = half_type(instr->cat1.dst_type); - break; - case 3: - switch (instr->opc) { - case OPC_MAD_F32: - instr->opc = OPC_MAD_F16; - break; - case OPC_SEL_B32: - instr->opc = OPC_SEL_B16; - break; - case OPC_SEL_S32: - instr->opc = OPC_SEL_S16; - break; - case OPC_SEL_F32: - instr->opc = OPC_SEL_F16; - break; - case OPC_SAD_S32: - instr->opc = OPC_SAD_S16; - break; - /* instructions may already be fixed up: */ - case OPC_MAD_F16: - case OPC_SEL_B16: - case OPC_SEL_S16: - case OPC_SEL_F16: - case OPC_SAD_S16: - break; - default: - assert(0); - break; - } - break; - case 5: - instr->cat5.type = half_type(instr->cat5.type); - break; - } -} -/* some instructions need fix-up if src register is half precision: */ -static void fixup_half_instr_src(struct ir3_instruction *instr) -{ - switch (instr->opc) { - case OPC_MOV: - instr->cat1.src_type = half_type(instr->cat1.src_type); - break; - default: - break; - } -} - -/* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first - * array access(es) which do not have any previous access to depend - * on from scheduling point of view - */ -static void -reg_assign(struct ir3_ra_ctx *ctx, 
struct ir3_register *reg, - struct ir3_instruction *instr) -{ - struct ir3_ra_instr_data *id; - - if (reg->flags & IR3_REG_ARRAY) { - struct ir3_array *arr = - ir3_lookup_array(ctx->ir, reg->array.id); - unsigned name = arr->base + reg->array.offset; - unsigned r = ra_get_node_reg(ctx->g, name); - unsigned num = ctx->set->ra_reg_to_gpr[r]; - - if (reg->flags & IR3_REG_RELATIV) { - reg->array.offset = num; - } else { - reg->num = num; - reg->flags &= ~IR3_REG_SSA; - } - - reg->flags &= ~IR3_REG_ARRAY; - } else if ((id = &ctx->instrd[instr->ip]) && id->defn) { - unsigned name = ra_name(ctx, id); - unsigned r = ra_get_node_reg(ctx->g, name); - unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off; - - debug_assert(!(reg->flags & IR3_REG_RELATIV)); - - if (is_high(id->defn)) - num += FIRST_HIGH_REG; - - reg->num = num; - reg->flags &= ~IR3_REG_SSA; - - if (is_half(id->defn)) - reg->flags |= IR3_REG_HALF; - } -} - -static void -ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) -{ - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - struct ir3_register *reg; - - if (instr->regs_count == 0) - continue; - - if (writes_gpr(instr)) { - reg_assign(ctx, instr->regs[0], instr); - if (instr->regs[0]->flags & IR3_REG_HALF) - fixup_half_instr_dst(instr); - } - - foreach_src_n(reg, n, instr) { - struct ir3_instruction *src = reg->instr; - /* Note: reg->instr could be null for IR3_REG_ARRAY */ - if (!(src || (reg->flags & IR3_REG_ARRAY))) - continue; - reg_assign(ctx, instr->regs[n+1], src); - if (instr->regs[n+1]->flags & IR3_REG_HALF) - fixup_half_instr_src(instr); - } - } -} - -static int -ra_alloc(struct ir3_ra_ctx *ctx) -{ - /* pre-assign array elements: - */ - list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { - unsigned base = 0; - - if (arr->end_ip == 0) - continue; - - /* figure out what else we conflict with which has already - * been assigned: - */ -retry: - list_for_each_entry (struct ir3_array, arr2, 
&ctx->ir->array_list, node) { - if (arr2 == arr) - break; - if (arr2->end_ip == 0) - continue; - /* if it intersects with liverange AND register range.. */ - if (intersects(arr->start_ip, arr->end_ip, - arr2->start_ip, arr2->end_ip) && - intersects(base, base + arr->length, - arr2->reg, arr2->reg + arr2->length)) { - base = MAX2(base, arr2->reg + arr2->length); - goto retry; - } - } - - arr->reg = base; - - for (unsigned i = 0; i < arr->length; i++) { - unsigned name, reg; - - name = arr->base + i; - reg = ctx->set->gpr_to_ra_reg[0][base++]; - - ra_set_node_reg(ctx->g, name, reg); - } - } - - if (!ra_allocate(ctx->g)) - return -1; - - list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { - ra_block_alloc(ctx, block); - } - - return 0; -} - -int ir3_ra(struct ir3 *ir, gl_shader_stage type, - bool frag_coord, bool frag_face) -{ - struct ir3_ra_ctx ctx = { - .ir = ir, - .type = type, - .frag_face = frag_face, - .set = ir->compiler->set, - }; - int ret; - - ra_init(&ctx); - ra_add_interference(&ctx); - ret = ra_alloc(&ctx); - ra_destroy(&ctx); - - return ret; -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c deleted file mode 100644 index 6552980d90c..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c +++ /dev/null @@ -1,818 +0,0 @@ -/* - * Copyright (C) 2014 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or 
substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark <[email protected]> - */ - - -#include "util/u_math.h" - -#include "ir3.h" - -/* - * Instruction Scheduling: - * - * A recursive depth based scheduling algo. Recursively find an eligible - * instruction to schedule from the deepest instruction (recursing through - * it's unscheduled src instructions). Normally this would result in a - * lot of re-traversal of the same instructions, so we cache results in - * instr->data (and clear cached results that would be no longer valid - * after scheduling an instruction). - * - * There are a few special cases that need to be handled, since sched - * is currently independent of register allocation. Usages of address - * register (a0.x) or predicate register (p0.x) must be serialized. Ie. - * if you have two pairs of instructions that write the same special - * register and then read it, then those pairs cannot be interleaved. - * To solve this, when we are in such a scheduling "critical section", - * and we encounter a conflicting write to a special register, we try - * to schedule any remaining instructions that use that value first. 
- */ - -struct ir3_sched_ctx { - struct ir3_block *block; /* the current block */ - struct list_head depth_list; /* depth sorted unscheduled instrs */ - struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/ - struct ir3_instruction *addr; /* current a0.x user, if any */ - struct ir3_instruction *pred; /* current p0.x user, if any */ - bool error; -}; - -static bool is_sfu_or_mem(struct ir3_instruction *instr) -{ - return is_sfu(instr) || is_mem(instr); -} - -#define NULL_INSTR ((void *)~0) - -static void -clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) -{ - list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) { - if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr) - instr2->data = NULL; - } -} - -static void -schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) -{ - debug_assert(ctx->block == instr->block); - - /* maybe there is a better way to handle this than just stuffing - * a nop.. ideally we'd know about this constraint in the - * scheduling and depth calculation.. - */ - if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr)) - ir3_NOP(ctx->block); - - /* remove from depth list: - */ - list_delinit(&instr->node); - - if (writes_addr(instr)) { - debug_assert(ctx->addr == NULL); - ctx->addr = instr; - } - - if (writes_pred(instr)) { - debug_assert(ctx->pred == NULL); - ctx->pred = instr; - } - - instr->flags |= IR3_INSTR_MARK; - - list_addtail(&instr->node, &instr->block->instr_list); - ctx->scheduled = instr; - - if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) { - clear_cache(ctx, NULL); - } else { - /* invalidate only the necessary entries.. 
*/ - clear_cache(ctx, instr); - } -} - -static struct ir3_instruction * -deepest(struct ir3_instruction **srcs, unsigned nsrcs) -{ - struct ir3_instruction *d = NULL; - unsigned i = 0, id = 0; - - while ((i < nsrcs) && !(d = srcs[id = i])) - i++; - - if (!d) - return NULL; - - for (; i < nsrcs; i++) - if (srcs[i] && (srcs[i]->depth > d->depth)) - d = srcs[id = i]; - - srcs[id] = NULL; - - return d; -} - -/** - * @block: the block to search in, starting from end; in first pass, - * this will be the block the instruction would be inserted into - * (but has not yet, ie. it only contains already scheduled - * instructions). For intra-block scheduling (second pass), this - * would be one of the predecessor blocks. - * @instr: the instruction to search for - * @maxd: max distance, bail after searching this # of instruction - * slots, since it means the instruction we are looking for is - * far enough away - * @pred: if true, recursively search into predecessor blocks to - * find the worst case (shortest) distance (only possible after - * individual blocks are all scheduled - */ -static unsigned -distance(struct ir3_block *block, struct ir3_instruction *instr, - unsigned maxd, bool pred) -{ - unsigned d = 0; - - list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) { - if ((n == instr) || (d >= maxd)) - return d; - /* NOTE: don't count branch/jump since we don't know yet if they will - * be eliminated later in resolve_jumps().. really should do that - * earlier so we don't have this constraint. - */ - if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR))) - d++; - } - - /* if coming from a predecessor block, assume it is assigned far - * enough away.. we'll fix up later. 
- */ - if (!pred) - return maxd; - - if (pred && (block->data != block)) { - /* Search into predecessor blocks, finding the one with the - * shortest distance, since that will be the worst case - */ - unsigned min = maxd - d; - - /* (ab)use block->data to prevent recursion: */ - block->data = block; - - for (unsigned i = 0; i < block->predecessors_count; i++) { - unsigned n; - - n = distance(block->predecessors[i], instr, min, pred); - - min = MIN2(min, n); - } - - block->data = NULL; - d += min; - } - - return d; -} - -/* calculate delay for specified src: */ -static unsigned -delay_calc_srcn(struct ir3_block *block, - struct ir3_instruction *assigner, - struct ir3_instruction *consumer, - unsigned srcn, bool soft, bool pred) -{ - unsigned delay = 0; - - if (is_meta(assigner)) { - struct ir3_instruction *src; - foreach_ssa_src(src, assigner) { - unsigned d; - d = delay_calc_srcn(block, src, consumer, srcn, soft, pred); - delay = MAX2(delay, d); - } - } else { - if (soft) { - if (is_sfu(assigner)) { - delay = 4; - } else { - delay = ir3_delayslots(assigner, consumer, srcn); - } - } else { - delay = ir3_delayslots(assigner, consumer, srcn); - } - delay -= distance(block, assigner, delay, pred); - } - - return delay; -} - -/* calculate delay for instruction (maximum of delay for all srcs): */ -static unsigned -delay_calc(struct ir3_block *block, struct ir3_instruction *instr, - bool soft, bool pred) -{ - unsigned delay = 0; - struct ir3_instruction *src; - - foreach_ssa_src_n(src, i, instr) { - unsigned d; - d = delay_calc_srcn(block, src, instr, i, soft, pred); - delay = MAX2(delay, d); - } - - return delay; -} - -struct ir3_sched_notes { - /* there is at least one kill which could be scheduled, except - * for unscheduled bary.f's: - */ - bool blocked_kill; - /* there is at least one instruction that could be scheduled, - * except for conflicting address/predicate register usage: - */ - bool addr_conflict, pred_conflict; -}; - -static bool is_scheduled(struct 
ir3_instruction *instr) -{ - return !!(instr->flags & IR3_INSTR_MARK); -} - -/* could an instruction be scheduled if specified ssa src was scheduled? */ -static bool -could_sched(struct ir3_instruction *instr, struct ir3_instruction *src) -{ - struct ir3_instruction *other_src; - foreach_ssa_src(other_src, instr) { - /* if dependency not scheduled, we aren't ready yet: */ - if ((src != other_src) && !is_scheduled(other_src)) { - return false; - } - } - return true; -} - -/* Check if instruction is ok to schedule. Make sure it is not blocked - * by use of addr/predicate register, etc. - */ -static bool -check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, - struct ir3_instruction *instr) -{ - /* For instructions that write address register we need to - * make sure there is at least one instruction that uses the - * addr value which is otherwise ready. - * - * TODO if any instructions use pred register and have other - * src args, we would need to do the same for writes_pred().. - */ - if (writes_addr(instr)) { - struct ir3 *ir = instr->block->shader; - bool ready = false; - for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) { - struct ir3_instruction *indirect = ir->indirects[i]; - if (!indirect) - continue; - if (indirect->address != instr) - continue; - ready = could_sched(indirect, instr); - } - - /* nothing could be scheduled, so keep looking: */ - if (!ready) - return false; - } - - /* if this is a write to address/predicate register, and that - * register is currently in use, we need to defer until it is - * free: - */ - if (writes_addr(instr) && ctx->addr) { - debug_assert(ctx->addr != instr); - notes->addr_conflict = true; - return false; - } - - if (writes_pred(instr) && ctx->pred) { - debug_assert(ctx->pred != instr); - notes->pred_conflict = true; - return false; - } - - /* if the instruction is a kill, we need to ensure *every* - * bary.f is scheduled. 
The hw seems unhappy if the thread - * gets killed before the end-input (ei) flag is hit. - * - * We could do this by adding each bary.f instruction as - * virtual ssa src for the kill instruction. But we have - * fixed length instr->regs[]. - * - * TODO this wouldn't be quite right if we had multiple - * basic blocks, if any block was conditional. We'd need - * to schedule the bary.f's outside of any block which - * was conditional that contained a kill.. I think.. - */ - if (is_kill(instr)) { - struct ir3 *ir = instr->block->shader; - - for (unsigned i = 0; i < ir->baryfs_count; i++) { - struct ir3_instruction *baryf = ir->baryfs[i]; - if (baryf->flags & IR3_INSTR_UNUSED) - continue; - if (!is_scheduled(baryf)) { - notes->blocked_kill = true; - return false; - } - } - } - - return true; -} - -/* Find the best instruction to schedule from specified instruction or - * recursively it's ssa sources. - */ -static struct ir3_instruction * -find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, - struct ir3_instruction *instr) -{ - struct ir3_instruction *srcs[__ssa_src_cnt(instr)]; - struct ir3_instruction *src; - unsigned nsrcs = 0; - - if (is_scheduled(instr)) - return NULL; - - /* use instr->data to cache the results of recursing up the - * instr src's. Otherwise the recursive algo can scale quite - * badly w/ shader size. But this takes some care to clear - * the cache appropriately when instructions are scheduled. 
- */ - if (instr->data) { - if (instr->data == NULL_INSTR) - return NULL; - return instr->data; - } - - /* find unscheduled srcs: */ - foreach_ssa_src(src, instr) { - if (!is_scheduled(src)) { - debug_assert(nsrcs < ARRAY_SIZE(srcs)); - srcs[nsrcs++] = src; - } - } - - /* if all our src's are already scheduled: */ - if (nsrcs == 0) { - if (check_instr(ctx, notes, instr)) { - instr->data = instr; - return instr; - } - return NULL; - } - - while ((src = deepest(srcs, nsrcs))) { - struct ir3_instruction *candidate; - - candidate = find_instr_recursive(ctx, notes, src); - if (!candidate) - continue; - - if (check_instr(ctx, notes, candidate)) { - instr->data = candidate; - return candidate; - } - } - - instr->data = NULL_INSTR; - return NULL; -} - -/* find instruction to schedule: */ -static struct ir3_instruction * -find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, - bool soft) -{ - struct ir3_instruction *best_instr = NULL; - unsigned min_delay = ~0; - - /* TODO we'd really rather use the list/array of block outputs. But we - * don't have such a thing. Recursing *every* instruction in the list - * will result in a lot of repeated traversal, since instructions will - * get traversed both when they appear as ssa src to a later instruction - * as well as where they appear in the depth_list. - */ - list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) { - struct ir3_instruction *candidate; - unsigned delay; - - candidate = find_instr_recursive(ctx, notes, instr); - if (!candidate) - continue; - - delay = delay_calc(ctx->block, candidate, soft, false); - if (delay < min_delay) { - best_instr = candidate; - min_delay = delay; - } - - if (min_delay == 0) - break; - } - - return best_instr; -} - -/* "spill" the address register by remapping any unscheduled - * instructions which depend on the current address register - * to a clone of the instruction which wrote the address reg. 
- */ -static struct ir3_instruction * -split_addr(struct ir3_sched_ctx *ctx) -{ - struct ir3 *ir; - struct ir3_instruction *new_addr = NULL; - unsigned i; - - debug_assert(ctx->addr); - - ir = ctx->addr->block->shader; - - for (i = 0; i < ir->indirects_count; i++) { - struct ir3_instruction *indirect = ir->indirects[i]; - - if (!indirect) - continue; - - /* skip instructions already scheduled: */ - if (is_scheduled(indirect)) - continue; - - /* remap remaining instructions using current addr - * to new addr: - */ - if (indirect->address == ctx->addr) { - if (!new_addr) { - new_addr = ir3_instr_clone(ctx->addr); - /* original addr is scheduled, but new one isn't: */ - new_addr->flags &= ~IR3_INSTR_MARK; - } - ir3_instr_set_address(indirect, new_addr); - } - } - - /* all remaining indirects remapped to new addr: */ - ctx->addr = NULL; - - return new_addr; -} - -/* "spill" the predicate register by remapping any unscheduled - * instructions which depend on the current predicate register - * to a clone of the instruction which wrote the address reg. - */ -static struct ir3_instruction * -split_pred(struct ir3_sched_ctx *ctx) -{ - struct ir3 *ir; - struct ir3_instruction *new_pred = NULL; - unsigned i; - - debug_assert(ctx->pred); - - ir = ctx->pred->block->shader; - - for (i = 0; i < ir->predicates_count; i++) { - struct ir3_instruction *predicated = ir->predicates[i]; - - /* skip instructions already scheduled: */ - if (is_scheduled(predicated)) - continue; - - /* remap remaining instructions using current pred - * to new pred: - * - * TODO is there ever a case when pred isn't first - * (and only) src? 
- */ - if (ssa(predicated->regs[1]) == ctx->pred) { - if (!new_pred) { - new_pred = ir3_instr_clone(ctx->pred); - /* original pred is scheduled, but new one isn't: */ - new_pred->flags &= ~IR3_INSTR_MARK; - } - predicated->regs[1]->instr = new_pred; - } - } - - /* all remaining predicated remapped to new pred: */ - ctx->pred = NULL; - - return new_pred; -} - -static void -sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) -{ - struct list_head unscheduled_list; - - ctx->block = block; - - /* addr/pred writes are per-block: */ - ctx->addr = NULL; - ctx->pred = NULL; - - /* move all instructions to the unscheduled list, and - * empty the block's instruction list (to which we will - * be inserting). - */ - list_replace(&block->instr_list, &unscheduled_list); - list_inithead(&block->instr_list); - list_inithead(&ctx->depth_list); - - /* first a pre-pass to schedule all meta:input instructions - * (which need to appear first so that RA knows the register is - * occupied), and move remaining to depth sorted list: - */ - list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) { - if (instr->opc == OPC_META_INPUT) { - schedule(ctx, instr); - } else { - ir3_insert_by_depth(instr, &ctx->depth_list); - } - } - - while (!list_empty(&ctx->depth_list)) { - struct ir3_sched_notes notes = {0}; - struct ir3_instruction *instr; - - instr = find_eligible_instr(ctx, ¬es, true); - if (!instr) - instr = find_eligible_instr(ctx, ¬es, false); - - if (instr) { - unsigned delay = delay_calc(ctx->block, instr, false, false); - - /* and if we run out of instructions that can be scheduled, - * then it is time for nop's: - */ - debug_assert(delay <= 6); - while (delay > 0) { - ir3_NOP(block); - delay--; - } - - schedule(ctx, instr); - } else { - struct ir3_instruction *new_instr = NULL; - - /* nothing available to schedule.. 
if we are blocked on - * address/predicate register conflict, then break the - * deadlock by cloning the instruction that wrote that - * reg: - */ - if (notes.addr_conflict) { - new_instr = split_addr(ctx); - } else if (notes.pred_conflict) { - new_instr = split_pred(ctx); - } else { - debug_assert(0); - ctx->error = true; - return; - } - - if (new_instr) { - /* clearing current addr/pred can change what is - * available to schedule, so clear cache.. - */ - clear_cache(ctx, NULL); - - ir3_insert_by_depth(new_instr, &ctx->depth_list); - /* the original instr that wrote addr/pred may have - * originated from a different block: - */ - new_instr->block = block; - } - } - } - - /* And lastly, insert branch/jump instructions to take us to - * the next block. Later we'll strip back out the branches - * that simply jump to next instruction. - */ - if (block->successors[1]) { - /* if/else, conditional branches to "then" or "else": */ - struct ir3_instruction *br; - unsigned delay = 6; - - debug_assert(ctx->pred); - debug_assert(block->condition); - - delay -= distance(ctx->block, ctx->pred, delay, false); - - while (delay > 0) { - ir3_NOP(block); - delay--; - } - - /* create "else" branch first (since "then" block should - * frequently/always end up being a fall-thru): - */ - br = ir3_BR(block); - br->cat0.inv = true; - br->cat0.target = block->successors[1]; - - /* NOTE: we have to hard code delay of 6 above, since - * we want to insert the nop's before constructing the - * branch. Throw in an assert so we notice if this - * ever breaks on future generation: - */ - debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6); - - br = ir3_BR(block); - br->cat0.target = block->successors[0]; - - } else if (block->successors[0]) { - /* otherwise unconditional jump to next block: */ - struct ir3_instruction *jmp; - - jmp = ir3_JUMP(block); - jmp->cat0.target = block->successors[0]; - } - - /* NOTE: if we kept track of the predecessors, we could do a better - * job w/ (jp) flags.. 
every node w/ > predecessor is a join point. - * Note that as we eliminate blocks which contain only an unconditional - * jump we probably need to propagate (jp) flag.. - */ -} - -/* After scheduling individual blocks, we still could have cases where - * one (or more) paths into a block, a value produced by a previous - * has too few delay slots to be legal. We can't deal with this in the - * first pass, because loops (ie. we can't ensure all predecessor blocks - * are already scheduled in the first pass). All we can really do at - * this point is stuff in extra nop's until things are legal. - */ -static void -sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) -{ - unsigned n = 0; - - ctx->block = block; - - list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) { - unsigned delay = 0; - - for (unsigned i = 0; i < block->predecessors_count; i++) { - unsigned d = delay_calc(block->predecessors[i], instr, false, true); - delay = MAX2(d, delay); - } - - while (delay > n) { - struct ir3_instruction *nop = ir3_NOP(block); - - /* move to before instr: */ - list_delinit(&nop->node); - list_addtail(&nop->node, &instr->node); - - n++; - } - - /* we can bail once we hit worst case delay: */ - if (++n > 6) - break; - } -} - -int ir3_sched(struct ir3 *ir) -{ - struct ir3_sched_ctx ctx = {0}; - - ir3_clear_mark(ir); - - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - sched_block(&ctx, block); - } - - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - sched_intra_block(&ctx, block); - } - - if (ctx.error) - return -1; - return 0; -} - -/* does instruction 'prior' need to be scheduled before 'instr'? 
*/ -static bool -depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior) -{ - /* TODO for dependencies that are related to a specific object, ie - * a specific SSBO/image/array, we could relax this constraint to - * make accesses to unrelated objects not depend on each other (at - * least as long as not declared coherent) - */ - if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) || - ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class)) - return true; - return !!(instr->barrier_class & prior->barrier_conflict); -} - -static void -add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr) -{ - struct list_head *prev = instr->node.prev; - struct list_head *next = instr->node.next; - - /* add dependencies on previous instructions that must be scheduled - * prior to the current instruction - */ - while (prev != &block->instr_list) { - struct ir3_instruction *pi = - LIST_ENTRY(struct ir3_instruction, prev, node); - - prev = prev->prev; - - if (is_meta(pi)) - continue; - - if (instr->barrier_class == pi->barrier_class) { - ir3_instr_add_dep(instr, pi); - break; - } - - if (depends_on(instr, pi)) - ir3_instr_add_dep(instr, pi); - } - - /* add dependencies on this instruction to following instructions - * that must be scheduled after the current instruction: - */ - while (next != &block->instr_list) { - struct ir3_instruction *ni = - LIST_ENTRY(struct ir3_instruction, next, node); - - next = next->next; - - if (is_meta(ni)) - continue; - - if (instr->barrier_class == ni->barrier_class) { - ir3_instr_add_dep(ni, instr); - break; - } - - if (depends_on(ni, instr)) - ir3_instr_add_dep(ni, instr); - } -} - -/* before scheduling a block, we need to add any necessary false-dependencies - * to ensure that: - * - * (1) barriers are scheduled in the right order wrt instructions related - * to the barrier - * - * (2) reads that come before a write actually get scheduled before the - * write - */ -static 
void -calculate_deps(struct ir3_block *block) -{ - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - if (instr->barrier_class) { - add_barrier_deps(block, instr); - } - } -} - -void -ir3_sched_add_deps(struct ir3 *ir) -{ - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - calculate_deps(block); - } -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c deleted file mode 100644 index b58a204c6b9..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c +++ /dev/null @@ -1,436 +0,0 @@ -/* - * Copyright (C) 2014 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Authors: - * Rob Clark <[email protected]> - */ - -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_format.h" - -#include "freedreno_util.h" - -#include "ir3_shader.h" -#include "ir3_compiler.h" -#include "ir3_nir.h" - -int -ir3_glsl_type_size(const struct glsl_type *type) -{ - return glsl_count_attribute_slots(type, false); -} - -static void -delete_variant(struct ir3_shader_variant *v) -{ - if (v->ir) - ir3_destroy(v->ir); - if (v->bo) - fd_bo_del(v->bo); - if (v->immediates) - free(v->immediates); - free(v); -} - -/* for vertex shader, the inputs are loaded into registers before the shader - * is executed, so max_regs from the shader instructions might not properly - * reflect the # of registers actually used, especially in case passthrough - * varyings. - * - * Likewise, for fragment shader, we can have some regs which are passed - * input values but never touched by the resulting shader (ie. as result - * of dead code elimination or simply because we don't know how to turn - * the reg off. - */ -static void -fixup_regfootprint(struct ir3_shader_variant *v) -{ - unsigned i; - - for (i = 0; i < v->inputs_count; i++) { - /* skip frag inputs fetch via bary.f since their reg's are - * not written by gpu before shader starts (and in fact the - * regid's might not even be valid) - */ - if (v->inputs[i].bary) - continue; - - /* ignore high regs that are global to all threads in a warp - * (they exist by default) (a5xx+) - */ - if (v->inputs[i].regid >= regid(48,0)) - continue; - - if (v->inputs[i].compmask) { - unsigned n = util_last_bit(v->inputs[i].compmask) - 1; - int32_t regid = (v->inputs[i].regid + n) >> 2; - v->info.max_reg = MAX2(v->info.max_reg, regid); - } - } - - for (i = 0; i < v->outputs_count; i++) { - int32_t regid = (v->outputs[i].regid + 3) >> 2; - v->info.max_reg = MAX2(v->info.max_reg, regid); - } -} - -/* wrapper for ir3_assemble() which does some info fixup based on - * shader state. 
Non-static since used by ir3_cmdline too. - */ -void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id) -{ - void *bin; - - bin = ir3_assemble(v->ir, &v->info, gpu_id); - if (!bin) - return NULL; - - if (gpu_id >= 400) { - v->instrlen = v->info.sizedwords / (2 * 16); - } else { - v->instrlen = v->info.sizedwords / (2 * 4); - } - - /* NOTE: if relative addressing is used, we set constlen in - * the compiler (to worst-case value) since we don't know in - * the assembler what the max addr reg value can be: - */ - v->constlen = MIN2(255, MAX2(v->constlen, v->info.max_const + 1)); - - fixup_regfootprint(v); - - return bin; -} - -static void -assemble_variant(struct ir3_shader_variant *v) -{ - struct ir3_compiler *compiler = v->shader->compiler; - uint32_t gpu_id = compiler->gpu_id; - uint32_t sz, *bin; - - bin = ir3_shader_assemble(v, gpu_id); - sz = v->info.sizedwords * 4; - - v->bo = fd_bo_new(compiler->dev, sz, - DRM_FREEDRENO_GEM_CACHE_WCOMBINE | - DRM_FREEDRENO_GEM_TYPE_KMEM); - - memcpy(fd_bo_map(v->bo), bin, sz); - - if (ir3_shader_debug & IR3_DBG_DISASM) { - struct ir3_shader_key key = v->key; - printf("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type, - v->binning_pass, key.color_two_side, key.half_precision); - ir3_shader_disasm(v, bin, stdout); - } - - if (shader_debug_enabled(v->shader->type)) { - fprintf(stderr, "Native code for unnamed %s shader %s:\n", - _mesa_shader_stage_to_string(v->shader->type), - v->shader->nir->info.name); - if (v->shader->type == MESA_SHADER_FRAGMENT) - fprintf(stderr, "SIMD0\n"); - ir3_shader_disasm(v, bin, stderr); - } - - free(bin); - - /* no need to keep the ir around beyond this point: */ - ir3_destroy(v->ir); - v->ir = NULL; -} - -static struct ir3_shader_variant * -create_variant(struct ir3_shader *shader, struct ir3_shader_key *key, - bool binning_pass) -{ - struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant); - int ret; - - if (!v) - return NULL; - - v->id = ++shader->variant_count; - 
v->shader = shader; - v->binning_pass = binning_pass; - v->key = *key; - v->type = shader->type; - - ret = ir3_compile_shader_nir(shader->compiler, v); - if (ret) { - debug_error("compile failed!"); - goto fail; - } - - assemble_variant(v); - if (!v->bo) { - debug_error("assemble failed!"); - goto fail; - } - - return v; - -fail: - delete_variant(v); - return NULL; -} - -static inline struct ir3_shader_variant * -shader_variant(struct ir3_shader *shader, struct ir3_shader_key *key, - bool *created) -{ - struct ir3_shader_variant *v; - - *created = false; - - for (v = shader->variants; v; v = v->next) - if (ir3_shader_key_equal(key, &v->key)) - return v; - - /* compile new variant if it doesn't exist already: */ - v = create_variant(shader, key, false); - if (v) { - v->next = shader->variants; - shader->variants = v; - *created = true; - } - - return v; -} - -struct ir3_shader_variant * -ir3_shader_get_variant(struct ir3_shader *shader, struct ir3_shader_key *key, - bool binning_pass, bool *created) -{ - struct ir3_shader_variant *v = - shader_variant(shader, key, created); - - if (binning_pass) { - if (!v->binning) - v->binning = create_variant(shader, key, true); - return v->binning; - } - - return v; -} - -void -ir3_shader_destroy(struct ir3_shader *shader) -{ - struct ir3_shader_variant *v, *t; - for (v = shader->variants; v; ) { - t = v; - v = v->next; - delete_variant(t); - } - ralloc_free(shader->nir); - free(shader); -} - -struct ir3_shader * -ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir) -{ - struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader); - - shader->compiler = compiler; - shader->id = ++shader->compiler->shader_count; - shader->type = nir->info.stage; - - NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size, - (nir_lower_io_options)0); - - /* do first pass optimization, ignoring the key: */ - shader->nir = ir3_optimize_nir(shader, nir, NULL); - if (ir3_shader_debug & IR3_DBG_DISASM) { - printf("dump nir%d: type=%d", 
shader->id, shader->type); - nir_print_shader(shader->nir, stdout); - } - - return shader; -} - -static void dump_reg(FILE *out, const char *name, uint32_t r) -{ - if (r != regid(63,0)) - fprintf(out, "; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]); -} - -static void dump_output(FILE *out, struct ir3_shader_variant *so, - unsigned slot, const char *name) -{ - uint32_t regid; - regid = ir3_find_output_regid(so, slot); - dump_reg(out, name, regid); -} - -void -ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out) -{ - struct ir3 *ir = so->ir; - struct ir3_register *reg; - const char *type = ir3_shader_stage(so->shader); - uint8_t regid; - unsigned i; - - for (i = 0; i < ir->ninputs; i++) { - if (!ir->inputs[i]) { - fprintf(out, "; in%d unused\n", i); - continue; - } - reg = ir->inputs[i]->regs[0]; - regid = reg->num; - fprintf(out, "@in(%sr%d.%c)\tin%d\n", - (reg->flags & IR3_REG_HALF) ? "h" : "", - (regid >> 2), "xyzw"[regid & 0x3], i); - } - - for (i = 0; i < ir->noutputs; i++) { - if (!ir->outputs[i]) { - fprintf(out, "; out%d unused\n", i); - continue; - } - /* kill shows up as a virtual output.. skip it! */ - if (is_kill(ir->outputs[i])) - continue; - reg = ir->outputs[i]->regs[0]; - regid = reg->num; - fprintf(out, "@out(%sr%d.%c)\tout%d\n", - (reg->flags & IR3_REG_HALF) ? 
"h" : "", - (regid >> 2), "xyzw"[regid & 0x3], i); - } - - for (i = 0; i < so->immediates_count; i++) { - fprintf(out, "@const(c%d.x)\t", so->constbase.immediate + i); - fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n", - so->immediates[i].val[0], - so->immediates[i].val[1], - so->immediates[i].val[2], - so->immediates[i].val[3]); - } - - disasm_a3xx(bin, so->info.sizedwords, 0, out); - - switch (so->type) { - case MESA_SHADER_VERTEX: - fprintf(out, "; %s: outputs:", type); - for (i = 0; i < so->outputs_count; i++) { - uint8_t regid = so->outputs[i].regid; - fprintf(out, " r%d.%c (%s)", - (regid >> 2), "xyzw"[regid & 0x3], - gl_varying_slot_name(so->outputs[i].slot)); - } - fprintf(out, "\n"); - fprintf(out, "; %s: inputs:", type); - for (i = 0; i < so->inputs_count; i++) { - uint8_t regid = so->inputs[i].regid; - fprintf(out, " r%d.%c (cm=%x,il=%u,b=%u)", - (regid >> 2), "xyzw"[regid & 0x3], - so->inputs[i].compmask, - so->inputs[i].inloc, - so->inputs[i].bary); - } - fprintf(out, "\n"); - break; - case MESA_SHADER_FRAGMENT: - fprintf(out, "; %s: outputs:", type); - for (i = 0; i < so->outputs_count; i++) { - uint8_t regid = so->outputs[i].regid; - fprintf(out, " r%d.%c (%s)", - (regid >> 2), "xyzw"[regid & 0x3], - gl_frag_result_name(so->outputs[i].slot)); - } - fprintf(out, "\n"); - fprintf(out, "; %s: inputs:", type); - for (i = 0; i < so->inputs_count; i++) { - uint8_t regid = so->inputs[i].regid; - fprintf(out, " r%d.%c (%s,cm=%x,il=%u,b=%u)", - (regid >> 2), "xyzw"[regid & 0x3], - gl_varying_slot_name(so->inputs[i].slot), - so->inputs[i].compmask, - so->inputs[i].inloc, - so->inputs[i].bary); - } - fprintf(out, "\n"); - break; - default: - /* TODO */ - break; - } - - /* print generic shader info: */ - fprintf(out, "; %s prog %d/%d: %u instructions, %d half, %d full\n", - type, so->shader->id, so->id, - so->info.instrs_count, - so->info.max_half_reg + 1, - so->info.max_reg + 1); - - fprintf(out, "; %d const, %u constlen\n", - so->info.max_const + 1, - 
so->constlen); - - fprintf(out, "; %u (ss), %u (sy)\n", so->info.ss, so->info.sy); - - /* print shader type specific info: */ - switch (so->type) { - case MESA_SHADER_VERTEX: - dump_output(out, so, VARYING_SLOT_POS, "pos"); - dump_output(out, so, VARYING_SLOT_PSIZ, "psize"); - break; - case MESA_SHADER_FRAGMENT: - dump_reg(out, "pos (bary)", - ir3_find_sysval_regid(so, SYSTEM_VALUE_VARYING_COORD)); - dump_output(out, so, FRAG_RESULT_DEPTH, "posz"); - if (so->color0_mrt) { - dump_output(out, so, FRAG_RESULT_COLOR, "color"); - } else { - dump_output(out, so, FRAG_RESULT_DATA0, "data0"); - dump_output(out, so, FRAG_RESULT_DATA1, "data1"); - dump_output(out, so, FRAG_RESULT_DATA2, "data2"); - dump_output(out, so, FRAG_RESULT_DATA3, "data3"); - dump_output(out, so, FRAG_RESULT_DATA4, "data4"); - dump_output(out, so, FRAG_RESULT_DATA5, "data5"); - dump_output(out, so, FRAG_RESULT_DATA6, "data6"); - dump_output(out, so, FRAG_RESULT_DATA7, "data7"); - } - /* these two are hard-coded since we don't know how to - * program them to anything but all 0's... 
- */ - if (so->frag_coord) - fprintf(out, "; fragcoord: r0.x\n"); - if (so->frag_face) - fprintf(out, "; fragface: hr0.x\n"); - break; - default: - /* TODO */ - break; - } - - fprintf(out, "\n"); -} - -uint64_t -ir3_shader_outputs(const struct ir3_shader *so) -{ - return so->nir->info.outputs_written; -} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h deleted file mode 100644 index bc47160d6ea..00000000000 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ /dev/null @@ -1,587 +0,0 @@ -/* - * Copyright (C) 2014 Rob Clark <[email protected]> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Authors: - * Rob Clark <[email protected]> - */ - -#ifndef IR3_SHADER_H_ -#define IR3_SHADER_H_ - -#include <stdio.h> - -#include "compiler/shader_enums.h" -#include "compiler/nir/nir.h" -#include "util/bitscan.h" - -#include "ir3.h" - -struct glsl_type; - -/* driver param indices: */ -enum ir3_driver_param { - /* compute shader driver params: */ - IR3_DP_NUM_WORK_GROUPS_X = 0, - IR3_DP_NUM_WORK_GROUPS_Y = 1, - IR3_DP_NUM_WORK_GROUPS_Z = 2, - IR3_DP_LOCAL_GROUP_SIZE_X = 4, - IR3_DP_LOCAL_GROUP_SIZE_Y = 5, - IR3_DP_LOCAL_GROUP_SIZE_Z = 6, - /* NOTE: gl_NumWorkGroups should be vec4 aligned because - * glDispatchComputeIndirect() needs to load these from - * the info->indirect buffer. Keep that in mind when/if - * adding any addition CS driver params. - */ - IR3_DP_CS_COUNT = 8, /* must be aligned to vec4 */ - - /* vertex shader driver params: */ - IR3_DP_VTXID_BASE = 0, - IR3_DP_VTXCNT_MAX = 1, - /* user-clip-plane components, up to 8x vec4's: */ - IR3_DP_UCP0_X = 4, - /* .... */ - IR3_DP_UCP7_W = 35, - IR3_DP_VS_COUNT = 36 /* must be aligned to vec4 */ -}; - -#define IR3_MAX_SHADER_BUFFERS 32 -#define IR3_MAX_SHADER_IMAGES 32 -#define IR3_MAX_SO_BUFFERS 4 -#define IR3_MAX_SO_OUTPUTS 64 - -/** - * For consts needed to pass internal values to shader which may or may not - * be required, rather than allocating worst-case const space, we scan the - * shader and allocate consts as-needed: - * - * + SSBO sizes: only needed if shader has a get_buffer_size intrinsic - * for a given SSBO - * - * + Image dimensions: needed to calculate pixel offset, but only for - * images that have a image_store intrinsic - */ -struct ir3_driver_const_layout { - struct { - uint32_t mask; /* bitmask of SSBOs that have get_buffer_size */ - uint32_t count; /* number of consts allocated */ - /* one const allocated per SSBO which has get_buffer_size, - * ssbo_sizes.off[ssbo_id] is offset from start of ssbo_sizes - * consts: - */ - uint32_t off[IR3_MAX_SHADER_BUFFERS]; - } ssbo_size; - - 
struct { - uint32_t mask; /* bitmask of images that have image_store */ - uint32_t count; /* number of consts allocated */ - /* three const allocated per image which has image_store: - * + cpp (bytes per pixel) - * + pitch (y pitch) - * + array_pitch (z pitch) - */ - uint32_t off[IR3_MAX_SHADER_IMAGES]; - } image_dims; -}; - -/** - * A single output for vertex transform feedback. - */ -struct ir3_stream_output { - unsigned register_index:6; /**< 0 to 63 (OUT index) */ - unsigned start_component:2; /** 0 to 3 */ - unsigned num_components:3; /** 1 to 4 */ - unsigned output_buffer:3; /**< 0 to PIPE_MAX_SO_BUFFERS */ - unsigned dst_offset:16; /**< offset into the buffer in dwords */ - unsigned stream:2; /**< 0 to 3 */ -}; - -/** - * Stream output for vertex transform feedback. - */ -struct ir3_stream_output_info { - unsigned num_outputs; - /** stride for an entire vertex for each buffer in dwords */ - uint16_t stride[IR3_MAX_SO_BUFFERS]; - - /** - * Array of stream outputs, in the order they are to be written in. - * Selected components are tightly packed into the output buffer. - */ - struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS]; -}; - -/* Configuration key used to identify a shader variant.. different - * shader variants can be used to implement features not supported - * in hw (two sided color), binning-pass vertex shader, etc. - */ -struct ir3_shader_key { - union { - struct { - /* - * Combined Vertex/Fragment shader parameters: - */ - unsigned ucp_enables : 8; - - /* do we need to check {v,f}saturate_{s,t,r}? 
*/ - unsigned has_per_samp : 1; - - /* - * Vertex shader variant parameters: - */ - unsigned vclamp_color : 1; - - /* - * Fragment shader variant parameters: - */ - unsigned color_two_side : 1; - unsigned half_precision : 1; - /* used when shader needs to handle flat varyings (a4xx) - * for front/back color inputs to frag shader: - */ - unsigned rasterflat : 1; - unsigned fclamp_color : 1; - }; - uint32_t global; - }; - - /* bitmask of sampler which needs coords clamped for vertex - * shader: - */ - uint16_t vsaturate_s, vsaturate_t, vsaturate_r; - - /* bitmask of sampler which needs coords clamped for frag - * shader: - */ - uint16_t fsaturate_s, fsaturate_t, fsaturate_r; - - /* bitmask of ms shifts */ - uint32_t vsamples, fsamples; - - /* bitmask of samplers which need astc srgb workaround: */ - uint16_t vastc_srgb, fastc_srgb; -}; - -static inline bool -ir3_shader_key_equal(struct ir3_shader_key *a, struct ir3_shader_key *b) -{ - /* slow-path if we need to check {v,f}saturate_{s,t,r} */ - if (a->has_per_samp || b->has_per_samp) - return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0; - return a->global == b->global; -} - -/* will the two keys produce different lowering for a fragment shader? 
*/ -static inline bool -ir3_shader_key_changes_fs(struct ir3_shader_key *key, struct ir3_shader_key *last_key) -{ - if (last_key->has_per_samp || key->has_per_samp) { - if ((last_key->fsaturate_s != key->fsaturate_s) || - (last_key->fsaturate_t != key->fsaturate_t) || - (last_key->fsaturate_r != key->fsaturate_r) || - (last_key->fsamples != key->fsamples) || - (last_key->fastc_srgb != key->fastc_srgb)) - return true; - } - - if (last_key->fclamp_color != key->fclamp_color) - return true; - - if (last_key->color_two_side != key->color_two_side) - return true; - - if (last_key->half_precision != key->half_precision) - return true; - - if (last_key->rasterflat != key->rasterflat) - return true; - - if (last_key->ucp_enables != key->ucp_enables) - return true; - - return false; -} - -/* will the two keys produce different lowering for a vertex shader? */ -static inline bool -ir3_shader_key_changes_vs(struct ir3_shader_key *key, struct ir3_shader_key *last_key) -{ - if (last_key->has_per_samp || key->has_per_samp) { - if ((last_key->vsaturate_s != key->vsaturate_s) || - (last_key->vsaturate_t != key->vsaturate_t) || - (last_key->vsaturate_r != key->vsaturate_r) || - (last_key->vsamples != key->vsamples) || - (last_key->vastc_srgb != key->vastc_srgb)) - return true; - } - - if (last_key->vclamp_color != key->vclamp_color) - return true; - - if (last_key->ucp_enables != key->ucp_enables) - return true; - - return false; -} - -/* clears shader-key flags which don't apply to the given shader - * stage - */ -static inline void -ir3_normalize_key(struct ir3_shader_key *key, gl_shader_stage type) -{ - switch (type) { - case MESA_SHADER_FRAGMENT: - if (key->has_per_samp) { - key->vsaturate_s = 0; - key->vsaturate_t = 0; - key->vsaturate_r = 0; - key->vastc_srgb = 0; - key->vsamples = 0; - } - break; - case MESA_SHADER_VERTEX: - key->color_two_side = false; - key->half_precision = false; - key->rasterflat = false; - if (key->has_per_samp) { - key->fsaturate_s = 0; - 
key->fsaturate_t = 0; - key->fsaturate_r = 0; - key->fastc_srgb = 0; - key->fsamples = 0; - } - break; - default: - /* TODO */ - break; - } - -} - -struct ir3_shader_variant { - struct fd_bo *bo; - - /* variant id (for debug) */ - uint32_t id; - - struct ir3_shader_key key; - - /* vertex shaders can have an extra version for hwbinning pass, - * which is pointed to by so->binning: - */ - bool binning_pass; - struct ir3_shader_variant *binning; - - struct ir3_driver_const_layout const_layout; - struct ir3_info info; - struct ir3 *ir; - - /* the instructions length is in units of instruction groups - * (4 instructions for a3xx, 16 instructions for a4xx.. each - * instruction is 2 dwords): - */ - unsigned instrlen; - - /* the constants length is in units of vec4's, and is the sum of - * the uniforms and the built-in compiler constants - */ - unsigned constlen; - - /* number of uniforms (in vec4), not including built-in compiler - * constants, etc. - */ - unsigned num_uniforms; - - unsigned num_ubos; - - /* About Linkage: - * + Let the frag shader determine the position/compmask for the - * varyings, since it is the place where we know if the varying - * is actually used, and if so, which components are used. So - * what the hw calls "outloc" is taken from the "inloc" of the - * frag shader. - * + From the vert shader, we only need the output regid - */ - - bool frag_coord, frag_face, color0_mrt; - - /* NOTE: for input/outputs, slot is: - * gl_vert_attrib - for VS inputs - * gl_varying_slot - for VS output / FS input - * gl_frag_result - for FS output - */ - - /* varyings/outputs: */ - unsigned outputs_count; - struct { - uint8_t slot; - uint8_t regid; - } outputs[16 + 2]; /* +POSITION +PSIZE */ - bool writes_pos, writes_psize; - - /* attributes (VS) / varyings (FS): - * Note that sysval's should come *after* normal inputs. - */ - unsigned inputs_count; - struct { - uint8_t slot; - uint8_t regid; - uint8_t compmask; - uint8_t ncomp; - /* location of input (ie. 
offset passed to bary.f, etc). This - * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx - * have the OUTLOCn value offset by 8, presumably to account - * for gl_Position/gl_PointSize) - */ - uint8_t inloc; - /* vertex shader specific: */ - bool sysval : 1; /* slot is a gl_system_value */ - /* fragment shader specific: */ - bool bary : 1; /* fetched varying (vs one loaded into reg) */ - bool rasterflat : 1; /* special handling for emit->rasterflat */ - enum glsl_interp_mode interpolate; - } inputs[16 + 2]; /* +POSITION +FACE */ - - /* sum of input components (scalar). For frag shaders, it only counts - * the varying inputs: - */ - unsigned total_in; - - /* For frag shaders, the total number of inputs (not scalar, - * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR) - */ - unsigned varying_in; - - /* number of samplers/textures (which are currently 1:1): */ - int num_samp; - - /* do we have one or more SSBO instructions: */ - bool has_ssbo; - - /* do we have kill instructions: */ - bool has_kill; - - /* Layout of constant registers, each section (in vec4). Pointer size - * is 32b (a3xx, a4xx), or 64b (a5xx+), which effects the size of the - * UBO and stream-out consts. 
- */ - struct { - /* user const start at zero */ - unsigned ubo; - /* NOTE that a3xx might need a section for SSBO addresses too */ - unsigned ssbo_sizes; - unsigned image_dims; - unsigned driver_param; - unsigned tfbo; - unsigned immediate; - } constbase; - - unsigned immediates_count; - unsigned immediates_size; - struct { - uint32_t val[4]; - } *immediates; - - /* for astc srgb workaround, the number/base of additional - * alpha tex states we need, and index of original tex states - */ - struct { - unsigned base, count; - unsigned orig_idx[16]; - } astc_srgb; - - /* shader variants form a linked list: */ - struct ir3_shader_variant *next; - - /* replicated here to avoid passing extra ptrs everywhere: */ - gl_shader_stage type; - struct ir3_shader *shader; -}; - -struct ir3_shader { - gl_shader_stage type; - - /* shader id (for debug): */ - uint32_t id; - uint32_t variant_count; - - /* so we know when we can disable TGSI related hacks: */ - bool from_tgsi; - - struct ir3_compiler *compiler; - - struct nir_shader *nir; - struct ir3_stream_output_info stream_output; - - struct ir3_shader_variant *variants; -}; - -void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id); -struct ir3_shader_variant * ir3_shader_get_variant(struct ir3_shader *shader, - struct ir3_shader_key *key, bool binning_pass, bool *created); -struct ir3_shader * ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir); -void ir3_shader_destroy(struct ir3_shader *shader); -void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out); -uint64_t ir3_shader_outputs(const struct ir3_shader *so); - -int -ir3_glsl_type_size(const struct glsl_type *type); - -static inline const char * -ir3_shader_stage(struct ir3_shader *shader) -{ - switch (shader->type) { - case MESA_SHADER_VERTEX: return "VERT"; - case MESA_SHADER_FRAGMENT: return "FRAG"; - case MESA_SHADER_COMPUTE: return "CL"; - default: - unreachable("invalid type"); - return NULL; - } -} - -/* - * 
Helper/util: - */ - -static inline int -ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot) -{ - int j; - - for (j = 0; j < so->outputs_count; j++) - if (so->outputs[j].slot == slot) - return j; - - /* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n] - * in the vertex shader.. but the fragment shader doesn't know this - * so it will always have both IN.COLOR[n] and IN.BCOLOR[n]. So - * at link time if there is no matching OUT.BCOLOR[n], we must map - * OUT.COLOR[n] to IN.BCOLOR[n]. And visa versa if there is only - * a OUT.BCOLOR[n] but no matching OUT.COLOR[n] - */ - if (slot == VARYING_SLOT_BFC0) { - slot = VARYING_SLOT_COL0; - } else if (slot == VARYING_SLOT_BFC1) { - slot = VARYING_SLOT_COL1; - } else if (slot == VARYING_SLOT_COL0) { - slot = VARYING_SLOT_BFC0; - } else if (slot == VARYING_SLOT_COL1) { - slot = VARYING_SLOT_BFC1; - } else { - return 0; - } - - for (j = 0; j < so->outputs_count; j++) - if (so->outputs[j].slot == slot) - return j; - - debug_assert(0); - - return 0; -} - -static inline int -ir3_next_varying(const struct ir3_shader_variant *so, int i) -{ - while (++i < so->inputs_count) - if (so->inputs[i].compmask && so->inputs[i].bary) - break; - return i; -} - -struct ir3_shader_linkage { - uint8_t max_loc; - uint8_t cnt; - struct { - uint8_t regid; - uint8_t compmask; - uint8_t loc; - } var[32]; -}; - -static inline void -ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid, uint8_t compmask, uint8_t loc) -{ - int i = l->cnt++; - - debug_assert(i < ARRAY_SIZE(l->var)); - - l->var[i].regid = regid; - l->var[i].compmask = compmask; - l->var[i].loc = loc; - l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask)); -} - -static inline void -ir3_link_shaders(struct ir3_shader_linkage *l, - const struct ir3_shader_variant *vs, - const struct ir3_shader_variant *fs) -{ - int j = -1, k; - - while (l->cnt < ARRAY_SIZE(l->var)) { - j = ir3_next_varying(fs, j); - - if (j >= fs->inputs_count) - break; - - if 
(fs->inputs[j].inloc >= fs->total_in) - continue; - - k = ir3_find_output(vs, fs->inputs[j].slot); - - ir3_link_add(l, vs->outputs[k].regid, - fs->inputs[j].compmask, fs->inputs[j].inloc); - } -} - -static inline uint32_t -ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot) -{ - int j; - for (j = 0; j < so->outputs_count; j++) - if (so->outputs[j].slot == slot) - return so->outputs[j].regid; - return regid(63, 0); -} - -static inline uint32_t -ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot) -{ - int j; - for (j = 0; j < so->inputs_count; j++) - if (so->inputs[j].sysval && (so->inputs[j].slot == slot)) - return so->inputs[j].regid; - return regid(63, 0); -} - -/* calculate register footprint in terms of half-regs (ie. one full - * reg counts as two half-regs). - */ -static inline uint32_t -ir3_shader_halfregs(const struct ir3_shader_variant *v) -{ - return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1); -} - -#endif /* IR3_SHADER_H_ */ diff --git a/src/gallium/drivers/freedreno/meson.build b/src/gallium/drivers/freedreno/meson.build index 797ba081758..f996126e386 100644 --- a/src/gallium/drivers/freedreno/meson.build +++ b/src/gallium/drivers/freedreno/meson.build @@ -18,18 +18,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-ir3_nir_trig_c = custom_target( - 'ir3_nir_trig.c', - input : 'ir3/ir3_nir_trig.py', - output : 'ir3_nir_trig.c', - command : [ - prog_python, '@INPUT@', - '-p', join_paths(meson.source_root(), 'src/compiler/nir/'), - ], - capture : true, - depend_files : nir_algebraic_py, -) - files_libfreedreno = files( 'adreno_common.xml.h', 'adreno_pm4.xml.h', @@ -215,35 +203,15 @@ files_libfreedreno = files( 'a6xx/fd6_texture.h', 'a6xx/fd6_zsa.c', 'a6xx/fd6_zsa.h', - 'ir3/disasm-a3xx.c', - 'ir3/instr-a3xx.h', - 'ir3/ir3.c', 'ir3/ir3_cache.c', 'ir3/ir3_cache.h', - 'ir3/ir3_compiler_nir.c', - 'ir3/ir3_compiler.c', - 'ir3/ir3_compiler.h', - 'ir3/ir3_cp.c', - 'ir3/ir3_depth.c', 'ir3/ir3_gallium.c', 'ir3/ir3_gallium.h', - 'ir3/ir3_group.c', - 'ir3/ir3.h', - 'ir3/ir3_legalize.c', - 'ir3/ir3_nir.c', - 'ir3/ir3_nir.h', - 'ir3/ir3_nir_lower_tg4_to_tex.c', - 'ir3/ir3_print.c', - 'ir3/ir3_ra.c', - 'ir3/ir3_sched.c', - 'ir3/ir3_shader.c', - 'ir3/ir3_shader.h', ) freedreno_includes = [ inc_src, inc_include, inc_gallium, inc_gallium_aux, - inc_freedreno, - include_directories('ir3') + inc_freedreno, include_directories('ir3'), ] freedreno_c_args = [] @@ -258,7 +226,7 @@ endif libfreedreno = static_library( 'freedreno', - [files_libfreedreno, ir3_nir_trig_c], + [files_libfreedreno], include_directories : freedreno_includes, c_args : [freedreno_c_args, c_vis_args], cpp_args : [freedreno_cpp_args, cpp_vis_args], @@ -273,6 +241,7 @@ driver_freedreno = declare_dependency( libfreedrenowinsys, libfreedreno, libfreedreno_drm, + libfreedreno_ir3, ], dependencies : idep_nir, ) @@ -288,6 +257,7 @@ ir3_compiler = executable( link_with : [ libfreedreno, libfreedreno_drm, + libfreedreno_ir3, libgallium, libglsl_standalone, libmesa_util, |