author     Rob Clark <[email protected]>  2014-07-25 11:15:59 -0400
committer  Rob Clark <[email protected]>  2014-07-25 13:29:28 -0400
commit     db193e5ad06e7a2fbcffb3bb5df85d212eb12291 (patch)
tree       58d1ec24c0af7b1acb1477eeaababe3d7eda6019 /src/gallium/drivers/freedreno/ir3
parent     7d7e6ae9c3544ce1889aa9b8a34545c6f42017e7 (diff)
freedreno/ir3: split out shader compiler from a3xx

Move the bits we want to share between generations from fd3_program
to ir3_shader.  So the overall structure is:

  fdN_shader_stateobj -> ir3_shader -> ir3_shader_variant -> ir3
                                   |- ...
                                   \- ir3_shader_variant -> ir3

So the ir3_shader becomes the topmost generation-neutral object,
which manages the set of variants, each of which generates, compiles,
and assembles its own ir.

There is a bit of additional renaming, s/fd3_compiler/ir3_compiler/, etc.

Keep the split between the gallium-level stateobj and the shader
helper object, because it might be a good idea to pre-compute some
generation-specific register values (ie. anything that is independent
of linking).

Signed-off-by: Rob Clark <[email protected]>
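
To make that ownership chain concrete, a minimal illustrative sketch
(struct layouts simplified; the field names here are stand-ins, not the
actual definitions from ir3_shader.h in this series):

    struct ir3;                            /* the IR for one compiled variant */

    struct ir3_shader_variant {
            struct ir3_shader_variant *next;   /* next variant in the set */
            struct ir3 *ir;                    /* each variant generates, compiles,
                                                * and assembles its own ir */
    };

    struct ir3_shader {                    /* topmost generation-neutral object */
            struct ir3_shader_variant *variants;
    };

    struct fd3_shader_stateobj {           /* gallium-level, generation-specific */
            struct ir3_shader *shader;
    };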
Diffstat (limited to 'src/gallium/drivers/freedreno/ir3')
-rw-r--r--  src/gallium/drivers/freedreno/ir3/disasm-a3xx.c        805
-rw-r--r--  src/gallium/drivers/freedreno/ir3/instr-a3xx.h         691
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3.c                675
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3.h                480
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_compiler.c      2639
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_compiler.h        42
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c  1524
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_cp.c             158
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_depth.c          159
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_dump.c           425
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_flatten.c        155
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_ra.c             790
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_sched.c          401
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_shader.c         211
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_shader.h         163
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_visitor.h        154
16 files changed, 9472 insertions(+), 0 deletions(-)
diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
new file mode 100644
index 00000000000..8c3704bf658
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
@@ -0,0 +1,805 @@
+/*
+ * Copyright (c) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+
+#include <util/u_debug.h>
+
+#include "disasm.h"
+#include "instr-a3xx.h"
+
+static enum debug_t debug;
+
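+/* route all printf output in this file through gallium's debug_printf: */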
+#define printf debug_printf
+
+static const char *levels[] = {
+ "",
+ "\t",
+ "\t\t",
+ "\t\t\t",
+ "\t\t\t\t",
+ "\t\t\t\t\t",
+ "\t\t\t\t\t\t",
+ "\t\t\t\t\t\t\t",
+ "\t\t\t\t\t\t\t\t",
+ "\t\t\t\t\t\t\t\t\t",
+ "x",
+ "x",
+ "x",
+ "x",
+ "x",
+ "x",
+};
+
+static const char *component = "xyzw";
+
+static const char *type[] = {
+ [TYPE_F16] = "f16",
+ [TYPE_F32] = "f32",
+ [TYPE_U16] = "u16",
+ [TYPE_U32] = "u32",
+ [TYPE_S16] = "s16",
+ [TYPE_S32] = "s32",
+ [TYPE_U8] = "u8",
+ [TYPE_S8] = "s8",
+};
+
+static void print_reg(reg_t reg, bool full, bool r, bool c, bool im,
+ bool neg, bool abs, bool addr_rel)
+{
+ const char type = c ? 'c' : 'r';
+
+ // XXX I prefer - and || for neg/abs, but preserving format used
+ // by libllvm-a3xx for easy diffing..
+
+ if (abs && neg)
+ printf("(absneg)");
+ else if (neg)
+ printf("(neg)");
+ else if (abs)
+ printf("(abs)");
+
+ if (r)
+ printf("(r)");
+
+ if (im) {
+ printf("%d", reg.iim_val);
+ } else if (addr_rel) {
+ /* I would just use %+d but trying to make it diff'able with
+ * libllvm-a3xx...
+ */
+ if (reg.iim_val < 0)
+ printf("%s%c<a0.x - %d>", full ? "" : "h", type, -reg.iim_val);
+ else if (reg.iim_val > 0)
+ printf("%s%c<a0.x + %d>", full ? "" : "h", type, reg.iim_val);
+ else
+ printf("%s%c<a0.x>", full ? "" : "h", type);
+ } else if ((reg.num == REG_A0) && !c) {
+ printf("a0.%c", component[reg.comp]);
+ } else if ((reg.num == REG_P0) && !c) {
+ printf("p0.%c", component[reg.comp]);
+ } else {
+ printf("%s%c%d.%c", full ? "" : "h", type, reg.num, component[reg.comp]);
+ }
+}
+
+
+/* current instruction repeat flag: */
+static unsigned repeat;
+
+static void print_reg_dst(reg_t reg, bool full, bool addr_rel)
+{
+ print_reg(reg, full, false, false, false, false, false, addr_rel);
+}
+
+static void print_reg_src(reg_t reg, bool full, bool r, bool c, bool im,
+ bool neg, bool abs, bool addr_rel)
+{
+ print_reg(reg, full, r, c, im, neg, abs, addr_rel);
+}
+
+static void print_instr_cat0(instr_t *instr)
+{
+ instr_cat0_t *cat0 = &instr->cat0;
+
+ switch (cat0->opc) {
+ case OPC_KILL:
+ printf(" %sp0.%c", cat0->inv ? "!" : "",
+ component[cat0->comp]);
+ break;
+ case OPC_BR:
+ printf(" %sp0.%c, #%d", cat0->inv ? "!" : "",
+ component[cat0->comp], cat0->immed);
+ break;
+ case OPC_JUMP:
+ case OPC_CALL:
+ printf(" #%d", cat0->immed);
+ break;
+ }
+
+ if ((debug & PRINT_VERBOSE) && (cat0->dummy1|cat0->dummy2|cat0->dummy3|cat0->dummy4))
+ printf("\t{0: %x,%x,%x,%x}", cat0->dummy1, cat0->dummy2, cat0->dummy3, cat0->dummy4);
+}
+
+static void print_instr_cat1(instr_t *instr)
+{
+ instr_cat1_t *cat1 = &instr->cat1;
+
+ if (cat1->ul)
+ printf("(ul)");
+
+ if (cat1->src_type == cat1->dst_type) {
+ if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) {
+ /* special case (nmemonic?): */
+ printf("mova");
+ } else {
+ printf("mov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
+ }
+ } else {
+ printf("cov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
+ }
+
+ printf(" ");
+
+ if (cat1->even)
+ printf("(even)");
+
+ if (cat1->pos_inf)
+ printf("(pos_infinity)");
+
+ print_reg_dst((reg_t)(cat1->dst), type_size(cat1->dst_type) == 32,
+ cat1->dst_rel);
+
+ printf(", ");
+
+ /* ugg, have to special case this.. vs print_reg().. */
+ if (cat1->src_im) {
+ if (type_float(cat1->src_type))
+ printf("(%f)", cat1->fim_val);
+ else
+ printf("%d", cat1->iim_val);
+ } else if (cat1->src_rel && !cat1->src_c) {
+ /* I would just use %+d but trying to make it diff'able with
+ * libllvm-a3xx...
+ */
+ char type = cat1->src_rel_c ? 'c' : 'r';
+ if (cat1->off < 0)
+ printf("%c<a0.x - %d>", type, -cat1->off);
+ else if (cat1->off > 0)
+ printf("%c<a0.x + %d>", type, cat1->off);
+ else
+ printf("c<a0.x>");
+ } else {
+ print_reg_src((reg_t)(cat1->src), type_size(cat1->src_type) == 32,
+ cat1->src_r, cat1->src_c, cat1->src_im, false, false, false);
+ }
+
+ if ((debug & PRINT_VERBOSE) && (cat1->must_be_0))
+ printf("\t{1: %x}", cat1->must_be_0);
+}
+
+static void print_instr_cat2(instr_t *instr)
+{
+ instr_cat2_t *cat2 = &instr->cat2;
+ static const char *cond[] = {
+ "lt",
+ "le",
+ "gt",
+ "ge",
+ "eq",
+ "ne",
+ "?6?",
+ };
+
+ switch (cat2->opc) {
+ case OPC_CMPS_F:
+ case OPC_CMPS_U:
+ case OPC_CMPS_S:
+ case OPC_CMPV_F:
+ case OPC_CMPV_U:
+ case OPC_CMPV_S:
+ printf(".%s", cond[cat2->cond]);
+ break;
+ }
+
+ printf(" ");
+ if (cat2->ei)
+ printf("(ei)");
+ print_reg_dst((reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false);
+ printf(", ");
+
+ if (cat2->c1.src1_c) {
+ print_reg_src((reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r,
+ cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg,
+ cat2->src1_abs, false);
+ } else if (cat2->rel1.src1_rel) {
+ print_reg_src((reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r,
+ cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg,
+ cat2->src1_abs, cat2->rel1.src1_rel);
+ } else {
+ print_reg_src((reg_t)(cat2->src1), cat2->full, cat2->src1_r,
+ false, cat2->src1_im, cat2->src1_neg,
+ cat2->src1_abs, false);
+ }
+
+ switch (cat2->opc) {
+ case OPC_ABSNEG_F:
+ case OPC_ABSNEG_S:
+ case OPC_CLZ_B:
+ case OPC_CLZ_S:
+ case OPC_SIGN_F:
+ case OPC_FLOOR_F:
+ case OPC_CEIL_F:
+ case OPC_RNDNE_F:
+ case OPC_RNDAZ_F:
+ case OPC_TRUNC_F:
+ case OPC_NOT_B:
+ case OPC_BFREV_B:
+ case OPC_SETRM:
+ case OPC_CBITS_B:
+ /* these only have one src reg */
+ break;
+ default:
+ printf(", ");
+ if (cat2->c2.src2_c) {
+ print_reg_src((reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r,
+ cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg,
+ cat2->src2_abs, false);
+ } else if (cat2->rel2.src2_rel) {
+ print_reg_src((reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r,
+ cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg,
+ cat2->src2_abs, cat2->rel2.src2_rel);
+ } else {
+ print_reg_src((reg_t)(cat2->src2), cat2->full, cat2->src2_r,
+ false, cat2->src2_im, cat2->src2_neg,
+ cat2->src2_abs, false);
+ }
+ break;
+ }
+}
+
+static void print_instr_cat3(instr_t *instr)
+{
+ instr_cat3_t *cat3 = &instr->cat3;
+ bool full = instr_cat3_full(cat3);
+
+ printf(" ");
+ print_reg_dst((reg_t)(cat3->dst), full ^ cat3->dst_half, false);
+ printf(", ");
+ if (cat3->c1.src1_c) {
+ print_reg_src((reg_t)(cat3->c1.src1), full,
+ cat3->src1_r, cat3->c1.src1_c, false, cat3->src1_neg,
+ false, false);
+ } else if (cat3->rel1.src1_rel) {
+ print_reg_src((reg_t)(cat3->rel1.src1), full,
+ cat3->src1_r, cat3->rel1.src1_c, false, cat3->src1_neg,
+ false, cat3->rel1.src1_rel);
+ } else {
+ print_reg_src((reg_t)(cat3->src1), full,
+ cat3->src1_r, false, false, cat3->src1_neg,
+ false, false);
+ }
+ printf(", ");
+ print_reg_src((reg_t)cat3->src2, full,
+ cat3->src2_r, cat3->src2_c, false, cat3->src2_neg,
+ false, false);
+ printf(", ");
+ if (cat3->c2.src3_c) {
+ print_reg_src((reg_t)(cat3->c2.src3), full,
+ cat3->src3_r, cat3->c2.src3_c, false, cat3->src3_neg,
+ false, false);
+ } else if (cat3->rel2.src3_rel) {
+ print_reg_src((reg_t)(cat3->rel2.src3), full,
+ cat3->src3_r, cat3->rel2.src3_c, false, cat3->src3_neg,
+ false, cat3->rel2.src3_rel);
+ } else {
+ print_reg_src((reg_t)(cat3->src3), full,
+ cat3->src3_r, false, false, cat3->src3_neg,
+ false, false);
+ }
+}
+
+static void print_instr_cat4(instr_t *instr)
+{
+ instr_cat4_t *cat4 = &instr->cat4;
+
+ printf(" ");
+ print_reg_dst((reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false);
+ printf(", ");
+
+ if (cat4->c.src_c) {
+ print_reg_src((reg_t)(cat4->c.src), cat4->full,
+ cat4->src_r, cat4->c.src_c, cat4->src_im,
+ cat4->src_neg, cat4->src_abs, false);
+ } else if (cat4->rel.src_rel) {
+ print_reg_src((reg_t)(cat4->rel.src), cat4->full,
+ cat4->src_r, cat4->rel.src_c, cat4->src_im,
+ cat4->src_neg, cat4->src_abs, cat4->rel.src_rel);
+ } else {
+ print_reg_src((reg_t)(cat4->src), cat4->full,
+ cat4->src_r, false, cat4->src_im,
+ cat4->src_neg, cat4->src_abs, false);
+ }
+
+ if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2))
+ printf("\t{4: %x,%x}", cat4->dummy1, cat4->dummy2);
+}
+
+static void print_instr_cat5(instr_t *instr)
+{
+ static const struct {
+ bool src1, src2, samp, tex;
+ } info[0x1f] = {
+ [OPC_ISAM] = { true, false, true, true, },
+ [OPC_ISAML] = { true, true, true, true, },
+ [OPC_ISAMM] = { true, false, true, true, },
+ [OPC_SAM] = { true, false, true, true, },
+ [OPC_SAMB] = { true, true, true, true, },
+ [OPC_SAML] = { true, true, true, true, },
+ [OPC_SAMGQ] = { true, false, true, true, },
+ [OPC_GETLOD] = { true, false, true, true, },
+ [OPC_CONV] = { true, true, true, true, },
+ [OPC_CONVM] = { true, true, true, true, },
+ [OPC_GETSIZE] = { true, false, false, true, },
+ [OPC_GETBUF] = { false, false, false, true, },
+ [OPC_GETPOS] = { true, false, false, true, },
+ [OPC_GETINFO] = { false, false, false, true, },
+ [OPC_DSX] = { true, false, false, false, },
+ [OPC_DSY] = { true, false, false, false, },
+ [OPC_GATHER4R] = { true, false, true, true, },
+ [OPC_GATHER4G] = { true, false, true, true, },
+ [OPC_GATHER4B] = { true, false, true, true, },
+ [OPC_GATHER4A] = { true, false, true, true, },
+ [OPC_SAMGP0] = { true, false, true, true, },
+ [OPC_SAMGP1] = { true, false, true, true, },
+ [OPC_SAMGP2] = { true, false, true, true, },
+ [OPC_SAMGP3] = { true, false, true, true, },
+ [OPC_DSXPP_1] = { true, false, false, false, },
+ [OPC_DSYPP_1] = { true, false, false, false, },
+ [OPC_RGETPOS] = { false, false, false, false, },
+ [OPC_RGETINFO] = { false, false, false, false, },
+ };
+ instr_cat5_t *cat5 = &instr->cat5;
+ int i;
+
+ if (cat5->is_3d) printf(".3d");
+ if (cat5->is_a) printf(".a");
+ if (cat5->is_o) printf(".o");
+ if (cat5->is_p) printf(".p");
+ if (cat5->is_s) printf(".s");
+ if (cat5->is_s2en) printf(".s2en");
+
+ printf(" ");
+
+ switch (cat5->opc) {
+ case OPC_DSXPP_1:
+ case OPC_DSYPP_1:
+ break;
+ default:
+ printf("(%s)", type[cat5->type]);
+ break;
+ }
+
+ printf("(");
+ for (i = 0; i < 4; i++)
+ if (cat5->wrmask & (1 << i))
+ printf("%c", "xyzw"[i]);
+ printf(")");
+
+ print_reg_dst((reg_t)(cat5->dst), type_size(cat5->type) == 32, false);
+
+ if (info[cat5->opc].src1) {
+ printf(", ");
+ print_reg_src((reg_t)(cat5->src1), cat5->full, false, false, false,
+ false, false, false);
+ }
+
+ if (cat5->is_s2en) {
+ printf(", ");
+ print_reg_src((reg_t)(cat5->s2en.src2), cat5->full, false, false, false,
+ false, false, false);
+ printf(", ");
+ print_reg_src((reg_t)(cat5->s2en.src3), false, false, false, false,
+ false, false, false);
+ } else {
+ if (cat5->is_o || info[cat5->opc].src2) {
+ printf(", ");
+ print_reg_src((reg_t)(cat5->norm.src2), cat5->full,
+ false, false, false, false, false, false);
+ }
+ if (info[cat5->opc].samp)
+ printf(", s#%d", cat5->norm.samp);
+ if (info[cat5->opc].tex)
+ printf(", t#%d", cat5->norm.tex);
+ }
+
+ if (debug & PRINT_VERBOSE) {
+ if (cat5->is_s2en) {
+ if ((debug & PRINT_VERBOSE) && (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2))
+ printf("\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2);
+ } else {
+ if ((debug & PRINT_VERBOSE) && (cat5->norm.dummy1|cat5->dummy2))
+ printf("\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2);
+ }
+ }
+}
+
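+/* sign-extend an 'nbits'-wide value to a full int32_t: */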
+static int32_t u2i(uint32_t val, int nbits)
+{
+ return ((val >> (nbits-1)) * ~((1 << nbits) - 1)) | val;
+}
+
+static void print_instr_cat6(instr_t *instr)
+{
+ instr_cat6_t *cat6 = &instr->cat6;
+
+ printf(".%s ", type[cat6->type]);
+
+ switch (cat6->opc) {
+ case OPC_LDG:
+ case OPC_LDP:
+ case OPC_LDL:
+ case OPC_LDLW:
+ case OPC_LDLV:
+ /* load instructions: */
+ print_reg_dst((reg_t)(cat6->a.dst), type_size(cat6->type) == 32, false);
+ printf(",");
+ switch (cat6->opc) {
+ case OPC_LDG:
+ printf("g");
+ break;
+ case OPC_LDP:
+ printf("p");
+ break;
+ case OPC_LDL:
+ case OPC_LDLW:
+ case OPC_LDLV:
+ printf("l");
+ break;
+ }
+ printf("[");
+ print_reg_src((reg_t)(cat6->a.src), true,
+ false, false, false, false, false, false);
+ if (cat6->a.off)
+ printf("%+d", cat6->a.off);
+ printf("]");
+ break;
+ case OPC_PREFETCH:
+ /* similar to load instructions: */
+ printf("g[");
+ print_reg_src((reg_t)(cat6->a.src), true,
+ false, false, false, false, false, false);
+ if (cat6->a.off)
+ printf("%+d", cat6->a.off);
+ printf("]");
+ break;
+ case OPC_STG:
+ case OPC_STP:
+ case OPC_STL:
+ case OPC_STLW:
+ /* store instructions: */
+ switch (cat6->opc) {
+ case OPC_STG:
+ printf("g");
+ break;
+ case OPC_STP:
+ printf("p");
+ break;
+ case OPC_STL:
+ case OPC_STLW:
+ printf("l");
+ break;
+ }
+ printf("[");
+ print_reg_dst((reg_t)(cat6->b.dst), true, false);
+ if (cat6->b.off || cat6->b.off_hi)
+ printf("%+d", u2i((cat6->b.off_hi << 8) | cat6->b.off, 13));
+ printf("]");
+ printf(",");
+ print_reg_src((reg_t)(cat6->b.src), type_size(cat6->type) == 32,
+ false, false, false, false, false, false);
+
+ break;
+ case OPC_STI:
+ /* sti has same encoding as other store instructions, but
+ * slightly different syntax:
+ */
+ print_reg_dst((reg_t)(cat6->b.dst), false /* XXX is it always half? */, false);
+ if (cat6->b.off || cat6->b.off_hi)
+ printf("%+d", u2i((cat6->b.off_hi << 8) | cat6->b.off, 13));
+ printf(",");
+ print_reg_src((reg_t)(cat6->b.src), type_size(cat6->type) == 32,
+ false, false, false, false, false, false);
+ break;
+ }
+
+ printf(", %d", cat6->iim_val);
+
+ if (debug & PRINT_VERBOSE) {
+ switch (cat6->opc) {
+ case OPC_LDG:
+ case OPC_LDP:
+ /* load instructions: */
+ if (cat6->a.dummy1|cat6->a.dummy2|cat6->a.dummy3)
+ printf("\t{6: %x,%x,%x}", cat6->a.dummy1, cat6->a.dummy2, cat6->a.dummy3);
+ if ((cat6->a.must_be_one1 != 1) || (cat6->a.must_be_one2 != 1))
+ printf("{?? %d,%d ??}", cat6->a.must_be_one1, cat6->a.must_be_one2);
+ break;
+ case OPC_STG:
+ case OPC_STP:
+ case OPC_STI:
+ /* store instructions: */
+ if (cat6->b.dummy1|cat6->b.dummy2)
+ printf("\t{6: %x,%x}", cat6->b.dummy1, cat6->b.dummy2);
+ if ((cat6->b.must_be_one1 != 1) || (cat6->b.must_be_one2 != 1) ||
+ (cat6->b.must_be_zero1 != 0))
+ printf("{?? %d,%d,%d ??}", cat6->b.must_be_one1, cat6->b.must_be_one2,
+ cat6->b.must_be_zero1);
+ break;
+ }
+ }
+}
+
+/* size of largest OPC field of all the instruction categories: */
+#define NOPC_BITS 6
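+/* the opcs[] table below is indexed by ((cat << NOPC_BITS) | opc);
+ * e.g. cat2 add.f lands at (2 << 6) | 0 == 128:
+ */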
+
+struct opc_info {
+ uint16_t cat;
+ uint16_t opc;
+ const char *name;
+ void (*print)(instr_t *instr);
+} opcs[1 << (3+NOPC_BITS)] = {
+#define OPC(cat, opc, name) [((cat) << NOPC_BITS) | (opc)] = { (cat), (opc), #name, print_instr_cat##cat }
+ /* category 0: */
+ OPC(0, OPC_NOP, nop),
+ OPC(0, OPC_BR, br),
+ OPC(0, OPC_JUMP, jump),
+ OPC(0, OPC_CALL, call),
+ OPC(0, OPC_RET, ret),
+ OPC(0, OPC_KILL, kill),
+ OPC(0, OPC_END, end),
+ OPC(0, OPC_EMIT, emit),
+ OPC(0, OPC_CUT, cut),
+ OPC(0, OPC_CHMASK, chmask),
+ OPC(0, OPC_CHSH, chsh),
+ OPC(0, OPC_FLOW_REV, flow_rev),
+
+ /* category 1: */
+ OPC(1, 0, ),
+
+ /* category 2: */
+ OPC(2, OPC_ADD_F, add.f),
+ OPC(2, OPC_MIN_F, min.f),
+ OPC(2, OPC_MAX_F, max.f),
+ OPC(2, OPC_MUL_F, mul.f),
+ OPC(2, OPC_SIGN_F, sign.f),
+ OPC(2, OPC_CMPS_F, cmps.f),
+ OPC(2, OPC_ABSNEG_F, absneg.f),
+ OPC(2, OPC_CMPV_F, cmpv.f),
+ OPC(2, OPC_FLOOR_F, floor.f),
+ OPC(2, OPC_CEIL_F, ceil.f),
+ OPC(2, OPC_RNDNE_F, rndne.f),
+ OPC(2, OPC_RNDAZ_F, rndaz.f),
+ OPC(2, OPC_TRUNC_F, trunc.f),
+ OPC(2, OPC_ADD_U, add.u),
+ OPC(2, OPC_ADD_S, add.s),
+ OPC(2, OPC_SUB_U, sub.u),
+ OPC(2, OPC_SUB_S, sub.s),
+ OPC(2, OPC_CMPS_U, cmps.u),
+ OPC(2, OPC_CMPS_S, cmps.s),
+ OPC(2, OPC_MIN_U, min.u),
+ OPC(2, OPC_MIN_S, min.s),
+ OPC(2, OPC_MAX_U, max.u),
+ OPC(2, OPC_MAX_S, max.s),
+ OPC(2, OPC_ABSNEG_S, absneg.s),
+ OPC(2, OPC_AND_B, and.b),
+ OPC(2, OPC_OR_B, or.b),
+ OPC(2, OPC_NOT_B, not.b),
+ OPC(2, OPC_XOR_B, xor.b),
+ OPC(2, OPC_CMPV_U, cmpv.u),
+ OPC(2, OPC_CMPV_S, cmpv.s),
+ OPC(2, OPC_MUL_U, mul.u),
+ OPC(2, OPC_MUL_S, mul.s),
+ OPC(2, OPC_MULL_U, mull.u),
+ OPC(2, OPC_BFREV_B, bfrev.b),
+ OPC(2, OPC_CLZ_S, clz.s),
+ OPC(2, OPC_CLZ_B, clz.b),
+ OPC(2, OPC_SHL_B, shl.b),
+ OPC(2, OPC_SHR_B, shr.b),
+ OPC(2, OPC_ASHR_B, ashr.b),
+ OPC(2, OPC_BARY_F, bary.f),
+ OPC(2, OPC_MGEN_B, mgen.b),
+ OPC(2, OPC_GETBIT_B, getbit.b),
+ OPC(2, OPC_SETRM, setrm),
+ OPC(2, OPC_CBITS_B, cbits.b),
+ OPC(2, OPC_SHB, shb),
+ OPC(2, OPC_MSAD, msad),
+
+ /* category 3: */
+ OPC(3, OPC_MAD_U16, mad.u16),
+ OPC(3, OPC_MADSH_U16, madsh.u16),
+ OPC(3, OPC_MAD_S16, mad.s16),
+ OPC(3, OPC_MADSH_M16, madsh.m16),
+ OPC(3, OPC_MAD_U24, mad.u24),
+ OPC(3, OPC_MAD_S24, mad.s24),
+ OPC(3, OPC_MAD_F16, mad.f16),
+ OPC(3, OPC_MAD_F32, mad.f32),
+ OPC(3, OPC_SEL_B16, sel.b16),
+ OPC(3, OPC_SEL_B32, sel.b32),
+ OPC(3, OPC_SEL_S16, sel.s16),
+ OPC(3, OPC_SEL_S32, sel.s32),
+ OPC(3, OPC_SEL_F16, sel.f16),
+ OPC(3, OPC_SEL_F32, sel.f32),
+ OPC(3, OPC_SAD_S16, sad.s16),
+ OPC(3, OPC_SAD_S32, sad.s32),
+
+ /* category 4: */
+ OPC(4, OPC_RCP, rcp),
+ OPC(4, OPC_RSQ, rsq),
+ OPC(4, OPC_LOG2, log2),
+ OPC(4, OPC_EXP2, exp2),
+ OPC(4, OPC_SIN, sin),
+ OPC(4, OPC_COS, cos),
+ OPC(4, OPC_SQRT, sqrt),
+
+ /* category 5: */
+ OPC(5, OPC_ISAM, isam),
+ OPC(5, OPC_ISAML, isaml),
+ OPC(5, OPC_ISAMM, isamm),
+ OPC(5, OPC_SAM, sam),
+ OPC(5, OPC_SAMB, samb),
+ OPC(5, OPC_SAML, saml),
+ OPC(5, OPC_SAMGQ, samgq),
+ OPC(5, OPC_GETLOD, getlod),
+ OPC(5, OPC_CONV, conv),
+ OPC(5, OPC_CONVM, convm),
+ OPC(5, OPC_GETSIZE, getsize),
+ OPC(5, OPC_GETBUF, getbuf),
+ OPC(5, OPC_GETPOS, getpos),
+ OPC(5, OPC_GETINFO, getinfo),
+ OPC(5, OPC_DSX, dsx),
+ OPC(5, OPC_DSY, dsy),
+ OPC(5, OPC_GATHER4R, gather4r),
+ OPC(5, OPC_GATHER4G, gather4g),
+ OPC(5, OPC_GATHER4B, gather4b),
+ OPC(5, OPC_GATHER4A, gather4a),
+ OPC(5, OPC_SAMGP0, samgp0),
+ OPC(5, OPC_SAMGP1, samgp1),
+ OPC(5, OPC_SAMGP2, samgp2),
+ OPC(5, OPC_SAMGP3, samgp3),
+ OPC(5, OPC_DSXPP_1, dsxpp.1),
+ OPC(5, OPC_DSYPP_1, dsypp.1),
+ OPC(5, OPC_RGETPOS, rgetpos),
+ OPC(5, OPC_RGETINFO, rgetinfo),
+
+
+ /* category 6: */
+ OPC(6, OPC_LDG, ldg),
+ OPC(6, OPC_LDL, ldl),
+ OPC(6, OPC_LDP, ldp),
+ OPC(6, OPC_STG, stg),
+ OPC(6, OPC_STL, stl),
+ OPC(6, OPC_STP, stp),
+ OPC(6, OPC_STI, sti),
+ OPC(6, OPC_G2L, g2l),
+ OPC(6, OPC_L2G, l2g),
+ OPC(6, OPC_PREFETCH, prefetch),
+ OPC(6, OPC_LDLW, ldlw),
+ OPC(6, OPC_STLW, stlw),
+ OPC(6, OPC_RESFMT, resfmt),
+ OPC(6, OPC_RESINFO, resinf),
+ OPC(6, OPC_ATOMIC_ADD_L, atomic.add.l),
+ OPC(6, OPC_ATOMIC_SUB_L, atomic.sub.l),
+ OPC(6, OPC_ATOMIC_XCHG_L, atomic.xchg.l),
+ OPC(6, OPC_ATOMIC_INC_L, atomic.inc.l),
+ OPC(6, OPC_ATOMIC_DEC_L, atomic.dec.l),
+ OPC(6, OPC_ATOMIC_CMPXCHG_L, atomic.cmpxchg.l),
+ OPC(6, OPC_ATOMIC_MIN_L, atomic.min.l),
+ OPC(6, OPC_ATOMIC_MAX_L, atomic.max.l),
+ OPC(6, OPC_ATOMIC_AND_L, atomic.and.l),
+ OPC(6, OPC_ATOMIC_OR_L, atomic.or.l),
+ OPC(6, OPC_ATOMIC_XOR_L, atomic.xor.l),
+ OPC(6, OPC_LDGB_TYPED_4D, ldgb.typed.4d),
+ OPC(6, OPC_STGB_4D_4, stgb.4d.4),
+ OPC(6, OPC_STIB, stib),
+ OPC(6, OPC_LDC_4, ldc.4),
+ OPC(6, OPC_LDLV, ldlv),
+
+
+#undef OPC
+};
+
+#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)]))
+
+// XXX hack.. probably should move this table somewhere common:
+#include "ir3.h"
+const char *ir3_instr_name(struct ir3_instruction *instr)
+{
+ if (instr->category == -1) return "??meta??";
+ return opcs[(instr->category << NOPC_BITS) | instr->opc].name;
+}
+
+static void print_instr(uint32_t *dwords, int level, int n)
+{
+ instr_t *instr = (instr_t *)dwords;
+ uint32_t opc = instr_opc(instr);
+ const char *name;
+
+ printf("%s%04d[%08xx_%08xx] ", levels[level], n, dwords[1], dwords[0]);
+
+#if 0
+ /* print unknown bits: */
+ if (debug & PRINT_RAW)
+ printf("[%08xx_%08xx] ", dwords[1] & 0x001ff800, dwords[0] & 0x00000000);
+
+ if (debug & PRINT_VERBOSE)
+ printf("%d,%02d ", instr->opc_cat, opc);
+#endif
+
+ /* NOTE: the order the flags are printed in is a bit fugly.. but for now I
+ * try to match the order in llvm-a3xx disassembler for easy
+ * diff'ing..
+ */
+
+ if (instr->sync)
+ printf("(sy)");
+ if (instr->ss && (instr->opc_cat <= 4))
+ printf("(ss)");
+ if (instr->jmp_tgt)
+ printf("(jp)");
+ if (instr->repeat && (instr->opc_cat <= 4)) {
+ printf("(rpt%d)", instr->repeat);
+ repeat = instr->repeat;
+ } else {
+ repeat = 0;
+ }
+ if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4)))
+ printf("(ul)");
+
+ name = GETINFO(instr)->name;
+
+ if (name) {
+ printf("%s", name);
+ GETINFO(instr)->print(instr);
+ } else {
+ printf("unknown(%d,%d)", instr->opc_cat, opc);
+ }
+
+ printf("\n");
+}
+
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, enum shader_t type)
+{
+ int i;
+
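+ /* each instruction is 64 bits, ie. two dwords: */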
+ assert((sizedwords % 2) == 0);
+
+ for (i = 0; i < sizedwords; i += 2)
+ print_instr(&dwords[i], level, i/2);
+
+ return 0;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
new file mode 100644
index 00000000000..c67f1037ced
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
@@ -0,0 +1,691 @@
+/*
+ * Copyright (c) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef INSTR_A3XX_H_
+#define INSTR_A3XX_H_
+
+#define PACKED __attribute__((__packed__))
+
+#include <stdint.h>
+#include <assert.h>
+
+typedef enum {
+ /* category 0: */
+ OPC_NOP = 0,
+ OPC_BR = 1,
+ OPC_JUMP = 2,
+ OPC_CALL = 3,
+ OPC_RET = 4,
+ OPC_KILL = 5,
+ OPC_END = 6,
+ OPC_EMIT = 7,
+ OPC_CUT = 8,
+ OPC_CHMASK = 9,
+ OPC_CHSH = 10,
+ OPC_FLOW_REV = 11,
+
+ /* category 1: */
+ /* no opc.. all category 1 are variants of mov */
+
+ /* category 2: */
+ OPC_ADD_F = 0,
+ OPC_MIN_F = 1,
+ OPC_MAX_F = 2,
+ OPC_MUL_F = 3,
+ OPC_SIGN_F = 4,
+ OPC_CMPS_F = 5,
+ OPC_ABSNEG_F = 6,
+ OPC_CMPV_F = 7,
+ /* 8 - invalid */
+ OPC_FLOOR_F = 9,
+ OPC_CEIL_F = 10,
+ OPC_RNDNE_F = 11,
+ OPC_RNDAZ_F = 12,
+ OPC_TRUNC_F = 13,
+ /* 14-15 - invalid */
+ OPC_ADD_U = 16,
+ OPC_ADD_S = 17,
+ OPC_SUB_U = 18,
+ OPC_SUB_S = 19,
+ OPC_CMPS_U = 20,
+ OPC_CMPS_S = 21,
+ OPC_MIN_U = 22,
+ OPC_MIN_S = 23,
+ OPC_MAX_U = 24,
+ OPC_MAX_S = 25,
+ OPC_ABSNEG_S = 26,
+ /* 27 - invalid */
+ OPC_AND_B = 28,
+ OPC_OR_B = 29,
+ OPC_NOT_B = 30,
+ OPC_XOR_B = 31,
+ /* 32 - invalid */
+ OPC_CMPV_U = 33,
+ OPC_CMPV_S = 34,
+ /* 35-47 - invalid */
+ OPC_MUL_U = 48,
+ OPC_MUL_S = 49,
+ OPC_MULL_U = 50,
+ OPC_BFREV_B = 51,
+ OPC_CLZ_S = 52,
+ OPC_CLZ_B = 53,
+ OPC_SHL_B = 54,
+ OPC_SHR_B = 55,
+ OPC_ASHR_B = 56,
+ OPC_BARY_F = 57,
+ OPC_MGEN_B = 58,
+ OPC_GETBIT_B = 59,
+ OPC_SETRM = 60,
+ OPC_CBITS_B = 61,
+ OPC_SHB = 62,
+ OPC_MSAD = 63,
+
+ /* category 3: */
+ OPC_MAD_U16 = 0,
+ OPC_MADSH_U16 = 1,
+ OPC_MAD_S16 = 2,
+ OPC_MADSH_M16 = 3, /* should this be .s16? */
+ OPC_MAD_U24 = 4,
+ OPC_MAD_S24 = 5,
+ OPC_MAD_F16 = 6,
+ OPC_MAD_F32 = 7,
+ OPC_SEL_B16 = 8,
+ OPC_SEL_B32 = 9,
+ OPC_SEL_S16 = 10,
+ OPC_SEL_S32 = 11,
+ OPC_SEL_F16 = 12,
+ OPC_SEL_F32 = 13,
+ OPC_SAD_S16 = 14,
+ OPC_SAD_S32 = 15,
+
+ /* category 4: */
+ OPC_RCP = 0,
+ OPC_RSQ = 1,
+ OPC_LOG2 = 2,
+ OPC_EXP2 = 3,
+ OPC_SIN = 4,
+ OPC_COS = 5,
+ OPC_SQRT = 6,
+ // 7-63 - invalid
+
+ /* category 5: */
+ OPC_ISAM = 0,
+ OPC_ISAML = 1,
+ OPC_ISAMM = 2,
+ OPC_SAM = 3,
+ OPC_SAMB = 4,
+ OPC_SAML = 5,
+ OPC_SAMGQ = 6,
+ OPC_GETLOD = 7,
+ OPC_CONV = 8,
+ OPC_CONVM = 9,
+ OPC_GETSIZE = 10,
+ OPC_GETBUF = 11,
+ OPC_GETPOS = 12,
+ OPC_GETINFO = 13,
+ OPC_DSX = 14,
+ OPC_DSY = 15,
+ OPC_GATHER4R = 16,
+ OPC_GATHER4G = 17,
+ OPC_GATHER4B = 18,
+ OPC_GATHER4A = 19,
+ OPC_SAMGP0 = 20,
+ OPC_SAMGP1 = 21,
+ OPC_SAMGP2 = 22,
+ OPC_SAMGP3 = 23,
+ OPC_DSXPP_1 = 24,
+ OPC_DSYPP_1 = 25,
+ OPC_RGETPOS = 26,
+ OPC_RGETINFO = 27,
+
+ /* category 6: */
+ OPC_LDG = 0, /* load-global */
+ OPC_LDL = 1,
+ OPC_LDP = 2,
+ OPC_STG = 3, /* store-global */
+ OPC_STL = 4,
+ OPC_STP = 5,
+ OPC_STI = 6,
+ OPC_G2L = 7,
+ OPC_L2G = 8,
+ OPC_PREFETCH = 9,
+ OPC_LDLW = 10,
+ OPC_STLW = 11,
+ OPC_RESFMT = 14,
+ OPC_RESINFO = 15,
+ OPC_ATOMIC_ADD_L = 16,
+ OPC_ATOMIC_SUB_L = 17,
+ OPC_ATOMIC_XCHG_L = 18,
+ OPC_ATOMIC_INC_L = 19,
+ OPC_ATOMIC_DEC_L = 20,
+ OPC_ATOMIC_CMPXCHG_L = 21,
+ OPC_ATOMIC_MIN_L = 22,
+ OPC_ATOMIC_MAX_L = 23,
+ OPC_ATOMIC_AND_L = 24,
+ OPC_ATOMIC_OR_L = 25,
+ OPC_ATOMIC_XOR_L = 26,
+ OPC_LDGB_TYPED_4D = 27,
+ OPC_STGB_4D_4 = 28,
+ OPC_STIB = 29,
+ OPC_LDC_4 = 30,
+ OPC_LDLV = 31,
+
+ /* meta instructions (category -1): */
+ /* placeholder instr to mark inputs/outputs: */
+ OPC_META_INPUT = 0,
+ OPC_META_OUTPUT = 1,
+ /* The "fan-in" and "fan-out" instructions are used for keeping
+ * track of instructions that write to multiple dst registers
+ * (fan-out) like texture sample instructions, or read multiple
+ * consecutive scalar registers (fan-in) (bary.f, texture samp)
+ */
+ OPC_META_FO = 2,
+ OPC_META_FI = 3,
+ /* branches/flow control */
+ OPC_META_FLOW = 4,
+ OPC_META_PHI = 5,
+ /* relative addressing */
+ OPC_META_DEREF = 6,
+
+
+} opc_t;
+
+typedef enum {
+ TYPE_F16 = 0,
+ TYPE_F32 = 1,
+ TYPE_U16 = 2,
+ TYPE_U32 = 3,
+ TYPE_S16 = 4,
+ TYPE_S32 = 5,
+ TYPE_U8 = 6,
+ TYPE_S8 = 7, // XXX I assume?
+} type_t;
+
+static inline uint32_t type_size(type_t type)
+{
+ switch (type) {
+ case TYPE_F32:
+ case TYPE_U32:
+ case TYPE_S32:
+ return 32;
+ case TYPE_F16:
+ case TYPE_U16:
+ case TYPE_S16:
+ return 16;
+ case TYPE_U8:
+ case TYPE_S8:
+ return 8;
+ default:
+ assert(0); /* invalid type */
+ return 0;
+ }
+}
+
+static inline int type_float(type_t type)
+{
+ return (type == TYPE_F32) || (type == TYPE_F16);
+}
+
+static inline int type_uint(type_t type)
+{
+ return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8);
+}
+
+static inline int type_sint(type_t type)
+{
+ return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8);
+}
+
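+/* register reference as encoded in instructions: 'num' is the vec4
+ * register and 'comp' selects the x/y/z/w component:
+ */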
+typedef union PACKED {
+ /* normal gpr or const src register: */
+ struct PACKED {
+ uint32_t comp : 2;
+ uint32_t num : 10;
+ };
+ /* for immediate val: */
+ int32_t iim_val : 11;
+ /* to make compiler happy: */
+ uint32_t dummy32;
+ uint32_t dummy10 : 10;
+ uint32_t dummy11 : 11;
+ uint32_t dummy12 : 12;
+ uint32_t dummy13 : 13;
+ uint32_t dummy8 : 8;
+} reg_t;
+
+/* special registers: */
+#define REG_A0 61 /* address register */
+#define REG_P0 62 /* predicate register */
+
+static inline int reg_special(reg_t reg)
+{
+ return (reg.num == REG_A0) || (reg.num == REG_P0);
+}
+
+typedef struct PACKED {
+ /* dword0: */
+ int16_t immed : 16;
+ uint32_t dummy1 : 16;
+
+ /* dword1: */
+ uint32_t dummy2 : 8;
+ uint32_t repeat : 3;
+ uint32_t dummy3 : 1;
+ uint32_t ss : 1;
+ uint32_t dummy4 : 7;
+ uint32_t inv : 1;
+ uint32_t comp : 2;
+ uint32_t opc : 4;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+} instr_cat0_t;
+
+typedef struct PACKED {
+ /* dword0: */
+ union PACKED {
+ /* for normal src register: */
+ struct PACKED {
+ uint32_t src : 11;
+ /* at least the low bit of pad must be zero or it will
+ * look like an address-relative src
+ */
+ uint32_t pad : 21;
+ };
+ /* for address relative: */
+ struct PACKED {
+ int32_t off : 10;
+ uint32_t src_rel_c : 1;
+ uint32_t src_rel : 1;
+ uint32_t unknown : 20;
+ };
+ /* for immediate: */
+ int32_t iim_val;
+ float fim_val;
+ };
+
+ /* dword1: */
+ uint32_t dst : 8;
+ uint32_t repeat : 3;
+ uint32_t src_r : 1;
+ uint32_t ss : 1;
+ uint32_t ul : 1;
+ uint32_t dst_type : 3;
+ uint32_t dst_rel : 1;
+ uint32_t src_type : 3;
+ uint32_t src_c : 1;
+ uint32_t src_im : 1;
+ uint32_t even : 1;
+ uint32_t pos_inf : 1;
+ uint32_t must_be_0 : 2;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+} instr_cat1_t;
+
+typedef struct PACKED {
+ /* dword0: */
+ union PACKED {
+ struct PACKED {
+ uint32_t src1 : 11;
+ uint32_t must_be_zero1: 2;
+ uint32_t src1_im : 1; /* immediate */
+ uint32_t src1_neg : 1; /* negate */
+ uint32_t src1_abs : 1; /* absolute value */
+ };
+ struct PACKED {
+ uint32_t src1 : 10;
+ uint32_t src1_c : 1; /* relative-const */
+ uint32_t src1_rel : 1; /* relative address */
+ uint32_t must_be_zero : 1;
+ uint32_t dummy : 3;
+ } rel1;
+ struct PACKED {
+ uint32_t src1 : 12;
+ uint32_t src1_c : 1; /* const */
+ uint32_t dummy : 3;
+ } c1;
+ };
+
+ union PACKED {
+ struct PACKED {
+ uint32_t src2 : 11;
+ uint32_t must_be_zero2: 2;
+ uint32_t src2_im : 1; /* immediate */
+ uint32_t src2_neg : 1; /* negate */
+ uint32_t src2_abs : 1; /* absolute value */
+ };
+ struct PACKED {
+ uint32_t src2 : 10;
+ uint32_t src2_c : 1; /* relative-const */
+ uint32_t src2_rel : 1; /* relative address */
+ uint32_t must_be_zero : 1;
+ uint32_t dummy : 3;
+ } rel2;
+ struct PACKED {
+ uint32_t src2 : 12;
+ uint32_t src2_c : 1; /* const */
+ uint32_t dummy : 3;
+ } c2;
+ };
+
+ /* dword1: */
+ uint32_t dst : 8;
+ uint32_t repeat : 3;
+ uint32_t src1_r : 1;
+ uint32_t ss : 1;
+ uint32_t ul : 1; /* dunno */
+ uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */
+ uint32_t ei : 1;
+ uint32_t cond : 3;
+ uint32_t src2_r : 1;
+ uint32_t full : 1; /* not half */
+ uint32_t opc : 6;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+} instr_cat2_t;
+
+typedef struct PACKED {
+ /* dword0: */
+ union PACKED {
+ struct PACKED {
+ uint32_t src1 : 11;
+ uint32_t must_be_zero1: 2;
+ uint32_t src2_c : 1;
+ uint32_t src1_neg : 1;
+ uint32_t src2_r : 1;
+ };
+ struct PACKED {
+ uint32_t src1 : 10;
+ uint32_t src1_c : 1;
+ uint32_t src1_rel : 1;
+ uint32_t must_be_zero : 1;
+ uint32_t dummy : 3;
+ } rel1;
+ struct PACKED {
+ uint32_t src1 : 12;
+ uint32_t src1_c : 1;
+ uint32_t dummy : 3;
+ } c1;
+ };
+
+ union PACKED {
+ struct PACKED {
+ uint32_t src3 : 11;
+ uint32_t must_be_zero2: 2;
+ uint32_t src3_r : 1;
+ uint32_t src2_neg : 1;
+ uint32_t src3_neg : 1;
+ };
+ struct PACKED {
+ uint32_t src3 : 10;
+ uint32_t src3_c : 1;
+ uint32_t src3_rel : 1;
+ uint32_t must_be_zero : 1;
+ uint32_t dummy : 3;
+ } rel2;
+ struct PACKED {
+ uint32_t src3 : 12;
+ uint32_t src3_c : 1;
+ uint32_t dummy : 3;
+ } c2;
+ };
+
+ /* dword1: */
+ uint32_t dst : 8;
+ uint32_t repeat : 3;
+ uint32_t src1_r : 1;
+ uint32_t ss : 1;
+ uint32_t ul : 1;
+ uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */
+ uint32_t src2 : 8;
+ uint32_t opc : 4;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+} instr_cat3_t;
+
+static inline bool instr_cat3_full(instr_cat3_t *cat3)
+{
+ switch (cat3->opc) {
+ case OPC_MAD_F16:
+ case OPC_MAD_U16:
+ case OPC_MAD_S16:
+ case OPC_SEL_B16:
+ case OPC_SEL_S16:
+ case OPC_SEL_F16:
+ case OPC_SAD_S16:
+ case OPC_SAD_S32: // really??
+ return false;
+ default:
+ return true;
+ }
+}
+
+typedef struct PACKED {
+ /* dword0: */
+ union PACKED {
+ struct PACKED {
+ uint32_t src : 11;
+ uint32_t must_be_zero1: 2;
+ uint32_t src_im : 1; /* immediate */
+ uint32_t src_neg : 1; /* negate */
+ uint32_t src_abs : 1; /* absolute value */
+ };
+ struct PACKED {
+ uint32_t src : 10;
+ uint32_t src_c : 1; /* relative-const */
+ uint32_t src_rel : 1; /* relative address */
+ uint32_t must_be_zero : 1;
+ uint32_t dummy : 3;
+ } rel;
+ struct PACKED {
+ uint32_t src : 12;
+ uint32_t src_c : 1; /* const */
+ uint32_t dummy : 3;
+ } c;
+ };
+ uint32_t dummy1 : 16; /* seem to be ignored */
+
+ /* dword1: */
+ uint32_t dst : 8;
+ uint32_t repeat : 3;
+ uint32_t src_r : 1;
+ uint32_t ss : 1;
+ uint32_t ul : 1;
+ uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */
+ uint32_t dummy2 : 5; /* seem to be ignored */
+ uint32_t full : 1; /* not half */
+ uint32_t opc : 6;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+} instr_cat4_t;
+
+typedef struct PACKED {
+ /* dword0: */
+ union PACKED {
+ /* normal case: */
+ struct PACKED {
+ uint32_t full : 1; /* not half */
+ uint32_t src1 : 8;
+ uint32_t src2 : 8;
+ uint32_t dummy1 : 4; /* seem to be ignored */
+ uint32_t samp : 4;
+ uint32_t tex : 7;
+ } norm;
+ /* s2en case: */
+ struct PACKED {
+ uint32_t full : 1; /* not half */
+ uint32_t src1 : 8;
+ uint32_t src2 : 11;
+ uint32_t dummy1 : 1;
+ uint32_t src3 : 8;
+ uint32_t dummy2 : 3;
+ } s2en;
+ /* same in either case: */
+ // XXX I think, confirm this
+ struct PACKED {
+ uint32_t full : 1; /* not half */
+ uint32_t src1 : 8;
+ uint32_t pad : 23;
+ };
+ };
+
+ /* dword1: */
+ uint32_t dst : 8;
+ uint32_t wrmask : 4; /* write-mask */
+ uint32_t type : 3;
+ uint32_t dummy2 : 1; /* seems to be ignored */
+ uint32_t is_3d : 1;
+
+ uint32_t is_a : 1;
+ uint32_t is_s : 1;
+ uint32_t is_s2en : 1;
+ uint32_t is_o : 1;
+ uint32_t is_p : 1;
+
+ uint32_t opc : 5;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+} instr_cat5_t;
+
+/* used for load instructions: */
+typedef struct PACKED {
+ /* dword0: */
+ uint32_t must_be_one1 : 1;
+ int16_t off : 13;
+ uint32_t src : 8;
+ uint32_t dummy1 : 1;
+ uint32_t must_be_one2 : 1;
+ int32_t iim_val : 8;
+
+ /* dword1: */
+ uint32_t dst : 8;
+ uint32_t dummy2 : 9;
+ uint32_t type : 3;
+ uint32_t dummy3 : 2;
+ uint32_t opc : 5;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+} instr_cat6a_t;
+
+/* used for store instructions: */
+typedef struct PACKED {
+ /* dword0: */
+ uint32_t must_be_zero1 : 1;
+ uint32_t src : 8;
+ uint32_t off_hi : 5; /* high bits of 'off'... ugly! */
+ uint32_t dummy1 : 9;
+ uint32_t must_be_one1 : 1;
+ int32_t iim_val : 8;
+
+ /* dword1: */
+ uint16_t off : 8;
+ uint32_t must_be_one2 : 1;
+ uint32_t dst : 8;
+ uint32_t type : 3;
+ uint32_t dummy2 : 2;
+ uint32_t opc : 5;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+} instr_cat6b_t;
+
+typedef union PACKED {
+ instr_cat6a_t a;
+ instr_cat6b_t b;
+ struct PACKED {
+ /* dword0: */
+ uint32_t pad1 : 24;
+ int32_t iim_val : 8;
+
+ /* dword1: */
+ uint32_t pad2 : 17;
+ uint32_t type : 3;
+ uint32_t pad3 : 2;
+ uint32_t opc : 5;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+ };
+} instr_cat6_t;
+
+typedef union PACKED {
+ instr_cat0_t cat0;
+ instr_cat1_t cat1;
+ instr_cat2_t cat2;
+ instr_cat3_t cat3;
+ instr_cat4_t cat4;
+ instr_cat5_t cat5;
+ instr_cat6_t cat6;
+ struct PACKED {
+ /* dword0: */
+ uint64_t pad1 : 40;
+ uint32_t repeat : 3; /* cat0-cat4 */
+ uint32_t pad2 : 1;
+ uint32_t ss : 1; /* cat1-cat4 (cat0??) */
+ uint32_t ul : 1; /* cat2-cat4 (and cat1 in blob.. which may be bug??) */
+ uint32_t pad3 : 13;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+
+ };
+} instr_t;
+
+static inline uint32_t instr_opc(instr_t *instr)
+{
+ switch (instr->opc_cat) {
+ case 0: return instr->cat0.opc;
+ case 1: return 0;
+ case 2: return instr->cat2.opc;
+ case 3: return instr->cat3.opc;
+ case 4: return instr->cat4.opc;
+ case 5: return instr->cat5.opc;
+ case 6: return instr->cat6.opc;
+ default: return 0;
+ }
+}
+
+static inline bool is_mad(opc_t opc)
+{
+ switch (opc) {
+ case OPC_MAD_U16:
+ case OPC_MADSH_U16:
+ case OPC_MAD_S16:
+ case OPC_MADSH_M16:
+ case OPC_MAD_U24:
+ case OPC_MAD_S24:
+ case OPC_MAD_F16:
+ case OPC_MAD_F32:
+ return true;
+ default:
+ return false;
+ }
+}
+
+#endif /* INSTR_A3XX_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
new file mode 100644
index 00000000000..ea2a9251b28
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -0,0 +1,675 @@
+/*
+ * Copyright (c) 2012 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ir3.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <errno.h>
+
+#include "freedreno_util.h"
+#include "instr-a3xx.h"
+
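+/* heap chunk size, in dwords: */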
+#define CHUNK_SZ 1020
+
+struct ir3_heap_chunk {
+ struct ir3_heap_chunk *next;
+ uint32_t heap[CHUNK_SZ];
+};
+
+static void grow_heap(struct ir3 *shader)
+{
+ struct ir3_heap_chunk *chunk = calloc(1, sizeof(*chunk));
+ chunk->next = shader->chunk;
+ shader->chunk = chunk;
+ shader->heap_idx = 0;
+}
+
+/* simple allocator to carve allocations out of an up-front allocated heap,
+ * so that we can free everything easily in one shot.
+ */
+void * ir3_alloc(struct ir3 *shader, int sz)
+{
+ void *ptr;
+
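+ /* convert size from bytes to dwords (the heap is an array of uint32_t): */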
+ sz = align(sz, 4) / 4;
+
+ if ((shader->heap_idx + sz) > CHUNK_SZ)
+ grow_heap(shader);
+
+ ptr = &shader->chunk->heap[shader->heap_idx];
+ shader->heap_idx += sz;
+
+ return ptr;
+}
+
+struct ir3 * ir3_create(void)
+{
+ struct ir3 *shader =
+ calloc(1, sizeof(struct ir3));
+ grow_heap(shader);
+ return shader;
+}
+
+void ir3_destroy(struct ir3 *shader)
+{
+ while (shader->chunk) {
+ struct ir3_heap_chunk *chunk = shader->chunk;
+ shader->chunk = chunk->next;
+ free(chunk);
+ }
+ free(shader);
+}
+
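+/* assert() on the condition, and also return an error from the
+ * calling emit function if it fails:
+ */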
+#define iassert(cond) do { \
+ if (!(cond)) { \
+ assert(cond); \
+ return -1; \
+ } } while (0)
+
+static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
+ uint32_t repeat, uint32_t valid_flags)
+{
+ reg_t val = { .dummy32 = 0 };
+
+ assert(!(reg->flags & ~valid_flags));
+
+ if (!(reg->flags & IR3_REG_R))
+ repeat = 0;
+
+ if (reg->flags & IR3_REG_IMMED) {
+ val.iim_val = reg->iim_val;
+ } else {
+ int8_t components = util_last_bit(reg->wrmask);
+ int8_t max = (reg->num + repeat + components - 1) >> 2;
+
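+ /* ir3_register::num is a flat scalar index; split it into the
+ * vec4 register and component for encoding:
+ */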
+ val.comp = reg->num & 0x3;
+ val.num = reg->num >> 2;
+
+ if (reg->flags & IR3_REG_CONST) {
+ info->max_const = MAX2(info->max_const, max);
+ } else if ((max != REG_A0) && (max != REG_P0)) {
+ if (reg->flags & IR3_REG_HALF) {
+ info->max_half_reg = MAX2(info->max_half_reg, max);
+ } else {
+ info->max_reg = MAX2(info->max_reg, max);
+ }
+ }
+ }
+
+ return val.dummy32;
+}
+
+static int emit_cat0(struct ir3_instruction *instr, void *ptr,
+ struct ir3_info *info)
+{
+ instr_cat0_t *cat0 = ptr;
+
+ cat0->immed = instr->cat0.immed;
+ cat0->repeat = instr->repeat;
+ cat0->ss = !!(instr->flags & IR3_INSTR_SS);
+ cat0->inv = instr->cat0.inv;
+ cat0->comp = instr->cat0.comp;
+ cat0->opc = instr->opc;
+ cat0->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+ cat0->sync = !!(instr->flags & IR3_INSTR_SY);
+ cat0->opc_cat = 0;
+
+ return 0;
+}
+
+static uint32_t type_flags(type_t type)
+{
+ return (type_size(type) == 32) ? 0 : IR3_REG_HALF;
+}
+
+static int emit_cat1(struct ir3_instruction *instr, void *ptr,
+ struct ir3_info *info)
+{
+ struct ir3_register *dst = instr->regs[0];
+ struct ir3_register *src = instr->regs[1];
+ instr_cat1_t *cat1 = ptr;
+
+ iassert(instr->regs_count == 2);
+ iassert(!((dst->flags ^ type_flags(instr->cat1.dst_type)) & IR3_REG_HALF));
+ iassert((src->flags & IR3_REG_IMMED) ||
+ !((src->flags ^ type_flags(instr->cat1.src_type)) & IR3_REG_HALF));
+
+ if (src->flags & IR3_REG_IMMED) {
+ cat1->iim_val = src->iim_val;
+ cat1->src_im = 1;
+ } else if (src->flags & IR3_REG_RELATIV) {
+ cat1->off = src->offset;
+ cat1->src_rel = 1;
+ cat1->src_rel_c = !!(src->flags & IR3_REG_CONST);
+ } else {
+ cat1->src = reg(src, info, instr->repeat,
+ IR3_REG_IMMED | IR3_REG_R |
+ IR3_REG_CONST | IR3_REG_HALF);
+ cat1->src_c = !!(src->flags & IR3_REG_CONST);
+ }
+
+ cat1->dst = reg(dst, info, instr->repeat,
+ IR3_REG_RELATIV | IR3_REG_EVEN |
+ IR3_REG_R | IR3_REG_POS_INF | IR3_REG_HALF);
+ cat1->repeat = instr->repeat;
+ cat1->src_r = !!(src->flags & IR3_REG_R);
+ cat1->ss = !!(instr->flags & IR3_INSTR_SS);
+ cat1->ul = !!(instr->flags & IR3_INSTR_UL);
+ cat1->dst_type = instr->cat1.dst_type;
+ cat1->dst_rel = !!(dst->flags & IR3_REG_RELATIV);
+ cat1->src_type = instr->cat1.src_type;
+ cat1->even = !!(dst->flags & IR3_REG_EVEN);
+ cat1->pos_inf = !!(dst->flags & IR3_REG_POS_INF);
+ cat1->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+ cat1->sync = !!(instr->flags & IR3_INSTR_SY);
+ cat1->opc_cat = 1;
+
+ return 0;
+}
+
+static int emit_cat2(struct ir3_instruction *instr, void *ptr,
+ struct ir3_info *info)
+{
+ struct ir3_register *dst = instr->regs[0];
+ struct ir3_register *src1 = instr->regs[1];
+ struct ir3_register *src2 = instr->regs[2];
+ instr_cat2_t *cat2 = ptr;
+
+ iassert((instr->regs_count == 2) || (instr->regs_count == 3));
+
+ if (src1->flags & IR3_REG_RELATIV) {
+ iassert(src1->num < (1 << 10));
+ cat2->rel1.src1 = reg(src1, info, instr->repeat,
+ IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
+ IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF);
+ cat2->rel1.src1_c = !!(src1->flags & IR3_REG_CONST);
+ cat2->rel1.src1_rel = 1;
+ } else if (src1->flags & IR3_REG_CONST) {
+ iassert(src1->num < (1 << 12));
+ cat2->c1.src1 = reg(src1, info, instr->repeat,
+ IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS |
+ IR3_REG_R | IR3_REG_HALF);
+ cat2->c1.src1_c = 1;
+ } else {
+ iassert(src1->num < (1 << 11));
+ cat2->src1 = reg(src1, info, instr->repeat,
+ IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS |
+ IR3_REG_R | IR3_REG_HALF);
+ }
+ cat2->src1_im = !!(src1->flags & IR3_REG_IMMED);
+ cat2->src1_neg = !!(src1->flags & IR3_REG_NEGATE);
+ cat2->src1_abs = !!(src1->flags & IR3_REG_ABS);
+ cat2->src1_r = !!(src1->flags & IR3_REG_R);
+
+ if (src2) {
+ iassert((src2->flags & IR3_REG_IMMED) ||
+ !((src1->flags ^ src2->flags) & IR3_REG_HALF));
+
+ if (src2->flags & IR3_REG_RELATIV) {
+ iassert(src2->num < (1 << 10));
+ cat2->rel2.src2 = reg(src2, info, instr->repeat,
+ IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
+ IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF);
+ cat2->rel2.src2_c = !!(src2->flags & IR3_REG_CONST);
+ cat2->rel2.src2_rel = 1;
+ } else if (src2->flags & IR3_REG_CONST) {
+ iassert(src2->num < (1 << 12));
+ cat2->c2.src2 = reg(src2, info, instr->repeat,
+ IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS |
+ IR3_REG_R | IR3_REG_HALF);
+ cat2->c2.src2_c = 1;
+ } else {
+ iassert(src2->num < (1 << 11));
+ cat2->src2 = reg(src2, info, instr->repeat,
+ IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS |
+ IR3_REG_R | IR3_REG_HALF);
+ }
+
+ cat2->src2_im = !!(src2->flags & IR3_REG_IMMED);
+ cat2->src2_neg = !!(src2->flags & IR3_REG_NEGATE);
+ cat2->src2_abs = !!(src2->flags & IR3_REG_ABS);
+ cat2->src2_r = !!(src2->flags & IR3_REG_R);
+ }
+
+ cat2->dst = reg(dst, info, instr->repeat,
+ IR3_REG_R | IR3_REG_EI | IR3_REG_HALF);
+ cat2->repeat = instr->repeat;
+ cat2->ss = !!(instr->flags & IR3_INSTR_SS);
+ cat2->ul = !!(instr->flags & IR3_INSTR_UL);
+ cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF);
+ cat2->ei = !!(dst->flags & IR3_REG_EI);
+ cat2->cond = instr->cat2.condition;
+ cat2->full = ! (src1->flags & IR3_REG_HALF);
+ cat2->opc = instr->opc;
+ cat2->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+ cat2->sync = !!(instr->flags & IR3_INSTR_SY);
+ cat2->opc_cat = 2;
+
+ return 0;
+}
+
+static int emit_cat3(struct ir3_instruction *instr, void *ptr,
+ struct ir3_info *info)
+{
+ struct ir3_register *dst = instr->regs[0];
+ struct ir3_register *src1 = instr->regs[1];
+ struct ir3_register *src2 = instr->regs[2];
+ struct ir3_register *src3 = instr->regs[3];
+ instr_cat3_t *cat3 = ptr;
+ uint32_t src_flags = 0;
+
+ switch (instr->opc) {
+ case OPC_MAD_F16:
+ case OPC_MAD_U16:
+ case OPC_MAD_S16:
+ case OPC_SEL_B16:
+ case OPC_SEL_S16:
+ case OPC_SEL_F16:
+ case OPC_SAD_S16:
+ case OPC_SAD_S32: // really??
+ src_flags |= IR3_REG_HALF;
+ break;
+ default:
+ break;
+ }
+
+ iassert(instr->regs_count == 4);
+ iassert(!((src1->flags ^ src_flags) & IR3_REG_HALF));
+ iassert(!((src2->flags ^ src_flags) & IR3_REG_HALF));
+ iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF));
+
+ if (src1->flags & IR3_REG_RELATIV) {
+ iassert(src1->num < (1 << 10));
+ cat3->rel1.src1 = reg(src1, info, instr->repeat,
+ IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
+ IR3_REG_R | IR3_REG_HALF);
+ cat3->rel1.src1_c = !!(src1->flags & IR3_REG_CONST);
+ cat3->rel1.src1_rel = 1;
+ } else if (src1->flags & IR3_REG_CONST) {
+ iassert(src1->num < (1 << 12));
+ cat3->c1.src1 = reg(src1, info, instr->repeat,
+ IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R |
+ IR3_REG_HALF);
+ cat3->c1.src1_c = 1;
+ } else {
+ iassert(src1->num < (1 << 11));
+ cat3->src1 = reg(src1, info, instr->repeat,
+ IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF);
+ }
+
+ cat3->src1_neg = !!(src1->flags & IR3_REG_NEGATE);
+ cat3->src1_r = !!(src1->flags & IR3_REG_R);
+
+ cat3->src2 = reg(src2, info, instr->repeat,
+ IR3_REG_CONST | IR3_REG_NEGATE |
+ IR3_REG_R | IR3_REG_HALF);
+ cat3->src2_c = !!(src2->flags & IR3_REG_CONST);
+ cat3->src2_neg = !!(src2->flags & IR3_REG_NEGATE);
+ cat3->src2_r = !!(src2->flags & IR3_REG_R);
+
+
+ if (src3->flags & IR3_REG_RELATIV) {
+ iassert(src3->num < (1 << 10));
+ cat3->rel2.src3 = reg(src3, info, instr->repeat,
+ IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
+ IR3_REG_R | IR3_REG_HALF);
+ cat3->rel2.src3_c = !!(src3->flags & IR3_REG_CONST);
+ cat3->rel2.src3_rel = 1;
+ } else if (src3->flags & IR3_REG_CONST) {
+ iassert(src3->num < (1 << 12));
+ cat3->c2.src3 = reg(src3, info, instr->repeat,
+ IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R |
+ IR3_REG_HALF);
+ cat3->c2.src3_c = 1;
+ } else {
+ iassert(src3->num < (1 << 11));
+ cat3->src3 = reg(src3, info, instr->repeat,
+ IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF);
+ }
+
+ cat3->src3_neg = !!(src3->flags & IR3_REG_NEGATE);
+ cat3->src3_r = !!(src3->flags & IR3_REG_R);
+
+ cat3->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+ cat3->repeat = instr->repeat;
+ cat3->ss = !!(instr->flags & IR3_INSTR_SS);
+ cat3->ul = !!(instr->flags & IR3_INSTR_UL);
+ cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF);
+ cat3->opc = instr->opc;
+ cat3->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+ cat3->sync = !!(instr->flags & IR3_INSTR_SY);
+ cat3->opc_cat = 3;
+
+ return 0;
+}
+
+static int emit_cat4(struct ir3_instruction *instr, void *ptr,
+ struct ir3_info *info)
+{
+ struct ir3_register *dst = instr->regs[0];
+ struct ir3_register *src = instr->regs[1];
+ instr_cat4_t *cat4 = ptr;
+
+ iassert(instr->regs_count == 2);
+
+ if (src->flags & IR3_REG_RELATIV) {
+ iassert(src->num < (1 << 10));
+ cat4->rel.src = reg(src, info, instr->repeat,
+ IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
+ IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF);
+ cat4->rel.src_c = !!(src->flags & IR3_REG_CONST);
+ cat4->rel.src_rel = 1;
+ } else if (src->flags & IR3_REG_CONST) {
+ iassert(src->num < (1 << 12));
+ cat4->c.src = reg(src, info, instr->repeat,
+ IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS |
+ IR3_REG_R | IR3_REG_HALF);
+ cat4->c.src_c = 1;
+ } else {
+ iassert(src->num < (1 << 11));
+ cat4->src = reg(src, info, instr->repeat,
+ IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS |
+ IR3_REG_R | IR3_REG_HALF);
+ }
+
+ cat4->src_im = !!(src->flags & IR3_REG_IMMED);
+ cat4->src_neg = !!(src->flags & IR3_REG_NEGATE);
+ cat4->src_abs = !!(src->flags & IR3_REG_ABS);
+ cat4->src_r = !!(src->flags & IR3_REG_R);
+
+ cat4->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+ cat4->repeat = instr->repeat;
+ cat4->ss = !!(instr->flags & IR3_INSTR_SS);
+ cat4->ul = !!(instr->flags & IR3_INSTR_UL);
+ cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF);
+ cat4->full = ! (src->flags & IR3_REG_HALF);
+ cat4->opc = instr->opc;
+ cat4->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+ cat4->sync = !!(instr->flags & IR3_INSTR_SY);
+ cat4->opc_cat = 4;
+
+ return 0;
+}
+
+static int emit_cat5(struct ir3_instruction *instr, void *ptr,
+ struct ir3_info *info)
+{
+ struct ir3_register *dst = instr->regs[0];
+ struct ir3_register *src1 = instr->regs[1];
+ struct ir3_register *src2 = instr->regs[2];
+ struct ir3_register *src3 = instr->regs[3];
+ instr_cat5_t *cat5 = ptr;
+
+ iassert(!((dst->flags ^ type_flags(instr->cat5.type)) & IR3_REG_HALF));
+
+ if (src1) {
+ cat5->full = ! (src1->flags & IR3_REG_HALF);
+ cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF);
+ }
+
+
+ if (instr->flags & IR3_INSTR_S2EN) {
+ if (src2) {
+ iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
+ cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
+ }
+ if (src3) {
+ iassert(src3->flags & IR3_REG_HALF);
+ cat5->s2en.src3 = reg(src3, info, instr->repeat, IR3_REG_HALF);
+ }
+ iassert(!(instr->cat5.samp | instr->cat5.tex));
+ } else {
+ iassert(!src3);
+ if (src2) {
+ iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
+ cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
+ }
+ cat5->norm.samp = instr->cat5.samp;
+ cat5->norm.tex = instr->cat5.tex;
+ }
+
+ cat5->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+ cat5->wrmask = dst->wrmask;
+ cat5->type = instr->cat5.type;
+ cat5->is_3d = !!(instr->flags & IR3_INSTR_3D);
+ cat5->is_a = !!(instr->flags & IR3_INSTR_A);
+ cat5->is_s = !!(instr->flags & IR3_INSTR_S);
+ cat5->is_s2en = !!(instr->flags & IR3_INSTR_S2EN);
+ cat5->is_o = !!(instr->flags & IR3_INSTR_O);
+ cat5->is_p = !!(instr->flags & IR3_INSTR_P);
+ cat5->opc = instr->opc;
+ cat5->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+ cat5->sync = !!(instr->flags & IR3_INSTR_SY);
+ cat5->opc_cat = 5;
+
+ return 0;
+}
+
+static int emit_cat6(struct ir3_instruction *instr, void *ptr,
+ struct ir3_info *info)
+{
+ struct ir3_register *dst = instr->regs[0];
+ struct ir3_register *src = instr->regs[1];
+ instr_cat6_t *cat6 = ptr;
+
+ iassert(instr->regs_count == 2);
+
+ switch (instr->opc) {
+ /* load instructions: */
+ case OPC_LDG:
+ case OPC_LDP:
+ case OPC_LDL:
+ case OPC_LDLW:
+ case OPC_LDLV:
+ case OPC_PREFETCH: {
+ instr_cat6a_t *cat6a = ptr;
+
+ iassert(!((dst->flags ^ type_flags(instr->cat6.type)) & IR3_REG_HALF));
+
+ cat6a->must_be_one1 = 1;
+ cat6a->must_be_one2 = 1;
+ cat6a->off = instr->cat6.offset;
+ cat6a->src = reg(src, info, instr->repeat, 0);
+ cat6a->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+ break;
+ }
+ /* store instructions: */
+ case OPC_STG:
+ case OPC_STP:
+ case OPC_STL:
+ case OPC_STLW:
+ case OPC_STI: {
+ instr_cat6b_t *cat6b = ptr;
+ uint32_t src_flags = type_flags(instr->cat6.type);
+ uint32_t dst_flags = (instr->opc == OPC_STI) ? IR3_REG_HALF : 0;
+
+ iassert(!((src->flags ^ src_flags) & IR3_REG_HALF));
+
+ cat6b->must_be_one1 = 1;
+ cat6b->must_be_one2 = 1;
+ cat6b->src = reg(src, info, instr->repeat, src_flags);
+ cat6b->off_hi = instr->cat6.offset >> 8;
+ cat6b->off = instr->cat6.offset;
+ cat6b->dst = reg(dst, info, instr->repeat, IR3_REG_R | dst_flags);
+
+ break;
+ }
+ default:
+ // TODO
+ break;
+ }
+
+ cat6->iim_val = instr->cat6.iim_val;
+ cat6->type = instr->cat6.type;
+ cat6->opc = instr->opc;
+ cat6->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+ cat6->sync = !!(instr->flags & IR3_INSTR_SY);
+ cat6->opc_cat = 6;
+
+ return 0;
+}
+
+static int (*emit[])(struct ir3_instruction *instr, void *ptr,
+ struct ir3_info *info) = {
+ emit_cat0, emit_cat1, emit_cat2, emit_cat3, emit_cat4, emit_cat5, emit_cat6,
+};
+
+void * ir3_assemble(struct ir3 *shader, struct ir3_info *info)
+{
+ uint32_t *ptr, *dwords;
+ uint32_t i;
+
+ info->max_reg = -1;
+ info->max_half_reg = -1;
+ info->max_const = -1;
+ info->instrs_count = 0;
+
+ /* need an integer number of instruction "groups" (sets of four
+ * instructions), so pad out w/ NOPs if needed:
+ * (each instruction is 64bits)
+ */
+ info->sizedwords = 2 * align(shader->instrs_count, 4);
+
+ ptr = dwords = calloc(1, 4 * info->sizedwords);
+
+ for (i = 0; i < shader->instrs_count; i++) {
+ struct ir3_instruction *instr = shader->instrs[i];
+ int ret = emit[instr->category](instr, dwords, info);
+ if (ret)
+ goto fail;
+ info->instrs_count += 1 + instr->repeat;
+ dwords += 2;
+ }
+
+ return ptr;
+
+fail:
+ free(ptr);
+ return NULL;
+}
+
+static struct ir3_register * reg_create(struct ir3 *shader,
+ int num, int flags)
+{
+ struct ir3_register *reg =
+ ir3_alloc(shader, sizeof(struct ir3_register));
+ reg->wrmask = 1;
+ reg->flags = flags;
+ reg->num = num;
+ return reg;
+}
+
+static void insert_instr(struct ir3 *shader,
+ struct ir3_instruction *instr)
+{
+#ifdef DEBUG
+ static uint32_t serialno = 0;
+ instr->serialno = ++serialno;
+#endif
+ if (shader->instrs_count == shader->instrs_sz) {
+ shader->instrs_sz = MAX2(2 * shader->instrs_sz, 16);
+ shader->instrs = realloc(shader->instrs,
+ shader->instrs_sz * sizeof(shader->instrs[0]));
+ }
+ shader->instrs[shader->instrs_count++] = instr;
+}
+
+struct ir3_block * ir3_block_create(struct ir3 *shader,
+ unsigned ntmp, unsigned nin, unsigned nout)
+{
+ struct ir3_block *block;
+ unsigned size;
+ char *ptr;
+
+ size = sizeof(*block);
+ size += sizeof(block->temporaries[0]) * ntmp;
+ size += sizeof(block->inputs[0]) * nin;
+ size += sizeof(block->outputs[0]) * nout;
+
+ ptr = ir3_alloc(shader, size);
+
+ block = (void *)ptr;
+ ptr += sizeof(*block);
+
+ block->temporaries = (void *)ptr;
+ block->ntemporaries = ntmp;
+ ptr += sizeof(block->temporaries[0]) * ntmp;
+
+ block->inputs = (void *)ptr;
+ block->ninputs = nin;
+ ptr += sizeof(block->inputs[0]) * nin;
+
+ block->outputs = (void *)ptr;
+ block->noutputs = nout;
+ ptr += sizeof(block->outputs[0]) * nout;
+
+ block->shader = shader;
+
+ return block;
+}
+
+struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
+ int category, opc_t opc)
+{
+ struct ir3_instruction *instr =
+ ir3_alloc(block->shader, sizeof(struct ir3_instruction));
+ instr->block = block;
+ instr->category = category;
+ instr->opc = opc;
+ insert_instr(block->shader, instr);
+ return instr;
+}
+
+struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
+{
+ struct ir3_instruction *new_instr =
+ ir3_alloc(instr->block->shader, sizeof(struct ir3_instruction));
+ unsigned i;
+
+ *new_instr = *instr;
+ insert_instr(instr->block->shader, new_instr);
+
+ /* clone registers: */
+ new_instr->regs_count = 0;
+ for (i = 0; i < instr->regs_count; i++) {
+ struct ir3_register *reg = instr->regs[i];
+ struct ir3_register *new_reg =
+ ir3_reg_create(new_instr, reg->num, reg->flags);
+ *new_reg = *reg;
+ }
+
+ return new_instr;
+}
+
+struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
+ int num, int flags)
+{
+ struct ir3_register *reg = reg_create(instr->block->shader, num, flags);
+ assert(instr->regs_count < ARRAY_SIZE(instr->regs));
+ instr->regs[instr->regs_count++] = reg;
+ return reg;
+}
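+
+/* For example, building a scalar "add.f r0.x, r1.y, r2.z" with the
+ * helpers above looks roughly like (illustrative sketch only):
+ *
+ *    struct ir3_instruction *add =
+ *       ir3_instr_create(block, 2, OPC_ADD_F);   // category 2 alu
+ *    ir3_reg_create(add, regid(0, 0), 0);        // dst:  r0.x
+ *    ir3_reg_create(add, regid(1, 1), 0);        // src1: r1.y
+ *    ir3_reg_create(add, regid(2, 2), 0);        // src2: r2.z
+ */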
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
new file mode 100644
index 00000000000..9ed914ba2e4
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IR3_H_
+#define IR3_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "instr-a3xx.h"
+#include "disasm.h" /* TODO move 'enum shader_t' somewhere else.. */
+
+/* low level intermediate representation of an adreno shader program */
+
+struct ir3;
+struct ir3_instruction;
+struct ir3_block;
+
+struct ir3 * fd_asm_parse(const char *src);
+
+struct ir3_info {
+ uint16_t sizedwords;
+ uint16_t instrs_count; /* expanded to account for rpt's */
+ /* NOTE: max_reg, etc, does not include registers not touched
+ * by the shader (ie. values vertex fetched via VFD_DECODE but
+ * never read by the shader)
+ */
+ int8_t max_reg; /* highest GPR # used by shader */
+ int8_t max_half_reg;
+ int8_t max_const;
+};
+
+struct ir3_register {
+ enum {
+ IR3_REG_CONST = 0x001,
+ IR3_REG_IMMED = 0x002,
+ IR3_REG_HALF = 0x004,
+ IR3_REG_RELATIV= 0x008,
+ IR3_REG_R = 0x010,
+ IR3_REG_NEGATE = 0x020,
+ IR3_REG_ABS = 0x040,
+ IR3_REG_EVEN = 0x080,
+ IR3_REG_POS_INF= 0x100,
+ /* (ei) flag, end-input? Set on last bary, presumably to signal
+ * that the shader needs no more input:
+ */
+ IR3_REG_EI = 0x200,
+ /* meta-flags, for intermediate stages of IR, ie.
+ * before register assignment is done:
+ */
+ IR3_REG_SSA = 0x1000, /* 'instr' is ptr to assigning instr */
+ IR3_REG_IA = 0x2000, /* meta-input dst is "assigned" */
+ IR3_REG_ADDR = 0x4000, /* register is a0.x */
+ } flags;
+ union {
+ /* normal registers:
+ * the component is in the low two bits of the reg #, so
+ * rN.x becomes: (N << 2) | x
+ */
+ int num;
+ /* immediate: */
+ int iim_val;
+ float fim_val;
+ /* relative: */
+ int offset;
+ /* for IR3_REG_SSA, src registers contain ptr back to
+ * assigning instruction.
+ */
+ struct ir3_instruction *instr;
+ };
+
+ /* used for cat5 instructions, but also for internal/IR level
+ * tracking of what registers are read/written by an instruction.
+ * wrmask may be a bad name since it is used to represent both
+ * src and dst that touch multiple adjacent registers.
+ */
+ int wrmask;
+};
+
+struct ir3_instruction {
+ struct ir3_block *block;
+ int category;
+ opc_t opc;
+ enum {
+ /* (sy) flag is set on first instruction, and after sample
+ * instructions (probably just on RAW hazard).
+ */
+ IR3_INSTR_SY = 0x001,
+ /* (ss) flag is set on first instruction, and first instruction
+ * to depend on the result of "long" instructions (RAW hazard):
+ *
+ * rcp, rsq, log2, exp2, sin, cos, sqrt
+ *
+ * It seems to synchronize until all in-flight instructions are
+ * completed, for example:
+ *
+ * rsq hr1.w, hr1.w
+ * add.f hr2.z, (neg)hr2.z, hc0.y
+ * mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
+ * rsq hr2.x, hr2.x
+ * (rpt1)nop
+ * mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
+ * nop
+ * mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
+ * (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
+ * (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
+ *
+ * The last mul.f does not have (ss) set, presumably because the
+ * (ss) on the previous instruction does the job.
+ *
+ * The blob driver also seems to set it on WAR hazards, although
+ * not really clear if this is needed or just blob compiler being
+ * sloppy. So far I haven't found a case where removing the (ss)
+ * causes problems for WAR hazard, but I could just be getting
+ * lucky:
+ *
+ * rcp r1.y, r3.y
+ * (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
+ *
+ */
+ IR3_INSTR_SS = 0x002,
+ /* (jp) flag is set on jump targets:
+ */
+ IR3_INSTR_JP = 0x004,
+ IR3_INSTR_UL = 0x008,
+ IR3_INSTR_3D = 0x010,
+ IR3_INSTR_A = 0x020,
+ IR3_INSTR_O = 0x040,
+ IR3_INSTR_P = 0x080,
+ IR3_INSTR_S = 0x100,
+ IR3_INSTR_S2EN = 0x200,
+ /* meta-flags, for intermediate stages of IR, ie.
+ * before register assignment is done:
+ */
+ IR3_INSTR_MARK = 0x1000,
+ } flags;
+ int repeat;
+ unsigned regs_count;
+ struct ir3_register *regs[5];
+ union {
+ struct {
+ char inv;
+ char comp;
+ int immed;
+ } cat0;
+ struct {
+ type_t src_type, dst_type;
+ } cat1;
+ struct {
+ enum {
+ IR3_COND_LT = 0,
+ IR3_COND_LE = 1,
+ IR3_COND_GT = 2,
+ IR3_COND_GE = 3,
+ IR3_COND_EQ = 4,
+ IR3_COND_NE = 5,
+ } condition;
+ } cat2;
+ struct {
+ unsigned samp, tex;
+ type_t type;
+ } cat5;
+ struct {
+ type_t type;
+ int offset;
+ int iim_val;
+ } cat6;
+ /* for meta-instructions, just used to hold extra data
+ * before instruction scheduling, etc
+ */
+ struct {
+ int off; /* component/offset */
+ } fo;
+ struct {
+ struct ir3_block *if_block, *else_block;
+ } flow;
+ struct {
+ struct ir3_block *block;
+ } inout;
+ };
+
+ /* transient values used during various algorithms: */
+ union {
+ /* The instruction depth is the max dependency distance to output.
+ *
+ * You can also think of it as the "cost", if we did any sort of
+ * optimization for register footprint. Ie. a value that is just
+ * result of moving a const to a reg would have a low cost, so to
+ * it could make sense to duplicate the instruction at various
+ * points where the result is needed to reduce register footprint.
+ */
+ unsigned depth;
+ };
+ struct ir3_instruction *next;
+#ifdef DEBUG
+ uint32_t serialno;
+#endif
+};
+
+struct ir3_heap_chunk;
+
+struct ir3 {
+ unsigned instrs_count, instrs_sz;
+ struct ir3_instruction **instrs;
+ unsigned heap_idx;
+ struct ir3_heap_chunk *chunk;
+};
+
+struct ir3_block {
+ struct ir3 *shader;
+ unsigned ntemporaries, ninputs, noutputs;
+ /* maps TGSI_FILE_TEMPORARY index back to the assigning instruction: */
+ struct ir3_instruction **temporaries;
+ struct ir3_instruction **inputs;
+ struct ir3_instruction **outputs;
+ /* only a single address register: */
+ struct ir3_instruction *address;
+ struct ir3_block *parent;
+ struct ir3_instruction *head;
+};
+
+struct ir3 * ir3_create(void);
+void ir3_destroy(struct ir3 *shader);
+void * ir3_assemble(struct ir3 *shader,
+ struct ir3_info *info);
+void * ir3_alloc(struct ir3 *shader, int sz);
+
+struct ir3_block * ir3_block_create(struct ir3 *shader,
+ unsigned ntmp, unsigned nin, unsigned nout);
+
+struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
+ int category, opc_t opc);
+struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
+const char *ir3_instr_name(struct ir3_instruction *instr);
+
+struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
+ int num, int flags);
+
+
+static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
+{
+ if (instr->flags & IR3_INSTR_MARK)
+ return true; /* already visited */
+ instr->flags ^= IR3_INSTR_MARK;
+ return false;
+}
+
+static inline void ir3_clear_mark(struct ir3 *shader)
+{
+ /* TODO would be nice to drop the instruction array.. for
+ * new compiler, _clear_mark() is all we use it for, and
+ * we could probably manage a linked list instead..
+ */
+ unsigned i;
+ for (i = 0; i < shader->instrs_count; i++) {
+ struct ir3_instruction *instr = shader->instrs[i];
+ instr->flags &= ~IR3_INSTR_MARK;
+ }
+}
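+
+/* ie. a graph walk typically looks something like:
+ *
+ *    ir3_clear_mark(shader);
+ *    ...
+ *    if (ir3_instr_check_mark(instr))
+ *       return;   // already visited
+ *    ... recurse into instr's srcs, then handle instr ...
+ */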
+
+static inline int ir3_instr_regno(struct ir3_instruction *instr,
+ struct ir3_register *reg)
+{
+ unsigned i;
+ for (i = 0; i < instr->regs_count; i++)
+ if (reg == instr->regs[i])
+ return i;
+ return -1;
+}
+
+
+/* comp:
+ * 0 - x
+ * 1 - y
+ * 2 - z
+ * 3 - w
+ */
+static inline uint32_t regid(int num, int comp)
+{
+ return (num << 2) | (comp & 0x3);
+}
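+
+/* eg. regid(2, 3) == (2 << 2) | 3 == 11, ie. r2.w */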
+
+static inline uint32_t reg_num(struct ir3_register *reg)
+{
+ return reg->num >> 2;
+}
+
+static inline uint32_t reg_comp(struct ir3_register *reg)
+{
+ return reg->num & 0x3;
+}
+
+static inline bool is_flow(struct ir3_instruction *instr)
+{
+ return (instr->category == 0);
+}
+
+static inline bool is_kill(struct ir3_instruction *instr)
+{
+ return is_flow(instr) && (instr->opc == OPC_KILL);
+}
+
+static inline bool is_nop(struct ir3_instruction *instr)
+{
+ return is_flow(instr) && (instr->opc == OPC_NOP);
+}
+
+static inline bool is_alu(struct ir3_instruction *instr)
+{
+ return (1 <= instr->category) && (instr->category <= 3);
+}
+
+static inline bool is_sfu(struct ir3_instruction *instr)
+{
+ return (instr->category == 4);
+}
+
+static inline bool is_tex(struct ir3_instruction *instr)
+{
+ return (instr->category == 5);
+}
+
+static inline bool is_input(struct ir3_instruction *instr)
+{
+ return (instr->category == 2) && (instr->opc == OPC_BARY_F);
+}
+
+static inline bool is_meta(struct ir3_instruction *instr)
+{
+ /* TODO how should we count PHI (and maybe fan-in/out) which
+ * might actually contribute some instructions to the final
+ * result?
+ */
+ return (instr->category == -1);
+}
+
+static inline bool is_addr(struct ir3_instruction *instr)
+{
+ return is_meta(instr) && (instr->opc == OPC_META_DEREF);
+}
+
+static inline bool writes_addr(struct ir3_instruction *instr)
+{
+ if (instr->regs_count > 0) {
+ struct ir3_register *dst = instr->regs[0];
+ return !!(dst->flags & IR3_REG_ADDR);
+ }
+ return false;
+}
+
+static inline bool writes_pred(struct ir3_instruction *instr)
+{
+ if (instr->regs_count > 0) {
+ struct ir3_register *dst = instr->regs[0];
+ return reg_num(dst) == REG_P0;
+ }
+ return false;
+}
+
+static inline bool reg_gpr(struct ir3_register *r)
+{
+ if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_SSA | IR3_REG_ADDR))
+ return false;
+ if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
+ return false;
+ return true;
+}
+
+/* dump: */
+#include <stdio.h>
+void ir3_dump(struct ir3 *shader, const char *name,
+ struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? */,
+ FILE *f);
+void ir3_dump_instr_single(struct ir3_instruction *instr);
+void ir3_dump_instr_list(struct ir3_instruction *instr);
+
+/* flatten if/else: */
+int ir3_block_flatten(struct ir3_block *block);
+
+/* depth calculation: */
+int ir3_delayslots(struct ir3_instruction *assigner,
+ struct ir3_instruction *consumer, unsigned n);
+void ir3_block_depth(struct ir3_block *block);
+
+/* copy-propagate: */
+void ir3_block_cp(struct ir3_block *block);
+
+/* scheduling: */
+void ir3_block_sched(struct ir3_block *block);
+
+/* register assignment: */
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+ bool half_precision, bool frag_coord, bool frag_face,
+ bool *has_samp);
+
+#ifndef ARRAY_SIZE
+# define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#endif
+
+/* ************************************************************************* */
+/* split this out or find some helper to use.. like main/bitset.h.. */
+
+#include <string.h>
+
+#define MAX_REG 256
+
+typedef uint8_t regmask_t[2 * MAX_REG / 8];
+
+static inline unsigned regmask_idx(struct ir3_register *reg)
+{
+ unsigned num = reg->num;
+ assert(num < MAX_REG);
+ if (reg->flags & IR3_REG_HALF)
+ num += MAX_REG;
+ return num;
+}
+
+static inline void regmask_init(regmask_t *regmask)
+{
+ memset(regmask, 0, sizeof(*regmask));
+}
+
+static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
+{
+ unsigned idx = regmask_idx(reg);
+ unsigned i;
+ for (i = 0; i < 4; i++, idx++)
+ if (reg->wrmask & (1 << i))
+ (*regmask)[idx / 8] |= 1 << (idx % 8);
+}
+
+/* set bits in a if not set in b, conceptually:
+ * a |= (reg & ~b)
+ */
+static inline void regmask_set_if_not(regmask_t *a,
+ struct ir3_register *reg, regmask_t *b)
+{
+ unsigned idx = regmask_idx(reg);
+ unsigned i;
+ for (i = 0; i < 4; i++, idx++)
+ if (reg->wrmask & (1 << i))
+ if (!((*b)[idx / 8] & (1 << (idx % 8))))
+ (*a)[idx / 8] |= 1 << (idx % 8);
+}
+
+static inline unsigned regmask_get(regmask_t *regmask,
+ struct ir3_register *reg)
+{
+ unsigned idx = regmask_idx(reg);
+ unsigned i;
+ for (i = 0; i < 4; i++, idx++)
+ if (reg->wrmask & (1 << i))
+ if ((*regmask)[idx / 8] & (1 << (idx % 8)))
+ return true;
+ return false;
+}
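+
+/* Example: hazard tracking with these helpers could look roughly
+ * like (sketch only):
+ *
+ *    regmask_t needs_ss;
+ *    regmask_init(&needs_ss);
+ *    // after a "long" instruction (rcp, rsq, ..) writes its dst:
+ *    regmask_set(&needs_ss, instr->regs[0]);
+ *    // before emitting an instruction that reads 'src':
+ *    if (regmask_get(&needs_ss, src)) {
+ *       instr->flags |= IR3_INSTR_SS;
+ *       regmask_init(&needs_ss);
+ *    }
+ */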
+
+/* ************************************************************************* */
+
+#endif /* IR3_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
new file mode 100644
index 00000000000..1fa2fd4e389
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
@@ -0,0 +1,2639 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include <stdarg.h>
+
+#include "pipe/p_state.h"
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_strings.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_scan.h"
+
+#include "freedreno_lowering.h"
+#include "freedreno_util.h"
+
+#include "ir3_compiler.h"
+#include "ir3_shader.h"
+
+#include "instr-a3xx.h"
+#include "ir3.h"
+
+struct ir3_compile_context {
+ const struct tgsi_token *tokens;
+ bool free_tokens;
+ struct ir3 *ir;
+ struct ir3_shader_variant *so;
+
+ struct ir3_block *block;
+ struct ir3_instruction *current_instr;
+
+ /* we need to defer updates to block->outputs[] until the end
+ * of an instruction (so we don't see new value until *after*
+ * the src registers are processed)
+ */
+ struct {
+ struct ir3_instruction *instr, **instrp;
+ } output_updates[16];
+ unsigned num_output_updates;
+
+ /* are we in a sequence of "atomic" instructions?
+ */
+ bool atomic;
+
+ /* For fragment shaders, from the hw perspective the only
+ * actual input is r0.xy position register passed to bary.f.
+ * But TGSI doesn't know that, it still declares things as
+ * IN[] registers. So we do all the input tracking normally
+ * and fix things up after compile_instructions()
+ *
+ * NOTE that frag_pos is the hardware position (possibly it
+ * is actually an index or tag or some such.. it is *not*
+ * values that can be directly used for gl_FragCoord..)
+ */
+ struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];
+
+ struct tgsi_parse_context parser;
+ unsigned type;
+
+ struct tgsi_shader_info info;
+
+ /* for calculating input/output positions/linkages: */
+ unsigned next_inloc;
+
+ unsigned num_internal_temps;
+ struct tgsi_src_register internal_temps[6];
+
+ /* idx/slot for last compiler generated immediate */
+ unsigned immediate_idx;
+
+ /* stack of branch instructions that mark (potentially nested)
+ * branch if/else/loop/etc
+ */
+ struct {
+ struct ir3_instruction *instr, *cond;
+ bool inv; /* true iff in else leg of branch */
+ } branch[16];
+ unsigned int branch_count;
+
+ /* list of kill instructions: */
+ struct ir3_instruction *kill[16];
+ unsigned int kill_count;
+
+ /* used when dst is same as one of the srcs, to avoid overwriting a
+ * src element before the remaining scalar instructions that make
+ * up the vector operation
+ */
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register *tmp_src;
+};
+
+
+static void vectorize(struct ir3_compile_context *ctx,
+ struct ir3_instruction *instr, struct tgsi_dst_register *dst,
+ int nsrcs, ...);
+static void create_mov(struct ir3_compile_context *ctx,
+ struct tgsi_dst_register *dst, struct tgsi_src_register *src);
+static type_t get_ftype(struct ir3_compile_context *ctx);
+
+static unsigned
+compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
+ const struct tgsi_token *tokens)
+{
+ unsigned ret;
+ struct tgsi_shader_info *info = &ctx->info;
+ const struct fd_lowering_config lconfig = {
+ .color_two_side = so->key.color_two_side,
+ .lower_DST = true,
+ .lower_XPD = true,
+ .lower_SCS = true,
+ .lower_LRP = true,
+ .lower_FRC = true,
+ .lower_POW = true,
+ .lower_LIT = true,
+ .lower_EXP = true,
+ .lower_LOG = true,
+ .lower_DP4 = true,
+ .lower_DP3 = true,
+ .lower_DPH = true,
+ .lower_DP2 = true,
+ .lower_DP2A = true,
+ };
+
+ ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info);
+ ctx->free_tokens = !!ctx->tokens;
+ if (!ctx->tokens) {
+ /* no lowering */
+ ctx->tokens = tokens;
+ }
+ ctx->ir = so->ir;
+ ctx->so = so;
+ ctx->next_inloc = 8;
+ ctx->num_internal_temps = 0;
+ ctx->branch_count = 0;
+ ctx->kill_count = 0;
+ ctx->block = NULL;
+ ctx->current_instr = NULL;
+ ctx->num_output_updates = 0;
+ ctx->atomic = false;
+ ctx->frag_pos = NULL;
+ ctx->frag_face = NULL;
+
+ memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord));
+
+#define FM(x) (1 << TGSI_FILE_##x)
+ /* optimize can't deal with relative addressing: */
+ if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
+ return TGSI_PARSE_ERROR;
+
+ /* Immediates go after constants: */
+ so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1;
+ ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);
+
+ ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
+ if (ret != TGSI_PARSE_OK)
+ return ret;
+
+ ctx->type = ctx->parser.FullHeader.Processor.Processor;
+
+ return ret;
+}
+
+static void
+compile_error(struct ir3_compile_context *ctx, const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ _debug_vprintf(format, ap);
+ va_end(ap);
+ tgsi_dump(ctx->tokens, 0);
+ debug_assert(0);
+}
+
+#define compile_assert(ctx, cond) do { \
+ if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
+ } while (0)
+
+static void
+compile_free(struct ir3_compile_context *ctx)
+{
+ if (ctx->free_tokens)
+ free((void *)ctx->tokens);
+ tgsi_parse_free(&ctx->parser);
+}
+
+struct instr_translater {
+ void (*fxn)(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst);
+ unsigned tgsi_opc;
+ opc_t opc;
+ opc_t hopc; /* opc to use for half_precision mode, if different */
+ unsigned arg;
+};
+
+static void
+instr_finish(struct ir3_compile_context *ctx)
+{
+ unsigned i;
+
+ if (ctx->atomic)
+ return;
+
+ for (i = 0; i < ctx->num_output_updates; i++)
+ *(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr;
+
+ ctx->num_output_updates = 0;
+}
+
+/* For "atomic" groups of instructions, for example the four scalar
+ * instructions to perform a vec4 operation. Basically this just
+ * blocks out handling of output_updates so the next scalar instruction
+ * still sees the result from before the start of the atomic group.
+ *
+ * NOTE: when used properly, this could probably replace get/put_dst()
+ * stuff.
+ */
+static void
+instr_atomic_start(struct ir3_compile_context *ctx)
+{
+ ctx->atomic = true;
+}
+
+static void
+instr_atomic_end(struct ir3_compile_context *ctx)
+{
+ ctx->atomic = false;
+ instr_finish(ctx);
+}
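+
+/* so a vec4 operation is bracketed roughly like:
+ *
+ *    instr_atomic_start(ctx);
+ *    ... emit up to four scalar instructions; each still reads the
+ *    pre-group value of any TEMP/OUT it references ...
+ *    instr_atomic_end(ctx);   // flush the deferred output updates
+ *
+ * (this is exactly what vectorize() below does)
+ */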
+
+static struct ir3_instruction *
+instr_create(struct ir3_compile_context *ctx, int category, opc_t opc)
+{
+ instr_finish(ctx);
+ return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc));
+}
+
+static struct ir3_instruction *
+instr_clone(struct ir3_compile_context *ctx, struct ir3_instruction *instr)
+{
+ instr_finish(ctx);
+ return (ctx->current_instr = ir3_instr_clone(instr));
+}
+
+static struct ir3_block *
+push_block(struct ir3_compile_context *ctx)
+{
+ struct ir3_block *block;
+ unsigned ntmp, nin, nout;
+
+#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1))
+
+ /* hmm, give ourselves room to create 4 extra temporaries (vec4):
+ */
+ ntmp = SCALAR_REGS(TEMPORARY);
+ ntmp += 4 * 4;
+
+ nout = SCALAR_REGS(OUTPUT);
+ nin = SCALAR_REGS(INPUT);
+
+ /* for outermost block, 'inputs' are the actual shader INPUT
+ * register file. Reads from INPUT registers always go back to
+ * top block. For nested blocks, 'inputs' is used to track any
+ * TEMPORARY file register from one of the enclosing blocks that
+ * is read in this block.
+ */
+ if (!ctx->block) {
+ /* NOTE: fragment shaders actually have two inputs (r0.xy, the
+ * position)
+ */
+ if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+ int n = 2;
+ if (ctx->info.reads_position)
+ n += 4;
+ if (ctx->info.uses_frontface)
+ n += 4;
+ nin = MAX2(n, nin);
+ nout += ARRAY_SIZE(ctx->kill);
+ }
+ } else {
+ nin = ntmp;
+ }
+
+ block = ir3_block_create(ctx->ir, ntmp, nin, nout);
+
+ if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block)
+ block->noutputs -= ARRAY_SIZE(ctx->kill);
+
+ block->parent = ctx->block;
+ ctx->block = block;
+
+ return block;
+}
+
+static void
+pop_block(struct ir3_compile_context *ctx)
+{
+ ctx->block = ctx->block->parent;
+ compile_assert(ctx, ctx->block);
+}
+
+static struct ir3_instruction *
+create_output(struct ir3_block *block, struct ir3_instruction *instr,
+ unsigned n)
+{
+ struct ir3_instruction *out;
+
+ out = ir3_instr_create(block, -1, OPC_META_OUTPUT);
+ out->inout.block = block;
+ ir3_reg_create(out, n, 0);
+ if (instr)
+ ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr;
+
+ return out;
+}
+
+static struct ir3_instruction *
+create_input(struct ir3_block *block, struct ir3_instruction *instr,
+ unsigned n)
+{
+ struct ir3_instruction *in;
+
+ in = ir3_instr_create(block, -1, OPC_META_INPUT);
+ in->inout.block = block;
+ ir3_reg_create(in, n, 0);
+ if (instr)
+ ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;
+
+ return in;
+}
+
+static struct ir3_instruction *
+block_input(struct ir3_block *block, unsigned n)
+{
+ /* references to INPUT register file always go back up to
+ * top level:
+ */
+ if (block->parent)
+ return block_input(block->parent, n);
+ return block->inputs[n];
+}
+
+/* return temporary in scope, creating a meta-input node if needed
+ * to track block inputs
+ */
+static struct ir3_instruction *
+block_temporary(struct ir3_block *block, unsigned n)
+{
+ /* references to TEMPORARY register file, find the nearest
+ * enclosing block which has already assigned this temporary,
+ * creating meta-input instructions along the way to keep
+ * track of block inputs
+ */
+ if (block->parent && !block->temporaries[n]) {
+ /* if already have input for this block, reuse: */
+ if (!block->inputs[n])
+ block->inputs[n] = block_temporary(block->parent, n);
+
+ /* and create new input to return: */
+ return create_input(block, block->inputs[n], n);
+ }
+ return block->temporaries[n];
+}
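+
+/* eg. reading TEMP[n] inside a nested (if/else) block when it was
+ * last written in an enclosing block creates a meta-input in the
+ * nested block, with inputs[n] recording which outer value flows
+ * in, roughly:
+ *
+ *    outer:  mov ...  --> block->temporaries[n]
+ *    inner:  meta-input (block->inputs[n] -> the outer mov)
+ */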
+
+static struct ir3_instruction *
+create_immed(struct ir3_compile_context *ctx, float val)
+{
+ /* NOTE: *don't* use instr_create() here!
+ */
+ struct ir3_instruction *instr;
+ instr = ir3_instr_create(ctx->block, 1, 0);
+ instr->cat1.src_type = get_ftype(ctx);
+ instr->cat1.dst_type = get_ftype(ctx);
+ ir3_reg_create(instr, 0, 0);
+ ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val;
+ return instr;
+}
+
+static void
+ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+ const struct tgsi_dst_register *dst, unsigned chan)
+{
+ unsigned n = regid(dst->Index, chan);
+ unsigned idx = ctx->num_output_updates;
+
+ compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates));
+
+ /* NOTE: defer update of temporaries[idx] or output[idx]
+ * until instr_finish(), so that if the current instruction
+ * reads the same TEMP/OUT[] it gets the old value:
+ *
+ * bleh.. this might be a bit easier to just figure out
+ * in instr_finish(). But at that point we've already
+ * lost information about OUTPUT vs TEMPORARY register
+ * file..
+ */
+
+ switch (dst->File) {
+ case TGSI_FILE_OUTPUT:
+ compile_assert(ctx, n < ctx->block->noutputs);
+ ctx->output_updates[idx].instrp = &ctx->block->outputs[n];
+ ctx->output_updates[idx].instr = instr;
+ ctx->num_output_updates++;
+ break;
+ case TGSI_FILE_TEMPORARY:
+ compile_assert(ctx, n < ctx->block->ntemporaries);
+ ctx->output_updates[idx].instrp = &ctx->block->temporaries[n];
+ ctx->output_updates[idx].instr = instr;
+ ctx->num_output_updates++;
+ break;
+ case TGSI_FILE_ADDRESS:
+ compile_assert(ctx, n < 1);
+ ctx->output_updates[idx].instrp = &ctx->block->address;
+ ctx->output_updates[idx].instr = instr;
+ ctx->num_output_updates++;
+ break;
+ }
+}
+
+static void
+ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg,
+ const struct tgsi_src_register *src, unsigned chan)
+{
+ struct ir3_block *block = ctx->block;
+ unsigned n = regid(src->Index, chan);
+
+ switch (src->File) {
+ case TGSI_FILE_INPUT:
+ reg->flags |= IR3_REG_SSA;
+ reg->instr = block_input(ctx->block, n);
+ break;
+ case TGSI_FILE_OUTPUT:
+ /* really this should only happen in case of 'MOV_SAT OUT[n], ..',
+ * ie. for the following clamp instructions:
+ */
+ reg->flags |= IR3_REG_SSA;
+ reg->instr = block->outputs[n];
+ /* we don't have to worry about read from an OUTPUT that was
+ * assigned outside of the current block, because the _SAT
+ * clamp instructions will always be in the same block as
+ * the original instruction which wrote the OUTPUT
+ */
+ compile_assert(ctx, reg->instr);
+ break;
+ case TGSI_FILE_TEMPORARY:
+ reg->flags |= IR3_REG_SSA;
+ reg->instr = block_temporary(ctx->block, n);
+ break;
+ }
+
+ if ((reg->flags & IR3_REG_SSA) && !reg->instr) {
+ /* this can happen when registers (or components of a TGSI
+ * register) are used as src before they have been assigned
+ * (undefined contents). To avoid confusing the rest of the
+ * compiler, and to generally keep things peachy, substitute
+ * an instruction that sets the src to 0.0. Or to keep
+ * things undefined, I could plug in a random number? :-P
+ *
+ * NOTE: *don't* use instr_create() here!
+ */
+ reg->instr = create_immed(ctx, 0.0);
+ }
+}
+
+static struct ir3_register *
+add_dst_reg_wrmask(struct ir3_compile_context *ctx,
+ struct ir3_instruction *instr, const struct tgsi_dst_register *dst,
+ unsigned chan, unsigned wrmask)
+{
+ unsigned flags = 0, num = 0;
+ struct ir3_register *reg;
+
+ switch (dst->File) {
+ case TGSI_FILE_OUTPUT:
+ case TGSI_FILE_TEMPORARY:
+ /* uses SSA */
+ break;
+ case TGSI_FILE_ADDRESS:
+ flags |= IR3_REG_ADDR;
+ /* uses SSA */
+ break;
+ default:
+ compile_error(ctx, "unsupported dst register file: %s\n",
+ tgsi_file_name(dst->File));
+ break;
+ }
+
+ if (dst->Indirect)
+ flags |= IR3_REG_RELATIV;
+
+ reg = ir3_reg_create(instr, regid(num, chan), flags);
+
+ /* NOTE: do not call ssa_dst() if atomic.. vectorize()
+ * itself will call ssa_dst(). This is to filter out
+ * the (initially bogus) .x component dst which is
+ * created (but not necessarily used, ie. if the net
+ * result of the vector operation does not write to
+ * the .x component)
+ */
+
+ reg->wrmask = wrmask;
+ if (wrmask == 0x1) {
+ /* normal case */
+ if (!ctx->atomic)
+ ssa_dst(ctx, instr, dst, chan);
+ } else if ((dst->File == TGSI_FILE_TEMPORARY) ||
+ (dst->File == TGSI_FILE_OUTPUT) ||
+ (dst->File == TGSI_FILE_ADDRESS)) {
+ unsigned i;
+
+ /* if instruction writes multiple, we need to create
+ * some place-holder to collect the registers:
+ */
+ for (i = 0; i < 4; i++) {
+ if (wrmask & (1 << i)) {
+ struct ir3_instruction *collect =
+ ir3_instr_create(ctx->block, -1, OPC_META_FO);
+ collect->fo.off = i;
+ /* unused dst reg: */
+ ir3_reg_create(collect, 0, 0);
+ /* and src reg used to hold original instr */
+ ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr;
+ if (!ctx->atomic)
+ ssa_dst(ctx, collect, dst, chan+i);
+ }
+ }
+ }
+
+ return reg;
+}
+
+static struct ir3_register *
+add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+ const struct tgsi_dst_register *dst, unsigned chan)
+{
+ return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1);
+}
+
+static struct ir3_register *
+add_src_reg_wrmask(struct ir3_compile_context *ctx,
+ struct ir3_instruction *instr, const struct tgsi_src_register *src,
+ unsigned chan, unsigned wrmask)
+{
+ unsigned flags = 0, num = 0;
+ struct ir3_register *reg;
+ struct ir3_instruction *orig = NULL;
+
+ /* TODO we need to use a mov to temp for const >= 64.. or maybe
+ * we could use relative addressing..
+ */
+ compile_assert(ctx, src->Index < 64);
+
+ switch (src->File) {
+ case TGSI_FILE_IMMEDIATE:
+ /* TODO if possible, use actual immediate instead of const.. but
+ * TGSI has vec4 immediates, we can only embed scalar (of limited
+ * size, depending on instruction..)
+ */
+ flags |= IR3_REG_CONST;
+ num = src->Index + ctx->so->first_immediate;
+ break;
+ case TGSI_FILE_CONSTANT:
+ flags |= IR3_REG_CONST;
+ num = src->Index;
+ break;
+ case TGSI_FILE_OUTPUT:
+ /* NOTE: we should only end up w/ OUTPUT file for things like
+ * clamp()'ing saturated dst instructions
+ */
+ case TGSI_FILE_INPUT:
+ case TGSI_FILE_TEMPORARY:
+ /* uses SSA */
+ break;
+ default:
+ compile_error(ctx, "unsupported src register file: %s\n",
+ tgsi_file_name(src->File));
+ break;
+ }
+
+ if (src->Absolute)
+ flags |= IR3_REG_ABS;
+ if (src->Negate)
+ flags |= IR3_REG_NEGATE;
+
+ if (src->Indirect) {
+ flags |= IR3_REG_RELATIV;
+
+ /* shouldn't happen, and we can't cope with it below: */
+ compile_assert(ctx, wrmask == 0x1);
+
+ /* wrap in a meta-deref to track both the src and address: */
+ orig = instr;
+
+ instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF);
+ ir3_reg_create(instr, 0, 0);
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address;
+ }
+
+ reg = ir3_reg_create(instr, regid(num, chan), flags);
+
+ reg->wrmask = wrmask;
+ if (wrmask == 0x1) {
+ /* normal case */
+ ssa_src(ctx, reg, src, chan);
+ } else if ((src->File == TGSI_FILE_TEMPORARY) ||
+ (src->File == TGSI_FILE_OUTPUT) ||
+ (src->File == TGSI_FILE_INPUT)) {
+ struct ir3_instruction *collect;
+ unsigned i;
+
+ compile_assert(ctx, !src->Indirect);
+
+ /* if instruction reads multiple, we need to create
+ * some place-holder to collect the registers:
+ */
+ collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
+ ir3_reg_create(collect, 0, 0); /* unused dst reg */
+
+ for (i = 0; i < 4; i++) {
+ if (wrmask & (1 << i)) {
+ /* and src reg used to point to the original instr */
+ ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
+ src, chan + i);
+ } else if (wrmask & ~((1 << i) - 1)) {
+ /* if any remaining components, then dummy
+ * placeholder src reg to fill in the blanks:
+ */
+ ir3_reg_create(collect, 0, 0);
+ }
+ }
+
+ reg->flags |= IR3_REG_SSA;
+ reg->instr = collect;
+ }
+
+ if (src->Indirect) {
+ reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA);
+ reg->instr = instr;
+ }
+ return reg;
+}
+
+static struct ir3_register *
+add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+ const struct tgsi_src_register *src, unsigned chan)
+{
+ return add_src_reg_wrmask(ctx, instr, src, chan, 0x1);
+}
+
+static void
+src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
+{
+ src->File = dst->File;
+ src->Indirect = dst->Indirect;
+ src->Dimension = dst->Dimension;
+ src->Index = dst->Index;
+ src->Absolute = 0;
+ src->Negate = 0;
+ src->SwizzleX = TGSI_SWIZZLE_X;
+ src->SwizzleY = TGSI_SWIZZLE_Y;
+ src->SwizzleZ = TGSI_SWIZZLE_Z;
+ src->SwizzleW = TGSI_SWIZZLE_W;
+}
+
+/* Get internal-temp src/dst to use for a sequence of instructions
+ * generated by a single TGSI op.
+ */
+static struct tgsi_src_register *
+get_internal_temp(struct ir3_compile_context *ctx,
+ struct tgsi_dst_register *tmp_dst)
+{
+ struct tgsi_src_register *tmp_src;
+ int n;
+
+ tmp_dst->File = TGSI_FILE_TEMPORARY;
+ tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
+ tmp_dst->Indirect = 0;
+ tmp_dst->Dimension = 0;
+
+ /* assign next temporary: */
+ n = ctx->num_internal_temps++;
+ compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
+ tmp_src = &ctx->internal_temps[n];
+
+ tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;
+
+ src_from_dst(tmp_src, tmp_dst);
+
+ return tmp_src;
+}
+
+static inline bool
+is_const(struct tgsi_src_register *src)
+{
+ return (src->File == TGSI_FILE_CONSTANT) ||
+ (src->File == TGSI_FILE_IMMEDIATE);
+}
+
+static inline bool
+is_relative(struct tgsi_src_register *src)
+{
+ return src->Indirect;
+}
+
+static inline bool
+is_rel_or_const(struct tgsi_src_register *src)
+{
+ return is_relative(src) || is_const(src);
+}
+
+static type_t
+get_ftype(struct ir3_compile_context *ctx)
+{
+ return TYPE_F32;
+}
+
+static type_t
+get_utype(struct ir3_compile_context *ctx)
+{
+ return TYPE_U32;
+}
+
+static unsigned
+src_swiz(struct tgsi_src_register *src, int chan)
+{
+ switch (chan) {
+ case 0: return src->SwizzleX;
+ case 1: return src->SwizzleY;
+ case 2: return src->SwizzleZ;
+ case 3: return src->SwizzleW;
+ }
+ assert(0);
+ return 0;
+}
+
+/* for instructions that cannot take a const register as src, if needed
+ * generate a move to temporary gpr:
+ */
+static struct tgsi_src_register *
+get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
+{
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register *tmp_src;
+
+ compile_assert(ctx, is_rel_or_const(src));
+
+ tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+ create_mov(ctx, &tmp_dst, src);
+
+ return tmp_src;
+}
+
+static void
+get_immediate(struct ir3_compile_context *ctx,
+ struct tgsi_src_register *reg, uint32_t val)
+{
+ unsigned neg, swiz, idx, i;
+ /* actually maps 1:1 currently.. not sure if that is safe to rely on: */
+ static const unsigned swiz2tgsi[] = {
+ TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
+ };
+
+ for (i = 0; i < ctx->immediate_idx; i++) {
+ swiz = i % 4;
+ idx = i / 4;
+
+ if (ctx->so->immediates[idx].val[swiz] == val) {
+ neg = 0;
+ break;
+ }
+
+ if (ctx->so->immediates[idx].val[swiz] == -val) {
+ neg = 1;
+ break;
+ }
+ }
+
+ if (i == ctx->immediate_idx) {
+ /* need to generate a new immediate: */
+ swiz = i % 4;
+ idx = i / 4;
+ neg = 0;
+ ctx->so->immediates[idx].val[swiz] = val;
+ ctx->so->immediates_count = idx + 1;
+ ctx->immediate_idx++;
+ }
+
+ reg->File = TGSI_FILE_IMMEDIATE;
+ reg->Indirect = 0;
+ reg->Dimension = 0;
+ reg->Index = idx;
+ reg->Absolute = 0;
+ reg->Negate = neg;
+ reg->SwizzleX = swiz2tgsi[swiz];
+ reg->SwizzleY = swiz2tgsi[swiz];
+ reg->SwizzleZ = swiz2tgsi[swiz];
+ reg->SwizzleW = swiz2tgsi[swiz];
+}
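+
+/* eg. the first few distinct values requested get packed into the
+ * same vec4 immediate slot, and a repeated value reuses its slot:
+ *
+ *    get_immediate(ctx, &r, fui(1.0));   // IMM[0].x
+ *    get_immediate(ctx, &r, fui(0.5));   // IMM[0].y
+ *    get_immediate(ctx, &r, fui(1.0));   // IMM[0].x again
+ */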
+
+static void
+create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst,
+ struct tgsi_src_register *src)
+{
+ type_t type_mov = get_ftype(ctx);
+ unsigned i;
+
+ for (i = 0; i < 4; i++) {
+ /* move to destination: */
+ if (dst->WriteMask & (1 << i)) {
+ struct ir3_instruction *instr;
+
+ if (src->Absolute || src->Negate) {
+ /* can't have abs or neg on a mov instr, so use
+ * absneg.f instead to handle these cases:
+ */
+ instr = instr_create(ctx, 2, OPC_ABSNEG_F);
+ } else {
+ instr = instr_create(ctx, 1, 0);
+ instr->cat1.src_type = type_mov;
+ instr->cat1.dst_type = type_mov;
+ }
+
+ add_dst_reg(ctx, instr, dst, i);
+ add_src_reg(ctx, instr, src, src_swiz(src, i));
+ }
+ }
+}
+
+static void
+create_clamp(struct ir3_compile_context *ctx,
+ struct tgsi_dst_register *dst, struct tgsi_src_register *val,
+ struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
+{
+ struct ir3_instruction *instr;
+
+ instr = instr_create(ctx, 2, OPC_MAX_F);
+ vectorize(ctx, instr, dst, 2, val, 0, minval, 0);
+
+ instr = instr_create(ctx, 2, OPC_MIN_F);
+ vectorize(ctx, instr, dst, 2, val, 0, maxval, 0);
+}
+
+static void
+create_clamp_imm(struct ir3_compile_context *ctx,
+ struct tgsi_dst_register *dst,
+ uint32_t minval, uint32_t maxval)
+{
+ struct tgsi_src_register minconst, maxconst;
+ struct tgsi_src_register src;
+
+ src_from_dst(&src, dst);
+
+ get_immediate(ctx, &minconst, minval);
+ get_immediate(ctx, &maxconst, maxval);
+
+ create_clamp(ctx, dst, &src, &minconst, &maxconst);
+}
+
+static struct tgsi_dst_register *
+get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst)
+{
+ struct tgsi_dst_register *dst = &inst->Dst[0].Register;
+ unsigned i;
+ for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+ struct tgsi_src_register *src = &inst->Src[i].Register;
+ if ((src->File == dst->File) && (src->Index == dst->Index)) {
+ if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
+ (src->SwizzleX == TGSI_SWIZZLE_X) &&
+ (src->SwizzleY == TGSI_SWIZZLE_Y) &&
+ (src->SwizzleZ == TGSI_SWIZZLE_Z) &&
+ (src->SwizzleW == TGSI_SWIZZLE_W))
+ continue;
+ ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
+ ctx->tmp_dst.WriteMask = dst->WriteMask;
+ dst = &ctx->tmp_dst;
+ break;
+ }
+ }
+ return dst;
+}
+
+static void
+put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst,
+ struct tgsi_dst_register *dst)
+{
+ /* if necessary, add mov back into original dst: */
+ if (dst != &inst->Dst[0].Register) {
+ create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src);
+ }
+}
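+
+/* eg. for "ADD TEMP[0], TEMP[0].yzwx, ..." the dst aliases the src
+ * w/ a non-identity swizzle, so get_dst() redirects the write to an
+ * internal temp and put_dst() inserts the mov back to TEMP[0].
+ */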
+
+/* helper to generate the necessary repeat and/or additional instructions
+ * to turn a scalar instruction into a vector operation:
+ */
+static void
+vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+ struct tgsi_dst_register *dst, int nsrcs, ...)
+{
+ va_list ap;
+ int i, j, n = 0;
+
+ instr_atomic_start(ctx);
+
+ add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X);
+
+ va_start(ap, nsrcs);
+ for (j = 0; j < nsrcs; j++) {
+ struct tgsi_src_register *src =
+ va_arg(ap, struct tgsi_src_register *);
+ unsigned flags = va_arg(ap, unsigned);
+ struct ir3_register *reg;
+ if (flags & IR3_REG_IMMED) {
+ reg = ir3_reg_create(instr, 0, IR3_REG_IMMED);
+ /* this is an ugly cast.. should have put flags first! */
+ reg->iim_val = *(int *)&src;
+ } else {
+ reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X);
+ }
+ reg->flags |= flags & ~IR3_REG_NEGATE;
+ if (flags & IR3_REG_NEGATE)
+ reg->flags ^= IR3_REG_NEGATE;
+ }
+ va_end(ap);
+
+ for (i = 0; i < 4; i++) {
+ if (dst->WriteMask & (1 << i)) {
+ struct ir3_instruction *cur;
+
+ if (n++ == 0) {
+ cur = instr;
+ } else {
+ cur = instr_clone(ctx, instr);
+ }
+
+ ssa_dst(ctx, cur, dst, i);
+
+ /* fix-up dst register component: */
+ cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i);
+
+ /* fix-up src register component: */
+ va_start(ap, nsrcs);
+ for (j = 0; j < nsrcs; j++) {
+ struct ir3_register *reg = cur->regs[j+1];
+ struct tgsi_src_register *src =
+ va_arg(ap, struct tgsi_src_register *);
+ unsigned flags = va_arg(ap, unsigned);
+ if (reg->flags & IR3_REG_SSA) {
+ ssa_src(ctx, reg, src, src_swiz(src, i));
+ } else if (!(flags & IR3_REG_IMMED)) {
+ reg->num = regid(reg->num >> 2, src_swiz(src, i));
+ }
+ }
+ va_end(ap);
+ }
+ }
+
+ instr_atomic_end(ctx);
+}
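+
+/* eg. a TGSI "ADD dst.xyz, a, b" comes out of vectorize() roughly
+ * as:
+ *
+ *    add.f dst.x, a.x, b.x    (the original instr)
+ *    add.f dst.y, a.y, b.y    (clone, components fixed up)
+ *    add.f dst.z, a.z, b.z    (clone, components fixed up)
+ *
+ * all inside one atomic group, so the srcs still see values from
+ * before the group.
+ */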
+
+/*
+ * Handlers for TGSI instructions which do not have a 1:1 mapping to
+ * native instructions:
+ */
+
+static void
+trans_clamp(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct tgsi_dst_register *dst = get_dst(ctx, inst);
+ struct tgsi_src_register *src0 = &inst->Src[0].Register;
+ struct tgsi_src_register *src1 = &inst->Src[1].Register;
+ struct tgsi_src_register *src2 = &inst->Src[2].Register;
+
+ create_clamp(ctx, dst, src0, src1, src2);
+
+ put_dst(ctx, inst, dst);
+}
+
+/* ARL(x) = x, but mova from hrN.x to a0.. */
+static void
+trans_arl(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct ir3_instruction *instr;
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register *tmp_src;
+ struct tgsi_dst_register *dst = &inst->Dst[0].Register;
+ struct tgsi_src_register *src = &inst->Src[0].Register;
+ unsigned chan = src->SwizzleX;
+
+ compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);
+
+ /* NOTE: we allocate a temporary from a flat register
+ * namespace (ignoring half vs full). It turns out
+ * not to really matter since registers get reassigned
+ * later in ir3_ra which (hopefully!) can deal a bit
+ * better with mixed half and full precision.
+ */
+ tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+ /* cov.f{32,16}s16 Rtmp, Rsrc */
+ instr = instr_create(ctx, 1, 0);
+ instr->cat1.src_type = get_ftype(ctx);
+ instr->cat1.dst_type = TYPE_S16;
+ add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
+ add_src_reg(ctx, instr, src, chan);
+
+ /* shl.b Rtmp, Rtmp, 2 */
+ instr = instr_create(ctx, 2, OPC_SHL_B);
+ add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
+ add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
+ ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
+
+ /* mova a0, Rtmp */
+ instr = instr_create(ctx, 1, 0);
+ instr->cat1.src_type = TYPE_S16;
+ instr->cat1.dst_type = TYPE_S16;
+ add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
+ add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
+}
+
+/*
+ * texture fetch/sample instructions:
+ */
+
+struct tex_info {
+ int8_t order[4];
+ unsigned src_wrmask, flags;
+};
+
+static const struct tex_info *
+get_tex_info(struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ static const struct tex_info tex1d = {
+ .order = { 0, -1, -1, -1 }, /* coord.x */
+ .src_wrmask = TGSI_WRITEMASK_XY,
+ .flags = 0,
+ };
+ static const struct tex_info tex1ds = {
+ .order = { 0, -1, 2, -1 }, /* coord.xz */
+ .src_wrmask = TGSI_WRITEMASK_XYZ,
+ .flags = IR3_INSTR_S,
+ };
+ static const struct tex_info tex2d = {
+ .order = { 0, 1, -1, -1 }, /* coord.xy */
+ .src_wrmask = TGSI_WRITEMASK_XY,
+ .flags = 0,
+ };
+ static const struct tex_info tex2ds = {
+ .order = { 0, 1, 2, -1 }, /* coord.xyz */
+ .src_wrmask = TGSI_WRITEMASK_XYZ,
+ .flags = IR3_INSTR_S,
+ };
+ static const struct tex_info tex3d = {
+ .order = { 0, 1, 2, -1 }, /* coord.xyz */
+ .src_wrmask = TGSI_WRITEMASK_XYZ,
+ .flags = IR3_INSTR_3D,
+ };
+ static const struct tex_info tex3ds = {
+ .order = { 0, 1, 2, 3 }, /* coord.xyzw */
+ .src_wrmask = TGSI_WRITEMASK_XYZW,
+ .flags = IR3_INSTR_S | IR3_INSTR_3D,
+ };
+ static const struct tex_info txp1d = {
+ .order = { 0, -1, 3, -1 }, /* coord.xw */
+ .src_wrmask = TGSI_WRITEMASK_XYZ,
+ .flags = IR3_INSTR_P,
+ };
+ static const struct tex_info txp1ds = {
+ .order = { 0, -1, 2, 3 }, /* coord.xzw */
+ .src_wrmask = TGSI_WRITEMASK_XYZW,
+ .flags = IR3_INSTR_P | IR3_INSTR_S,
+ };
+ static const struct tex_info txp2d = {
+ .order = { 0, 1, 3, -1 }, /* coord.xyw */
+ .src_wrmask = TGSI_WRITEMASK_XYZ,
+ .flags = IR3_INSTR_P,
+ };
+ static const struct tex_info txp2ds = {
+ .order = { 0, 1, 2, 3 }, /* coord.xyzw */
+ .src_wrmask = TGSI_WRITEMASK_XYZW,
+ .flags = IR3_INSTR_P | IR3_INSTR_S,
+ };
+ static const struct tex_info txp3d = {
+ .order = { 0, 1, 2, 3 }, /* coord.xyzw */
+ .src_wrmask = TGSI_WRITEMASK_XYZW,
+ .flags = IR3_INSTR_P | IR3_INSTR_3D,
+ };
+
+ unsigned tex = inst->Texture.Texture;
+
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_TEX:
+ switch (tex) {
+ case TGSI_TEXTURE_1D:
+ return &tex1d;
+ case TGSI_TEXTURE_SHADOW1D:
+ return &tex1ds;
+ case TGSI_TEXTURE_2D:
+ case TGSI_TEXTURE_RECT:
+ return &tex2d;
+ case TGSI_TEXTURE_SHADOW2D:
+ case TGSI_TEXTURE_SHADOWRECT:
+ return &tex2ds;
+ case TGSI_TEXTURE_3D:
+ case TGSI_TEXTURE_CUBE:
+ return &tex3d;
+ case TGSI_TEXTURE_SHADOWCUBE:
+ return &tex3ds;
+ default:
+ compile_error(ctx, "unknown texture type: %s\n",
+ tgsi_texture_names[tex]);
+ return NULL;
+ }
+ break;
+ case TGSI_OPCODE_TXP:
+ switch (tex) {
+ case TGSI_TEXTURE_1D:
+ return &txp1d;
+ case TGSI_TEXTURE_SHADOW1D:
+ return &txp1ds;
+ case TGSI_TEXTURE_2D:
+ case TGSI_TEXTURE_RECT:
+ return &txp2d;
+ case TGSI_TEXTURE_SHADOW2D:
+ case TGSI_TEXTURE_SHADOWRECT:
+ return &txp2ds;
+ case TGSI_TEXTURE_3D:
+ case TGSI_TEXTURE_CUBE:
+ return &txp3d;
+ default:
+ compile_error(ctx, "unknown texture type: %s\n",
+ tgsi_texture_names[tex]);
+ break;
+ }
+ break;
+ }
+ compile_assert(ctx, 0);
+ return NULL;
+}
+
+static struct tgsi_src_register *
+get_tex_coord(struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst,
+ const struct tex_info *tinf)
+{
+ struct tgsi_src_register *coord = &inst->Src[0].Register;
+ struct ir3_instruction *instr;
+ unsigned tex = inst->Texture.Texture;
+ bool needs_mov = false;
+ unsigned i;
+
+ /* cat5 instructions don't seem to handle const or relative: */
+ if (is_rel_or_const(coord))
+ needs_mov = true;
+
+ /* 1D textures we fix up w/ 0.5 as 2nd coord: */
+ if ((tex == TGSI_TEXTURE_1D) || (tex == TGSI_TEXTURE_SHADOW1D))
+ needs_mov = true;
+
+ /* The texture sample instructions need the coord in successive
+ * registers/components (ie. src.xy but not src.yx). And TXP
+ * needs the .w component in .z for 2D.. so in some cases we
+ * might need to emit some mov instructions to shuffle things
+ * around:
+ */
+ for (i = 1; (i < 4) && (tinf->order[i] >= 0) && !needs_mov; i++)
+ if (src_swiz(coord, i) != (src_swiz(coord, 0) + tinf->order[i]))
+ needs_mov = true;
+
+ if (needs_mov) {
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register *tmp_src;
+ unsigned j;
+
+ type_t type_mov = get_ftype(ctx);
+
+ /* need to move things around: */
+ tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+ for (j = 0; j < 4; j++) {
+ if (tinf->order[j] < 0)
+ continue;
+ instr = instr_create(ctx, 1, 0); /* mov */
+ instr->cat1.src_type = type_mov;
+ instr->cat1.dst_type = type_mov;
+ add_dst_reg(ctx, instr, &tmp_dst, j);
+ add_src_reg(ctx, instr, coord,
+ src_swiz(coord, tinf->order[j]));
+ }
+
+ /* fix up .y coord: */
+ if ((tex == TGSI_TEXTURE_1D) ||
+ (tex == TGSI_TEXTURE_SHADOW1D)) {
+ instr = instr_create(ctx, 1, 0); /* mov */
+ instr->cat1.src_type = type_mov;
+ instr->cat1.dst_type = type_mov;
+ add_dst_reg(ctx, instr, &tmp_dst, 1); /* .y */
+ ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = 0.5;
+ }
+
+ coord = tmp_src;
+ }
+
+ return coord;
+}
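+
+/* eg. a 2D TEX with coord swizzled .xy passes straight through,
+ * while .yx (or a const/relative coord) forces the mov to a
+ * scratch temp first, since the sam src must be an ascending run
+ * of components.
+ */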
+
+static void
+trans_samp(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct ir3_instruction *instr;
+ struct tgsi_dst_register *dst = &inst->Dst[0].Register;
+ struct tgsi_src_register *coord;
+ struct tgsi_src_register *samp = &inst->Src[1].Register;
+ const struct tex_info *tinf;
+
+ tinf = get_tex_info(ctx, inst);
+ coord = get_tex_coord(ctx, inst, tinf);
+
+ instr = instr_create(ctx, 5, t->opc);
+ instr->cat5.type = get_ftype(ctx);
+ instr->cat5.samp = samp->Index;
+ instr->cat5.tex = samp->Index;
+ instr->flags |= tinf->flags;
+
+ add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
+ add_src_reg_wrmask(ctx, instr, coord, coord->SwizzleX, tinf->src_wrmask);
+}
+
+/*
+ * SEQ(a,b) = (a == b) ? 1.0 : 0.0
+ * cmps.f.eq tmp0, a, b
+ * cov.u16f16 dst, tmp0
+ *
+ * SNE(a,b) = (a != b) ? 1.0 : 0.0
+ * cmps.f.ne tmp0, a, b
+ * cov.u16f16 dst, tmp0
+ *
+ * SGE(a,b) = (a >= b) ? 1.0 : 0.0
+ * cmps.f.ge tmp0, a, b
+ * cov.u16f16 dst, tmp0
+ *
+ * SLE(a,b) = (a <= b) ? 1.0 : 0.0
+ * cmps.f.le tmp0, a, b
+ * cov.u16f16 dst, tmp0
+ *
+ * SGT(a,b) = (a > b) ? 1.0 : 0.0
+ * cmps.f.gt tmp0, a, b
+ * cov.u16f16 dst, tmp0
+ *
+ * SLT(a,b) = (a < b) ? 1.0 : 0.0
+ * cmps.f.lt tmp0, a, b
+ * cov.u16f16 dst, tmp0
+ *
+ * CMP(a,b,c) = (a < 0.0) ? b : c
+ * cmps.f.lt tmp0, a, {0.0}
+ * sel.b16 dst, b, tmp0, c
+ */
+static void
+trans_cmp(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct ir3_instruction *instr;
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register *tmp_src;
+ struct tgsi_src_register constval0;
+ /* final instruction for CMP() uses orig src1 and src2: */
+ struct tgsi_dst_register *dst = get_dst(ctx, inst);
+ struct tgsi_src_register *a0, *a1, *a2;
+ unsigned condition;
+
+ tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+ a0 = &inst->Src[0].Register; /* a */
+ a1 = &inst->Src[1].Register; /* b */
+
+ switch (t->tgsi_opc) {
+ case TGSI_OPCODE_SEQ:
+ case TGSI_OPCODE_FSEQ:
+ condition = IR3_COND_EQ;
+ break;
+ case TGSI_OPCODE_SNE:
+ case TGSI_OPCODE_FSNE:
+ condition = IR3_COND_NE;
+ break;
+ case TGSI_OPCODE_SGE:
+ case TGSI_OPCODE_FSGE:
+ condition = IR3_COND_GE;
+ break;
+ case TGSI_OPCODE_SLT:
+ case TGSI_OPCODE_FSLT:
+ condition = IR3_COND_LT;
+ break;
+ case TGSI_OPCODE_SLE:
+ condition = IR3_COND_LE;
+ break;
+ case TGSI_OPCODE_SGT:
+ condition = IR3_COND_GT;
+ break;
+ case TGSI_OPCODE_CMP:
+ get_immediate(ctx, &constval0, fui(0.0));
+ a0 = &inst->Src[0].Register; /* a */
+ a1 = &constval0; /* {0.0} */
+ condition = IR3_COND_LT;
+ break;
+ default:
+ compile_assert(ctx, 0);
+ return;
+ }
+
+ if (is_const(a0) && is_const(a1))
+ a0 = get_unconst(ctx, a0);
+
+ /* cmps.f.<cond> tmp, a0, a1 */
+ instr = instr_create(ctx, 2, OPC_CMPS_F);
+ instr->cat2.condition = condition;
+ vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
+
+ switch (t->tgsi_opc) {
+ case TGSI_OPCODE_SEQ:
+ case TGSI_OPCODE_FSEQ:
+ case TGSI_OPCODE_SGE:
+ case TGSI_OPCODE_FSGE:
+ case TGSI_OPCODE_SLE:
+ case TGSI_OPCODE_SNE:
+ case TGSI_OPCODE_FSNE:
+ case TGSI_OPCODE_SGT:
+ case TGSI_OPCODE_SLT:
+ case TGSI_OPCODE_FSLT:
+ /* cov.u16f16 dst, tmp0 */
+ instr = instr_create(ctx, 1, 0);
+ instr->cat1.src_type = get_utype(ctx);
+ instr->cat1.dst_type = get_ftype(ctx);
+ vectorize(ctx, instr, dst, 1, tmp_src, 0);
+ break;
+ case TGSI_OPCODE_CMP:
+ a1 = &inst->Src[1].Register;
+ a2 = &inst->Src[2].Register;
+ /* sel.{b32,b16} dst, src1, tmp, src2 */
+ instr = instr_create(ctx, 3, OPC_SEL_B32);
+ vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
+
+ break;
+ }
+
+ put_dst(ctx, inst, dst);
+}
+
+/*
+ * USNE(a,b) = (a != b) ? 1 : 0
+ * cmps.u32.ne dst, a, b
+ *
+ * USEQ(a,b) = (a == b) ? 1 : 0
+ * cmps.u32.eq dst, a, b
+ *
+ * ISGE(a,b) = (a >= b) ? 1 : 0
+ * cmps.s32.ge dst, a, b
+ *
+ * USGE(a,b) = (a >= b) ? 1 : 0
+ * cmps.u32.ge dst, a, b
+ *
+ * ISLT(a,b) = (a < b) ? 1 : 0
+ * cmps.s32.lt dst, a, b
+ *
+ * USLT(a,b) = (a < b) ? 1 : 0
+ * cmps.u32.lt dst, a, b
+ *
+ * UCMP(a,b,c) = (a < 0) ? b : c
+ * cmps.u32.lt tmp0, a, {0}
+ * sel.b16 dst, b, tmp0, c
+ */
+static void
+trans_icmp(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct ir3_instruction *instr;
+ struct tgsi_dst_register *dst = get_dst(ctx, inst);
+ struct tgsi_src_register constval0;
+ struct tgsi_src_register *a0, *a1, *a2;
+ unsigned condition;
+
+ a0 = &inst->Src[0].Register; /* a */
+ a1 = &inst->Src[1].Register; /* b */
+
+ switch (t->tgsi_opc) {
+ case TGSI_OPCODE_USNE:
+ condition = IR3_COND_NE;
+ break;
+ case TGSI_OPCODE_USEQ:
+ condition = IR3_COND_EQ;
+ break;
+ case TGSI_OPCODE_ISGE:
+ case TGSI_OPCODE_USGE:
+ condition = IR3_COND_GE;
+ break;
+ case TGSI_OPCODE_ISLT:
+ case TGSI_OPCODE_USLT:
+ condition = IR3_COND_LT;
+ break;
+ case TGSI_OPCODE_UCMP:
+ get_immediate(ctx, &constval0, 0);
+ a0 = &inst->Src[0].Register; /* a */
+ a1 = &constval0; /* {0} */
+ condition = IR3_COND_LT;
+ break;
+
+ default:
+ compile_assert(ctx, 0);
+ return;
+ }
+
+ if (is_const(a0) && is_const(a1))
+ a0 = get_unconst(ctx, a0);
+
+ if (t->tgsi_opc == TGSI_OPCODE_UCMP) {
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register *tmp_src;
+ tmp_src = get_internal_temp(ctx, &tmp_dst);
+ /* cmps.u32.lt tmp, a0, a1 */
+ instr = instr_create(ctx, 2, t->opc);
+ instr->cat2.condition = condition;
+ vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
+
+ a1 = &inst->Src[1].Register;
+ a2 = &inst->Src[2].Register;
+ /* sel.{b32,b16} dst, src1, tmp, src2 */
+ instr = instr_create(ctx, 3, OPC_SEL_B32);
+ vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
+ } else {
+ /* cmps.{u32,s32}.<cond> dst, a0, a1 */
+ instr = instr_create(ctx, 2, t->opc);
+ instr->cat2.condition = condition;
+ vectorize(ctx, instr, dst, 2, a0, 0, a1, 0);
+ }
+ put_dst(ctx, inst, dst);
+}
+
+/*
+ * Conditional / Flow control
+ */
+
+static void
+push_branch(struct ir3_compile_context *ctx, bool inv,
+ struct ir3_instruction *instr, struct ir3_instruction *cond)
+{
+ unsigned int idx = ctx->branch_count++;
+ compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch));
+ ctx->branch[idx].instr = instr;
+ ctx->branch[idx].inv = inv;
+ /* else side of branch has same condition: */
+ if (!inv)
+ ctx->branch[idx].cond = cond;
+}
+
+static struct ir3_instruction *
+pop_branch(struct ir3_compile_context *ctx)
+{
+ unsigned int idx = --ctx->branch_count;
+ return ctx->branch[idx].instr;
+}
+
+static void
+trans_if(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct ir3_instruction *instr, *cond;
+ struct tgsi_src_register *src = &inst->Src[0].Register;
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register *tmp_src;
+ struct tgsi_src_register constval;
+
+ get_immediate(ctx, &constval, fui(0.0));
+ tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+ if (is_const(src))
+ src = get_unconst(ctx, src);
+
+ /* cmps.f.ne tmp0, b, {0.0} */
+ instr = instr_create(ctx, 2, OPC_CMPS_F);
+ add_dst_reg(ctx, instr, &tmp_dst, 0);
+ add_src_reg(ctx, instr, src, src->SwizzleX);
+ add_src_reg(ctx, instr, &constval, constval.SwizzleX);
+ instr->cat2.condition = IR3_COND_NE;
+
+ compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */
+ cond = instr->regs[1]->instr;
+
+ /* meta:flow tmp0 */
+ instr = instr_create(ctx, -1, OPC_META_FLOW);
+ ir3_reg_create(instr, 0, 0); /* dummy dst */
+ add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);
+
+ push_branch(ctx, false, instr, cond);
+ instr->flow.if_block = push_block(ctx);
+}
+
+static void
+trans_else(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct ir3_instruction *instr;
+
+ pop_block(ctx);
+
+ instr = pop_branch(ctx);
+
+ compile_assert(ctx, (instr->category == -1) &&
+ (instr->opc == OPC_META_FLOW));
+
+ push_branch(ctx, true, instr, NULL);
+ instr->flow.else_block = push_block(ctx);
+}
+
+static struct ir3_instruction *
+find_temporary(struct ir3_block *block, unsigned n)
+{
+ if (block->parent && !block->temporaries[n])
+ return find_temporary(block->parent, n);
+ return block->temporaries[n];
+}
+
+static struct ir3_instruction *
+find_output(struct ir3_block *block, unsigned n)
+{
+ if (block->parent && !block->outputs[n])
+ return find_output(block->parent, n);
+ return block->outputs[n];
+}
+
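+/* A phi takes the flow condition plus the if-side and else-side values
+ * as srcs, ie. regs[1..3] = (cond, a, b), with regs[0] a dummy dst:
+ */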
+static struct ir3_instruction *
+create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond,
+ struct ir3_instruction *a, struct ir3_instruction *b)
+{
+ struct ir3_instruction *phi;
+
+ compile_assert(ctx, cond);
+
+ /* Either side of the condition could be null.. which
+ * indicates a variable written on only one side of the
+ * branch. Normally this should only be variables not
+ * used outside of that side of the branch. So we could
+ * just 'return a ? a : b;' in that case. But for better
+ * defined undefined behavior we just stick in imm{0.0}.
+ * In the common case of a value only used within the
+ * one side of the branch, the PHI instruction will not
+ * get scheduled.
+ */
+ if (!a)
+ a = create_immed(ctx, 0.0);
+ if (!b)
+ b = create_immed(ctx, 0.0);
+
+ phi = instr_create(ctx, -1, OPC_META_PHI);
+ ir3_reg_create(phi, 0, 0); /* dummy dst */
+ ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond;
+ ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a;
+ ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b;
+
+ return phi;
+}
+
+static void
+trans_endif(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct ir3_instruction *instr;
+ struct ir3_block *ifb, *elseb;
+ struct ir3_instruction **ifout, **elseout;
+ unsigned i, ifnout = 0, elsenout = 0;
+
+ pop_block(ctx);
+
+ instr = pop_branch(ctx);
+
+ compile_assert(ctx, (instr->category == -1) &&
+ (instr->opc == OPC_META_FLOW));
+
+ ifb = instr->flow.if_block;
+ elseb = instr->flow.else_block;
+ /* if there is no else block, the parent block is used for the
+ * branch-not-taken src of the PHI instructions:
+ */
+ if (!elseb)
+ elseb = ifb->parent;
+
+ /* worst case sizes: */
+ ifnout = ifb->ntemporaries + ifb->noutputs;
+ elsenout = elseb->ntemporaries + elseb->noutputs;
+
+ ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout);
+ if (elseb != ifb->parent)
+ elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout);
+
+ ifnout = 0;
+ elsenout = 0;
+
+ /* generate PHI instructions for any temporaries written: */
+ for (i = 0; i < ifb->ntemporaries; i++) {
+ struct ir3_instruction *a = ifb->temporaries[i];
+ struct ir3_instruction *b = elseb->temporaries[i];
+
+ /* if temporary written in if-block, or if else block
+ * is present and temporary written in else-block:
+ */
+ if (a || ((elseb != ifb->parent) && b)) {
+ struct ir3_instruction *phi;
+
+ /* if only written on one side, find the closest
+ * enclosing update on other side:
+ */
+ if (!a)
+ a = find_temporary(ifb, i);
+ if (!b)
+ b = find_temporary(elseb, i);
+
+ ifout[ifnout] = a;
+ a = create_output(ifb, a, ifnout++);
+
+ if (elseb != ifb->parent) {
+ elseout[elsenout] = b;
+ b = create_output(elseb, b, elsenout++);
+ }
+
+ phi = create_phi(ctx, instr, a, b);
+ ctx->block->temporaries[i] = phi;
+ }
+ }
+
+ compile_assert(ctx, ifb->noutputs == elseb->noutputs);
+
+ /* .. and any outputs written: */
+ for (i = 0; i < ifb->noutputs; i++) {
+ struct ir3_instruction *a = ifb->outputs[i];
+ struct ir3_instruction *b = elseb->outputs[i];
+
+ /* if output written in if-block, or if else block
+ * is present and output written in else-block:
+ */
+ if (a || ((elseb != ifb->parent) && b)) {
+ struct ir3_instruction *phi;
+
+ /* if only written on one side, find the closest
+ * enclosing update on other side:
+ */
+ if (!a)
+ a = find_output(ifb, i);
+ if (!b)
+ b = find_output(elseb, i);
+
+ ifout[ifnout] = a;
+ a = create_output(ifb, a, ifnout++);
+
+ if (elseb != ifb->parent) {
+ elseout[elsenout] = b;
+ b = create_output(elseb, b, elsenout++);
+ }
+
+ phi = create_phi(ctx, instr, a, b);
+ ctx->block->outputs[i] = phi;
+ }
+ }
+
+ ifb->noutputs = ifnout;
+ ifb->outputs = ifout;
+
+ if (elseb != ifb->parent) {
+ elseb->noutputs = elsenout;
+ elseb->outputs = elseout;
+ }
+
+ // TODO maybe we want to compact block->inputs?
+}
+
+/*
+ * Kill
+ */
+
+static void
+trans_kill(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct ir3_instruction *instr, *immed, *cond = NULL;
+ bool inv = false;
+
+ switch (t->tgsi_opc) {
+ case TGSI_OPCODE_KILL:
+ /* unconditional kill, use enclosing if condition: */
+ if (ctx->branch_count > 0) {
+ unsigned int idx = ctx->branch_count - 1;
+ cond = ctx->branch[idx].cond;
+ inv = ctx->branch[idx].inv;
+ } else {
+ cond = create_immed(ctx, 1.0);
+ }
+
+ break;
+ }
+
+ compile_assert(ctx, cond);
+
+ immed = create_immed(ctx, 0.0);
+
+ /* cmps.f.ne p0.x, cond, {0.0} */
+ instr = instr_create(ctx, 2, OPC_CMPS_F);
+ instr->cat2.condition = IR3_COND_NE;
+ ir3_reg_create(instr, regid(REG_P0, 0), 0);
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
+ cond = instr;
+
+ /* kill p0.x */
+ instr = instr_create(ctx, 0, OPC_KILL);
+ instr->cat0.inv = inv;
+ ir3_reg_create(instr, 0, 0); /* dummy dst */
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
+
+ ctx->kill[ctx->kill_count++] = instr;
+}
+
+/*
+ * Kill-If
+ */
+
+static void
+trans_killif(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct tgsi_src_register *src = &inst->Src[0].Register;
+ struct ir3_instruction *instr, *immed, *cond = NULL;
+ bool inv = false;
+
+ immed = create_immed(ctx, 0.0);
+
+	/* cmps.f.ne p0.x, {0.0}, cond */
+ instr = instr_create(ctx, 2, OPC_CMPS_F);
+ instr->cat2.condition = IR3_COND_NE;
+ ir3_reg_create(instr, regid(REG_P0, 0), 0);
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
+ add_src_reg(ctx, instr, src, src->SwizzleX);
+
+ cond = instr;
+
+ /* kill p0.x */
+ instr = instr_create(ctx, 0, OPC_KILL);
+ instr->cat0.inv = inv;
+ ir3_reg_create(instr, 0, 0); /* dummy dst */
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
+
+ ctx->kill[ctx->kill_count++] = instr;
+}
+
+/*
+ * I2F / U2F / F2I / F2U
+ */
+
+static void
+trans_cov(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct ir3_instruction *instr;
+ struct tgsi_dst_register *dst = get_dst(ctx, inst);
+ struct tgsi_src_register *src = &inst->Src[0].Register;
+
+	/* cov.f32s32 dst, tmp0 */
+ instr = instr_create(ctx, 1, 0);
+ switch (t->tgsi_opc) {
+ case TGSI_OPCODE_U2F:
+ instr->cat1.src_type = TYPE_U32;
+ instr->cat1.dst_type = TYPE_F32;
+ break;
+ case TGSI_OPCODE_I2F:
+ instr->cat1.src_type = TYPE_S32;
+ instr->cat1.dst_type = TYPE_F32;
+ break;
+ case TGSI_OPCODE_F2U:
+ instr->cat1.src_type = TYPE_F32;
+ instr->cat1.dst_type = TYPE_U32;
+ break;
+ case TGSI_OPCODE_F2I:
+ instr->cat1.src_type = TYPE_F32;
+ instr->cat1.dst_type = TYPE_S32;
+ break;
+	}
+	vectorize(ctx, instr, dst, 1, src, 0);
+	put_dst(ctx, inst, dst);
+}
+
+/*
+ * Handlers for TGSI instructions which do have 1:1 mapping to native
+ * instructions:
+ */
+
+static void
+instr_cat0(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ instr_create(ctx, 0, t->opc);
+}
+
+static void
+instr_cat1(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct tgsi_dst_register *dst = get_dst(ctx, inst);
+ struct tgsi_src_register *src = &inst->Src[0].Register;
+ create_mov(ctx, dst, src);
+ put_dst(ctx, inst, dst);
+}
+
+static void
+instr_cat2(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct tgsi_dst_register *dst = get_dst(ctx, inst);
+ struct tgsi_src_register *src0 = &inst->Src[0].Register;
+ struct tgsi_src_register *src1 = &inst->Src[1].Register;
+ struct ir3_instruction *instr;
+ unsigned src0_flags = 0, src1_flags = 0;
+
+ switch (t->tgsi_opc) {
+ case TGSI_OPCODE_ABS:
+ case TGSI_OPCODE_IABS:
+ src0_flags = IR3_REG_ABS;
+ break;
+ case TGSI_OPCODE_SUB:
+ case TGSI_OPCODE_INEG:
+ src1_flags = IR3_REG_NEGATE;
+ break;
+ }
+
+ switch (t->opc) {
+ case OPC_ABSNEG_F:
+ case OPC_ABSNEG_S:
+ case OPC_CLZ_B:
+ case OPC_CLZ_S:
+ case OPC_SIGN_F:
+ case OPC_FLOOR_F:
+ case OPC_CEIL_F:
+ case OPC_RNDNE_F:
+ case OPC_RNDAZ_F:
+ case OPC_TRUNC_F:
+ case OPC_NOT_B:
+ case OPC_BFREV_B:
+ case OPC_SETRM:
+ case OPC_CBITS_B:
+ /* these only have one src reg */
+ instr = instr_create(ctx, 2, t->opc);
+ vectorize(ctx, instr, dst, 1, src0, src0_flags);
+ break;
+ default:
+ if (is_const(src0) && is_const(src1))
+ src0 = get_unconst(ctx, src0);
+
+ instr = instr_create(ctx, 2, t->opc);
+ vectorize(ctx, instr, dst, 2, src0, src0_flags,
+ src1, src1_flags);
+ break;
+ }
+
+ put_dst(ctx, inst, dst);
+}
+
+static void
+instr_cat3(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct tgsi_dst_register *dst = get_dst(ctx, inst);
+ struct tgsi_src_register *src0 = &inst->Src[0].Register;
+ struct tgsi_src_register *src1 = &inst->Src[1].Register;
+ struct ir3_instruction *instr;
+
+ /* in particular, can't handle const for src1 for cat3..
+ * for mad, we can swap first two src's if needed:
+ */
+ if (is_rel_or_const(src1)) {
+ if (is_mad(t->opc) && !is_rel_or_const(src0)) {
+ struct tgsi_src_register *tmp;
+ tmp = src0;
+ src0 = src1;
+ src1 = tmp;
+ } else {
+ src1 = get_unconst(ctx, src1);
+ }
+ }
+
+ instr = instr_create(ctx, 3, t->opc);
+ vectorize(ctx, instr, dst, 3, src0, 0, src1, 0,
+ &inst->Src[2].Register, 0);
+ put_dst(ctx, inst, dst);
+}
+
+static void
+instr_cat4(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct tgsi_dst_register *dst = get_dst(ctx, inst);
+ struct tgsi_src_register *src = &inst->Src[0].Register;
+ struct ir3_instruction *instr;
+ unsigned i;
+
+ /* seems like blob compiler avoids const as src.. */
+ if (is_const(src))
+ src = get_unconst(ctx, src);
+
+ /* we need to replicate into each component: */
+ for (i = 0; i < 4; i++) {
+ if (dst->WriteMask & (1 << i)) {
+ instr = instr_create(ctx, 4, t->opc);
+ add_dst_reg(ctx, instr, dst, i);
+ add_src_reg(ctx, instr, src, src->SwizzleX);
+ }
+ }
+
+ put_dst(ctx, inst, dst);
+}
+
+static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
+#define INSTR(n, f, ...) \
+ [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
+
+ INSTR(MOV, instr_cat1),
+ INSTR(RCP, instr_cat4, .opc = OPC_RCP),
+ INSTR(RSQ, instr_cat4, .opc = OPC_RSQ),
+ INSTR(SQRT, instr_cat4, .opc = OPC_SQRT),
+ INSTR(MUL, instr_cat2, .opc = OPC_MUL_F),
+ INSTR(ADD, instr_cat2, .opc = OPC_ADD_F),
+ INSTR(SUB, instr_cat2, .opc = OPC_ADD_F),
+ INSTR(MIN, instr_cat2, .opc = OPC_MIN_F),
+ INSTR(MAX, instr_cat2, .opc = OPC_MAX_F),
+ INSTR(UADD, instr_cat2, .opc = OPC_ADD_U),
+ INSTR(IMIN, instr_cat2, .opc = OPC_MIN_S),
+ INSTR(UMIN, instr_cat2, .opc = OPC_MIN_U),
+ INSTR(IMAX, instr_cat2, .opc = OPC_MAX_S),
+ INSTR(UMAX, instr_cat2, .opc = OPC_MAX_U),
+ INSTR(AND, instr_cat2, .opc = OPC_AND_B),
+ INSTR(OR, instr_cat2, .opc = OPC_OR_B),
+ INSTR(NOT, instr_cat2, .opc = OPC_NOT_B),
+ INSTR(XOR, instr_cat2, .opc = OPC_XOR_B),
+ INSTR(UMUL, instr_cat2, .opc = OPC_MUL_U),
+ INSTR(SHL, instr_cat2, .opc = OPC_SHL_B),
+ INSTR(USHR, instr_cat2, .opc = OPC_SHR_B),
+ INSTR(ISHR, instr_cat2, .opc = OPC_ASHR_B),
+ INSTR(IABS, instr_cat2, .opc = OPC_ABSNEG_S),
+ INSTR(INEG, instr_cat2, .opc = OPC_ABSNEG_S),
+ INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
+ INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F),
+ INSTR(CLAMP, trans_clamp),
+ INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F),
+ INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F),
+ INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F),
+ INSTR(CEIL, instr_cat2, .opc = OPC_CEIL_F),
+ INSTR(ARL, trans_arl),
+ INSTR(EX2, instr_cat4, .opc = OPC_EXP2),
+ INSTR(LG2, instr_cat4, .opc = OPC_LOG2),
+ INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F),
+ INSTR(COS, instr_cat4, .opc = OPC_COS),
+ INSTR(SIN, instr_cat4, .opc = OPC_SIN),
+ INSTR(TEX, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX),
+ INSTR(TXP, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP),
+ INSTR(SGT, trans_cmp),
+ INSTR(SLT, trans_cmp),
+ INSTR(FSLT, trans_cmp),
+ INSTR(SGE, trans_cmp),
+ INSTR(FSGE, trans_cmp),
+ INSTR(SLE, trans_cmp),
+ INSTR(SNE, trans_cmp),
+ INSTR(FSNE, trans_cmp),
+ INSTR(SEQ, trans_cmp),
+ INSTR(FSEQ, trans_cmp),
+ INSTR(CMP, trans_cmp),
+ INSTR(USNE, trans_icmp, .opc = OPC_CMPS_U),
+ INSTR(USEQ, trans_icmp, .opc = OPC_CMPS_U),
+ INSTR(ISGE, trans_icmp, .opc = OPC_CMPS_S),
+ INSTR(USGE, trans_icmp, .opc = OPC_CMPS_U),
+ INSTR(ISLT, trans_icmp, .opc = OPC_CMPS_S),
+ INSTR(USLT, trans_icmp, .opc = OPC_CMPS_U),
+ INSTR(UCMP, trans_icmp, .opc = OPC_CMPS_U),
+ INSTR(IF, trans_if),
+ INSTR(UIF, trans_if),
+ INSTR(ELSE, trans_else),
+ INSTR(ENDIF, trans_endif),
+ INSTR(END, instr_cat0, .opc = OPC_END),
+ INSTR(KILL, trans_kill, .opc = OPC_KILL),
+ INSTR(KILL_IF, trans_killif, .opc = OPC_KILL),
+ INSTR(I2F, trans_cov),
+ INSTR(U2F, trans_cov),
+ INSTR(F2I, trans_cov),
+ INSTR(F2U, trans_cov),
+};
+
+static ir3_semantic
+decl_semantic(const struct tgsi_declaration_semantic *sem)
+{
+ return ir3_semantic_name(sem->Name, sem->Index);
+}
+
+static struct ir3_instruction *
+decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid,
+ unsigned j, unsigned inloc)
+{
+ struct ir3_instruction *instr;
+ struct ir3_register *src;
+
+ /* bary.f dst, #inloc, r0.x */
+ instr = instr_create(ctx, 2, OPC_BARY_F);
+ ir3_reg_create(instr, regid, 0); /* dummy dst */
+ ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
+ src = ir3_reg_create(instr, 0, IR3_REG_SSA);
+ src->wrmask = 0x3;
+ src->instr = ctx->frag_pos;
+
+ return instr;
+}
+
+/* TGSI_SEMANTIC_POSITION
+ * """"""""""""""""""""""
+ *
+ * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
+ * fragment shader input contains the fragment's window position. The X
+ * component starts at zero and always increases from left to right.
+ * The Y component starts at zero and always increases but Y=0 may either
+ * indicate the top of the window or the bottom depending on the fragment
+ * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
+ * The Z coordinate ranges from 0 to 1 to represent depth from the front
+ * to the back of the Z buffer. The W component contains the reciprocal
+ * of the interpolated vertex position W component.
+ */
+static struct ir3_instruction *
+decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid,
+ unsigned j)
+{
+ struct ir3_instruction *instr, *src;
+
+ compile_assert(ctx, !ctx->frag_coord[j]);
+
+ ctx->frag_coord[j] = create_input(ctx->block, NULL, 0);
+
+ switch (j) {
+ case 0: /* .x */
+ case 1: /* .y */
+ /* for frag_coord, we get unsigned values.. we need
+ * to subtract (integer) 8 and divide by 16 (right-
+ * shift by 4) then convert to float:
+ */
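+			/* (presumably the hw gives the position in 1/16th
+			 * pixel fixed point with a half-pixel offset, so
+			 * (u - 8) >> 4 recovers the integer pixel coord)
+			 */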
+
+ /* add.s tmp, src, -8 */
+ instr = instr_create(ctx, 2, OPC_ADD_S);
+ ir3_reg_create(instr, regid, 0); /* dummy dst */
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j];
+ ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8;
+ src = instr;
+
+ /* shr.b tmp, tmp, 4 */
+ instr = instr_create(ctx, 2, OPC_SHR_B);
+ ir3_reg_create(instr, regid, 0); /* dummy dst */
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+ ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
+ src = instr;
+
+ /* mov.u32f32 dst, tmp */
+ instr = instr_create(ctx, 1, 0);
+ instr->cat1.src_type = TYPE_U32;
+ instr->cat1.dst_type = TYPE_F32;
+ ir3_reg_create(instr, regid, 0); /* dummy dst */
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+
+ break;
+ case 2: /* .z */
+ case 3: /* .w */
+ /* seems that we can use these as-is: */
+ instr = ctx->frag_coord[j];
+ break;
+ default:
+ compile_error(ctx, "invalid channel\n");
+ instr = create_immed(ctx, 0.0);
+ break;
+ }
+
+ return instr;
+}
+
+/* TGSI_SEMANTIC_FACE
+ * """"""""""""""""""
+ *
+ * This label applies to fragment shader inputs only and indicates that
+ * the register contains front/back-face information of the form (F, 0,
+ * 0, 1). The first component will be positive when the fragment belongs
+ * to a front-facing polygon, and negative when the fragment belongs to a
+ * back-facing polygon.
+ */
+static struct ir3_instruction *
+decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid,
+ unsigned j)
+{
+ struct ir3_instruction *instr, *src;
+
+ switch (j) {
+ case 0: /* .x */
+ compile_assert(ctx, !ctx->frag_face);
+
+ ctx->frag_face = create_input(ctx->block, NULL, 0);
+
+ /* for faceness, we always get -1 or 0 (int).. but TGSI expects
+ * positive vs negative float.. and piglit further seems to
+ * expect -1.0 or 1.0:
+ *
+ * mul.s tmp, hr0.x, 2
+ * add.s tmp, tmp, 1
+ * mov.s16f32, dst, tmp
+ *
+ */
+
+ instr = instr_create(ctx, 2, OPC_MUL_S);
+ ir3_reg_create(instr, regid, 0); /* dummy dst */
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face;
+ ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
+ src = instr;
+
+ instr = instr_create(ctx, 2, OPC_ADD_S);
+ ir3_reg_create(instr, regid, 0); /* dummy dst */
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+ ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
+ src = instr;
+
+ instr = instr_create(ctx, 1, 0); /* mov */
+ instr->cat1.src_type = TYPE_S32;
+ instr->cat1.dst_type = TYPE_F32;
+ ir3_reg_create(instr, regid, 0); /* dummy dst */
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+
+ break;
+ case 1: /* .y */
+ case 2: /* .z */
+ instr = create_immed(ctx, 0.0);
+ break;
+ case 3: /* .w */
+ instr = create_immed(ctx, 1.0);
+ break;
+ default:
+ compile_error(ctx, "invalid channel\n");
+ instr = create_immed(ctx, 0.0);
+ break;
+ }
+
+ return instr;
+}
+
+static void
+decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
+{
+ struct ir3_shader_variant *so = ctx->so;
+ unsigned name = decl->Semantic.Name;
+ unsigned i;
+
+ /* I don't think we should get frag shader input without
+ * semantic info? Otherwise how do inputs get linked to
+ * vert outputs?
+ */
+ compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
+ decl->Declaration.Semantic);
+
+ for (i = decl->Range.First; i <= decl->Range.Last; i++) {
+ unsigned n = so->inputs_count++;
+ unsigned r = regid(i, 0);
+ unsigned ncomp, j;
+
+ /* we'll figure out the actual components used after scheduling */
+ ncomp = 4;
+
+ DBG("decl in -> r%d", i);
+
+ compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
+
+ so->inputs[n].semantic = decl_semantic(&decl->Semantic);
+ so->inputs[n].compmask = (1 << ncomp) - 1;
+ so->inputs[n].regid = r;
+ so->inputs[n].inloc = ctx->next_inloc;
+
+ for (j = 0; j < ncomp; j++) {
+ struct ir3_instruction *instr = NULL;
+
+ if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+ /* for fragment shaders, POSITION and FACE are handled
+ * specially, not using normal varying / bary.f
+ */
+ if (name == TGSI_SEMANTIC_POSITION) {
+ so->inputs[n].bary = false;
+ so->frag_coord = true;
+ instr = decl_in_frag_coord(ctx, r + j, j);
+ } else if (name == TGSI_SEMANTIC_FACE) {
+ so->inputs[n].bary = false;
+ so->frag_face = true;
+ instr = decl_in_frag_face(ctx, r + j, j);
+ } else {
+ so->inputs[n].bary = true;
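+					/* note: inloc's appear to be assigned
+					 * starting from 8 (see compile_init(),
+					 * which starts next_inloc at 8), while
+					 * bary.f wants a zero based varying
+					 * location, hence the '- 8':
+					 */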
+ instr = decl_in_frag_bary(ctx, r + j, j,
+ so->inputs[n].inloc + j - 8);
+ }
+ } else {
+ instr = create_input(ctx->block, NULL, (i * 4) + j);
+ }
+
+ ctx->block->inputs[(i * 4) + j] = instr;
+ }
+
+ if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) {
+ ctx->next_inloc += ncomp;
+ so->total_in += ncomp;
+ }
+ }
+}
+
+static void
+decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
+{
+ struct ir3_shader_variant *so = ctx->so;
+ unsigned comp = 0;
+ unsigned name = decl->Semantic.Name;
+ unsigned i;
+
+ compile_assert(ctx, decl->Declaration.Semantic);
+
+ DBG("decl out[%d] -> r%d", name, decl->Range.First);
+
+ if (ctx->type == TGSI_PROCESSOR_VERTEX) {
+ switch (name) {
+ case TGSI_SEMANTIC_POSITION:
+ so->writes_pos = true;
+ break;
+ case TGSI_SEMANTIC_PSIZE:
+ so->writes_psize = true;
+ break;
+ case TGSI_SEMANTIC_COLOR:
+ case TGSI_SEMANTIC_BCOLOR:
+ case TGSI_SEMANTIC_GENERIC:
+ case TGSI_SEMANTIC_FOG:
+ case TGSI_SEMANTIC_TEXCOORD:
+ break;
+ default:
+ compile_error(ctx, "unknown VS semantic name: %s\n",
+ tgsi_semantic_names[name]);
+ }
+ } else {
+ switch (name) {
+ case TGSI_SEMANTIC_POSITION:
+ comp = 2; /* tgsi will write to .z component */
+ so->writes_pos = true;
+ break;
+ case TGSI_SEMANTIC_COLOR:
+ break;
+ default:
+ compile_error(ctx, "unknown FS semantic name: %s\n",
+ tgsi_semantic_names[name]);
+ }
+ }
+
+ for (i = decl->Range.First; i <= decl->Range.Last; i++) {
+ unsigned n = so->outputs_count++;
+ unsigned ncomp, j;
+
+ ncomp = 4;
+
+ compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
+
+ so->outputs[n].semantic = decl_semantic(&decl->Semantic);
+ so->outputs[n].regid = regid(i, comp);
+
+		/* avoid undefined outputs: stick in a dummy imm{0.0}, which
+		 * will be over-written if the output is actually assigned:
+		 */
+ for (j = 0; j < ncomp; j++)
+ ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0);
+ }
+}
+
+/* from TGSI perspective, we actually have inputs. But most of the "inputs"
+ * for a fragment shader are just bary.f instructions. The *actual* inputs
+ * from the hw perspective are the frag_pos and optionally frag_coord and
+ * frag_face.
+ */
+static void
+fixup_frag_inputs(struct ir3_compile_context *ctx)
+{
+ struct ir3_shader_variant *so = ctx->so;
+ struct ir3_block *block = ctx->block;
+ struct ir3_instruction **inputs;
+ struct ir3_instruction *instr;
+ int n, regid = 0;
+
+ block->ninputs = 0;
+
+ n = 4; /* always have frag_pos */
+ n += COND(so->frag_face, 4);
+ n += COND(so->frag_coord, 4);
+
+ inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));
+
+ if (so->frag_face) {
+ /* this ultimately gets assigned to hr0.x so doesn't conflict
+ * with frag_coord/frag_pos..
+ */
+ inputs[block->ninputs++] = ctx->frag_face;
+ ctx->frag_face->regs[0]->num = 0;
+
+ /* remaining channels not used, but let's avoid confusing
+ * other parts that expect inputs to come in groups of vec4
+ */
+ inputs[block->ninputs++] = NULL;
+ inputs[block->ninputs++] = NULL;
+ inputs[block->ninputs++] = NULL;
+ }
+
+ /* since we don't know where to set the regid for frag_coord,
+ * we have to use r0.x for it. But we don't want to *always*
+ * use r1.x for frag_pos as that could increase the register
+ * footprint on simple shaders:
+ */
+ if (so->frag_coord) {
+ ctx->frag_coord[0]->regs[0]->num = regid++;
+ ctx->frag_coord[1]->regs[0]->num = regid++;
+ ctx->frag_coord[2]->regs[0]->num = regid++;
+ ctx->frag_coord[3]->regs[0]->num = regid++;
+
+ inputs[block->ninputs++] = ctx->frag_coord[0];
+ inputs[block->ninputs++] = ctx->frag_coord[1];
+ inputs[block->ninputs++] = ctx->frag_coord[2];
+ inputs[block->ninputs++] = ctx->frag_coord[3];
+ }
+
+ /* we always have frag_pos: */
+ so->pos_regid = regid;
+
+ /* r0.x */
+ instr = create_input(block, NULL, block->ninputs);
+ instr->regs[0]->num = regid++;
+ inputs[block->ninputs++] = instr;
+ ctx->frag_pos->regs[1]->instr = instr;
+
+ /* r0.y */
+ instr = create_input(block, NULL, block->ninputs);
+ instr->regs[0]->num = regid++;
+ inputs[block->ninputs++] = instr;
+ ctx->frag_pos->regs[2]->instr = instr;
+
+ block->inputs = inputs;
+}
+
+static void
+compile_instructions(struct ir3_compile_context *ctx)
+{
+ push_block(ctx);
+
+ /* for fragment shader, we have a single input register (usually
+ * r0.xy) which is used as the base for bary.f varying fetch instrs:
+ */
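+	/* (its two srcs get filled in later, in fixup_frag_inputs()) */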
+ if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+ struct ir3_instruction *instr;
+ instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
+ ir3_reg_create(instr, 0, 0);
+ ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.x */
+ ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.y */
+ ctx->frag_pos = instr;
+ }
+
+ while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
+ tgsi_parse_token(&ctx->parser);
+
+ switch (ctx->parser.FullToken.Token.Type) {
+ case TGSI_TOKEN_TYPE_DECLARATION: {
+ struct tgsi_full_declaration *decl =
+ &ctx->parser.FullToken.FullDeclaration;
+ if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
+ decl_out(ctx, decl);
+ } else if (decl->Declaration.File == TGSI_FILE_INPUT) {
+ decl_in(ctx, decl);
+ }
+ break;
+ }
+ case TGSI_TOKEN_TYPE_IMMEDIATE: {
+ /* TODO: if we know the immediate is small enough, and only
+ * used with instructions that can embed an immediate, we
+ * can skip this:
+ */
+ struct tgsi_full_immediate *imm =
+ &ctx->parser.FullToken.FullImmediate;
+ unsigned n = ctx->so->immediates_count++;
+ compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates));
+ memcpy(ctx->so->immediates[n].val, imm->u, 16);
+ break;
+ }
+ case TGSI_TOKEN_TYPE_INSTRUCTION: {
+ struct tgsi_full_instruction *inst =
+ &ctx->parser.FullToken.FullInstruction;
+ unsigned opc = inst->Instruction.Opcode;
+ const struct instr_translater *t = &translaters[opc];
+
+ if (t->fxn) {
+ t->fxn(t, ctx, inst);
+ ctx->num_internal_temps = 0;
+ } else {
+ compile_error(ctx, "unknown TGSI opc: %s\n",
+ tgsi_get_opcode_name(opc));
+ }
+
+ switch (inst->Instruction.Saturate) {
+ case TGSI_SAT_ZERO_ONE:
+ create_clamp_imm(ctx, &inst->Dst[0].Register,
+ fui(0.0), fui(1.0));
+ break;
+ case TGSI_SAT_MINUS_PLUS_ONE:
+ create_clamp_imm(ctx, &inst->Dst[0].Register,
+ fui(-1.0), fui(1.0));
+ break;
+ }
+
+ instr_finish(ctx);
+
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
+static void
+compile_dump(struct ir3_compile_context *ctx)
+{
+ const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
+ static unsigned n = 0;
+ char fname[16];
+ FILE *f;
+ snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
+ f = fopen(fname, "w");
+ if (!f)
+ return;
+ ir3_block_depth(ctx->block);
+ ir3_dump(ctx->ir, name, ctx->block, f);
+ fclose(f);
+}
+
+int
+ir3_compile_shader(struct ir3_shader_variant *so,
+ const struct tgsi_token *tokens, struct ir3_shader_key key)
+{
+ struct ir3_compile_context ctx;
+ struct ir3_block *block;
+ struct ir3_instruction **inputs;
+ unsigned i, j, actual_in;
+ int ret = 0;
+
+ assert(!so->ir);
+
+ so->ir = ir3_create();
+
+ assert(so->ir);
+
+ if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) {
+ ret = -1;
+ goto out;
+ }
+
+ compile_instructions(&ctx);
+
+ block = ctx.block;
+
+ /* keep track of the inputs from TGSI perspective.. */
+ inputs = block->inputs;
+
+ /* but fixup actual inputs for frag shader: */
+ if (ctx.type == TGSI_PROCESSOR_FRAGMENT)
+ fixup_frag_inputs(&ctx);
+
+ /* at this point, for binning pass, throw away unneeded outputs: */
+ if (key.binning_pass) {
+ for (i = 0, j = 0; i < so->outputs_count; i++) {
+ unsigned name = sem2name(so->outputs[i].semantic);
+			unsigned idx = sem2idx(so->outputs[i].semantic);
+
+ /* throw away everything but first position/psize */
+ if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
+ (name == TGSI_SEMANTIC_PSIZE))) {
+ if (i != j) {
+ so->outputs[j] = so->outputs[i];
+ block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
+ block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
+ block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
+ block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
+ }
+ j++;
+ }
+ }
+ so->outputs_count = j;
+ block->noutputs = j * 4;
+ }
+
+ /* at this point, we want the kill's in the outputs array too,
+ * so that they get scheduled (since they have no dst).. we've
+ * already ensured that the array is big enough in push_block():
+ */
+ if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
+ for (i = 0; i < ctx.kill_count; i++)
+ block->outputs[block->noutputs++] = ctx.kill[i];
+ }
+
+ if (fd_mesa_debug & FD_DBG_OPTDUMP)
+ compile_dump(&ctx);
+
+ ret = ir3_block_flatten(block);
+ if (ret < 0)
+ goto out;
+ if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP))
+ compile_dump(&ctx);
+
+ ir3_block_cp(block);
+
+ if (fd_mesa_debug & FD_DBG_OPTDUMP)
+ compile_dump(&ctx);
+
+ ir3_block_depth(block);
+
+ if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+ printf("AFTER DEPTH:\n");
+ ir3_dump_instr_list(block->head);
+ }
+
+ ir3_block_sched(block);
+
+ if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+ printf("AFTER SCHED:\n");
+ ir3_dump_instr_list(block->head);
+ }
+
+ ret = ir3_block_ra(block, so->type, key.half_precision,
+ so->frag_coord, so->frag_face, &so->has_samp);
+ if (ret)
+ goto out;
+
+ if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+ printf("AFTER RA:\n");
+ ir3_dump_instr_list(block->head);
+ }
+
+ /* fixup input/outputs: */
+ for (i = 0; i < so->outputs_count; i++) {
+ so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
+ /* preserve hack for depth output.. tgsi writes depth to .z,
+ * but what we give the hw is the scalar register:
+ */
+ if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
+ (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
+ so->outputs[i].regid += 2;
+ }
+ /* Note that some or all channels of an input may be unused: */
+ actual_in = 0;
+ for (i = 0; i < so->inputs_count; i++) {
+ unsigned j, regid = ~0, compmask = 0;
+ so->inputs[i].ncomp = 0;
+ for (j = 0; j < 4; j++) {
+ struct ir3_instruction *in = inputs[(i*4) + j];
+ if (in) {
+ compmask |= (1 << j);
+ regid = in->regs[0]->num - j;
+ actual_in++;
+ so->inputs[i].ncomp++;
+ }
+ }
+ so->inputs[i].regid = regid;
+ so->inputs[i].compmask = compmask;
+ }
+
+	/* fragment shader always gets full vec4's even if it doesn't
+	 * fetch all components, but for vertex shaders we need to update
+	 * with the actual number of components fetched, otherwise things
+	 * will hang due to mismatch between VFD_DECODE's and
+	 * TOTALATTRTOVS
+	 */
+ if (so->type == SHADER_VERTEX)
+ so->total_in = actual_in;
+
+out:
+ if (ret) {
+ ir3_destroy(so->ir);
+ so->ir = NULL;
+ }
+ compile_free(&ctx);
+
+ return ret;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
new file mode 100644
index 00000000000..9b11b3d8abf
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
@@ -0,0 +1,42 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#ifndef FD3_COMPILER_H_
+#define FD3_COMPILER_H_
+
+#include "ir3_shader.h"
+
+
+int ir3_compile_shader(struct ir3_shader_variant *so,
+ const struct tgsi_token *tokens,
+ struct ir3_shader_key key);
+int ir3_compile_shader_old(struct ir3_shader_variant *so,
+ const struct tgsi_token *tokens,
+ struct ir3_shader_key key);
+
+#endif /* FD3_COMPILER_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c
new file mode 100644
index 00000000000..1e1ca7ad813
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c
@@ -0,0 +1,1524 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include <stdarg.h>
+
+#include "pipe/p_state.h"
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_strings.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_scan.h"
+
+#include "freedreno_lowering.h"
+#include "freedreno_util.h"
+
+#include "ir3_compiler.h"
+#include "ir3_shader.h"
+
+#include "instr-a3xx.h"
+#include "ir3.h"
+
+
+struct ir3_compile_context {
+ const struct tgsi_token *tokens;
+ bool free_tokens;
+ struct ir3 *ir;
+ struct ir3_block *block;
+ struct ir3_shader_variant *so;
+
+ struct tgsi_parse_context parser;
+ unsigned type;
+
+ struct tgsi_shader_info info;
+
+ /* last input dst (for setting (ei) flag): */
+ struct ir3_register *last_input;
+
+ /* last instruction with relative addressing: */
+ struct ir3_instruction *last_rel;
+
+ /* for calculating input/output positions/linkages: */
+ unsigned next_inloc;
+
+ unsigned num_internal_temps;
+ struct tgsi_src_register internal_temps[6];
+
+ /* track registers which need to synchronize w/ "complex alu" cat3
+ * instruction pipeline:
+ */
+ regmask_t needs_ss;
+
+ /* track registers which need to synchronize with texture fetch
+ * pipeline:
+ */
+ regmask_t needs_sy;
+
+ /* inputs start at r0, temporaries start after last input, and
+ * outputs start after last temporary.
+ *
+ * We could be more clever, because this is not a hw restriction,
+ * but probably best just to implement an optimizing pass to
+ * reduce the # of registers used and get rid of redundant mov's
+ * (to output register).
+ */
+ unsigned base_reg[TGSI_FILE_COUNT];
+
+ /* idx/slot for last compiler generated immediate */
+ unsigned immediate_idx;
+
+	/* stack of branch instructions that start (potentially nested)
+	 * branch regions, so that we can fix up the branch target on the
+	 * corresponding END instruction:
+	 */
+ struct ir3_instruction *branch[16];
+ unsigned int branch_count;
+
+ /* used when dst is same as one of the src, to avoid overwriting a
+ * src element before the remaining scalar instructions that make
+ * up the vector operation
+ */
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register *tmp_src;
+};
+
+
+static void vectorize(struct ir3_compile_context *ctx,
+ struct ir3_instruction *instr, struct tgsi_dst_register *dst,
+ int nsrcs, ...);
+static void create_mov(struct ir3_compile_context *ctx,
+ struct tgsi_dst_register *dst, struct tgsi_src_register *src);
+
+static unsigned
+compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
+ const struct tgsi_token *tokens)
+{
+ unsigned ret, base = 0;
+ struct tgsi_shader_info *info = &ctx->info;
+ const struct fd_lowering_config lconfig = {
+ .color_two_side = so->key.color_two_side,
+ .lower_DST = true,
+ .lower_XPD = true,
+ .lower_SCS = true,
+ .lower_LRP = true,
+ .lower_FRC = true,
+ .lower_POW = true,
+ .lower_LIT = true,
+ .lower_EXP = true,
+ .lower_LOG = true,
+ .lower_DP4 = true,
+ .lower_DP3 = true,
+ .lower_DPH = true,
+ .lower_DP2 = true,
+ .lower_DP2A = true,
+ };
+
+ ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info);
+ ctx->free_tokens = !!ctx->tokens;
+ if (!ctx->tokens) {
+ /* no lowering */
+ ctx->tokens = tokens;
+ }
+ ctx->ir = so->ir;
+ ctx->block = ir3_block_create(ctx->ir, 0, 0, 0);
+ ctx->so = so;
+ ctx->last_input = NULL;
+ ctx->last_rel = NULL;
+ ctx->next_inloc = 8;
+ ctx->num_internal_temps = 0;
+ ctx->branch_count = 0;
+
+ regmask_init(&ctx->needs_ss);
+ regmask_init(&ctx->needs_sy);
+ memset(ctx->base_reg, 0, sizeof(ctx->base_reg));
+
+ /* Immediates go after constants: */
+ ctx->base_reg[TGSI_FILE_CONSTANT] = 0;
+ ctx->base_reg[TGSI_FILE_IMMEDIATE] =
+ info->file_max[TGSI_FILE_CONSTANT] + 1;
+
+ /* if full precision and fragment shader, don't clobber
+ * r0.x w/ bary fetch:
+ */
+ if ((so->type == SHADER_FRAGMENT) && !so->key.half_precision)
+ base = 1;
+
+ /* Temporaries after outputs after inputs: */
+ ctx->base_reg[TGSI_FILE_INPUT] = base;
+ ctx->base_reg[TGSI_FILE_OUTPUT] = base +
+ info->file_max[TGSI_FILE_INPUT] + 1;
+ ctx->base_reg[TGSI_FILE_TEMPORARY] = base +
+ info->file_max[TGSI_FILE_INPUT] + 1 +
+ info->file_max[TGSI_FILE_OUTPUT] + 1;
+
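+	/* note: immediate_idx counts scalar (not vec4) slots, hence the
+	 * factor of four below; get_immediate() maps it back to an
+	 * index plus swizzle:
+	 */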
+ so->first_immediate = ctx->base_reg[TGSI_FILE_IMMEDIATE];
+ ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);
+
+ ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
+ if (ret != TGSI_PARSE_OK)
+ return ret;
+
+ ctx->type = ctx->parser.FullHeader.Processor.Processor;
+
+ return ret;
+}
+
+static void
+compile_error(struct ir3_compile_context *ctx, const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ _debug_vprintf(format, ap);
+ va_end(ap);
+ tgsi_dump(ctx->tokens, 0);
+ debug_assert(0);
+}
+
+#define compile_assert(ctx, cond) do { \
+ if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
+ } while (0)
+
+static void
+compile_free(struct ir3_compile_context *ctx)
+{
+ if (ctx->free_tokens)
+ free((void *)ctx->tokens);
+ tgsi_parse_free(&ctx->parser);
+}
+
+struct instr_translater {
+ void (*fxn)(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst);
+ unsigned tgsi_opc;
+ opc_t opc;
+ opc_t hopc; /* opc to use for half_precision mode, if different */
+ unsigned arg;
+};
+
+static void
+handle_last_rel(struct ir3_compile_context *ctx)
+{
+ if (ctx->last_rel) {
+ ctx->last_rel->flags |= IR3_INSTR_UL;
+ ctx->last_rel = NULL;
+ }
+}
+
+static struct ir3_instruction *
+instr_create(struct ir3_compile_context *ctx, int category, opc_t opc)
+{
+ return ir3_instr_create(ctx->block, category, opc);
+}
+
+static void
+add_nop(struct ir3_compile_context *ctx, unsigned count)
+{
+ while (count-- > 0)
+ instr_create(ctx, 0, OPC_NOP);
+}
+
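+/* compute the (ss)/(sy) sync flags needed on an instruction reading
+ * 'reg', clearing the pending sync state once it is consumed:
+ */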
+static unsigned
+src_flags(struct ir3_compile_context *ctx, struct ir3_register *reg)
+{
+ unsigned flags = 0;
+
+ if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+ return flags;
+
+ if (regmask_get(&ctx->needs_ss, reg)) {
+ flags |= IR3_INSTR_SS;
+ regmask_init(&ctx->needs_ss);
+ }
+
+ if (regmask_get(&ctx->needs_sy, reg)) {
+ flags |= IR3_INSTR_SY;
+ regmask_init(&ctx->needs_sy);
+ }
+
+ return flags;
+}
+
+static struct ir3_register *
+add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+ const struct tgsi_dst_register *dst, unsigned chan)
+{
+ unsigned flags = 0, num = 0;
+ struct ir3_register *reg;
+
+ switch (dst->File) {
+ case TGSI_FILE_OUTPUT:
+ case TGSI_FILE_TEMPORARY:
+ num = dst->Index + ctx->base_reg[dst->File];
+ break;
+ case TGSI_FILE_ADDRESS:
+ num = REG_A0;
+ break;
+ default:
+ compile_error(ctx, "unsupported dst register file: %s\n",
+ tgsi_file_name(dst->File));
+ break;
+ }
+
+ if (dst->Indirect)
+ flags |= IR3_REG_RELATIV;
+ if (ctx->so->key.half_precision)
+ flags |= IR3_REG_HALF;
+
+ reg = ir3_reg_create(instr, regid(num, chan), flags);
+
+ if (dst->Indirect)
+ ctx->last_rel = instr;
+
+ return reg;
+}
+
+static struct ir3_register *
+add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+ const struct tgsi_src_register *src, unsigned chan)
+{
+ unsigned flags = 0, num = 0;
+ struct ir3_register *reg;
+
+ /* TODO we need to use a mov to temp for const >= 64.. or maybe
+ * we could use relative addressing..
+ */
+ compile_assert(ctx, src->Index < 64);
+
+ switch (src->File) {
+ case TGSI_FILE_IMMEDIATE:
+ /* TODO if possible, use actual immediate instead of const.. but
+ * TGSI has vec4 immediates, we can only embed scalar (of limited
+ * size, depending on instruction..)
+ */
+ case TGSI_FILE_CONSTANT:
+ flags |= IR3_REG_CONST;
+ num = src->Index + ctx->base_reg[src->File];
+ break;
+ case TGSI_FILE_OUTPUT:
+ /* NOTE: we should only end up w/ OUTPUT file for things like
+ * clamp()'ing saturated dst instructions
+ */
+ case TGSI_FILE_INPUT:
+ case TGSI_FILE_TEMPORARY:
+ num = src->Index + ctx->base_reg[src->File];
+ break;
+ default:
+ compile_error(ctx, "unsupported src register file: %s\n",
+ tgsi_file_name(src->File));
+ break;
+ }
+
+ if (src->Absolute)
+ flags |= IR3_REG_ABS;
+ if (src->Negate)
+ flags |= IR3_REG_NEGATE;
+ if (src->Indirect)
+ flags |= IR3_REG_RELATIV;
+ if (ctx->so->key.half_precision)
+ flags |= IR3_REG_HALF;
+
+ reg = ir3_reg_create(instr, regid(num, chan), flags);
+
+ if (src->Indirect)
+ ctx->last_rel = instr;
+
+ instr->flags |= src_flags(ctx, reg);
+
+ return reg;
+}
+
+static void
+src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
+{
+ src->File = dst->File;
+ src->Indirect = dst->Indirect;
+ src->Dimension = dst->Dimension;
+ src->Index = dst->Index;
+ src->Absolute = 0;
+ src->Negate = 0;
+ src->SwizzleX = TGSI_SWIZZLE_X;
+ src->SwizzleY = TGSI_SWIZZLE_Y;
+ src->SwizzleZ = TGSI_SWIZZLE_Z;
+ src->SwizzleW = TGSI_SWIZZLE_W;
+}
+
+/* Get internal-temp src/dst to use for a sequence of instructions
+ * generated by a single TGSI op.
+ */
+static struct tgsi_src_register *
+get_internal_temp(struct ir3_compile_context *ctx,
+ struct tgsi_dst_register *tmp_dst)
+{
+ struct tgsi_src_register *tmp_src;
+ int n;
+
+ tmp_dst->File = TGSI_FILE_TEMPORARY;
+ tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
+ tmp_dst->Indirect = 0;
+ tmp_dst->Dimension = 0;
+
+ /* assign next temporary: */
+ n = ctx->num_internal_temps++;
+ compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
+ tmp_src = &ctx->internal_temps[n];
+
+ tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;
+
+ src_from_dst(tmp_src, tmp_dst);
+
+ return tmp_src;
+}
+
+/* Get internal half-precision temp src/dst to use for a sequence of
+ * instructions generated by a single TGSI op.
+ */
+static struct tgsi_src_register *
+get_internal_temp_hr(struct ir3_compile_context *ctx,
+ struct tgsi_dst_register *tmp_dst)
+{
+ struct tgsi_src_register *tmp_src;
+ int n;
+
+ if (ctx->so->key.half_precision)
+ return get_internal_temp(ctx, tmp_dst);
+
+ tmp_dst->File = TGSI_FILE_TEMPORARY;
+ tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
+ tmp_dst->Indirect = 0;
+ tmp_dst->Dimension = 0;
+
+ /* assign next temporary: */
+ n = ctx->num_internal_temps++;
+ compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
+ tmp_src = &ctx->internal_temps[n];
+
+ /* just use hr0 because no one else should be using half-
+ * precision regs:
+ */
+ tmp_dst->Index = 0;
+
+ src_from_dst(tmp_src, tmp_dst);
+
+ return tmp_src;
+}
+
+static inline bool
+is_const(struct tgsi_src_register *src)
+{
+ return (src->File == TGSI_FILE_CONSTANT) ||
+ (src->File == TGSI_FILE_IMMEDIATE);
+}
+
+static inline bool
+is_relative(struct tgsi_src_register *src)
+{
+ return src->Indirect;
+}
+
+static inline bool
+is_rel_or_const(struct tgsi_src_register *src)
+{
+ return is_relative(src) || is_const(src);
+}
+
+static type_t
+get_ftype(struct ir3_compile_context *ctx)
+{
+ return ctx->so->key.half_precision ? TYPE_F16 : TYPE_F32;
+}
+
+static type_t
+get_utype(struct ir3_compile_context *ctx)
+{
+ return ctx->so->key.half_precision ? TYPE_U16 : TYPE_U32;
+}
+
+static unsigned
+src_swiz(struct tgsi_src_register *src, int chan)
+{
+ switch (chan) {
+ case 0: return src->SwizzleX;
+ case 1: return src->SwizzleY;
+ case 2: return src->SwizzleZ;
+ case 3: return src->SwizzleW;
+ }
+ assert(0);
+ return 0;
+}
+
+/* for instructions that cannot take a const register as src, if needed
+ * generate a move to temporary gpr:
+ */
+static struct tgsi_src_register *
+get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
+{
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register *tmp_src;
+
+ compile_assert(ctx, is_rel_or_const(src));
+
+ tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+ create_mov(ctx, &tmp_dst, src);
+
+ return tmp_src;
+}
+
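+/* Look up (or allocate) a slot in the shader's immediate table for
+ * 'val'.  A slot already holding -val can be reused by setting the
+ * Negate bit on the returned src register:
+ */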
+static void
+get_immediate(struct ir3_compile_context *ctx,
+ struct tgsi_src_register *reg, uint32_t val)
+{
+ unsigned neg, swiz, idx, i;
+ /* actually maps 1:1 currently.. not sure if that is safe to rely on: */
+ static const unsigned swiz2tgsi[] = {
+ TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
+ };
+
+ for (i = 0; i < ctx->immediate_idx; i++) {
+ swiz = i % 4;
+ idx = i / 4;
+
+ if (ctx->so->immediates[idx].val[swiz] == val) {
+ neg = 0;
+ break;
+ }
+
+ if (ctx->so->immediates[idx].val[swiz] == -val) {
+ neg = 1;
+ break;
+ }
+ }
+
+ if (i == ctx->immediate_idx) {
+ /* need to generate a new immediate: */
+ swiz = i % 4;
+ idx = i / 4;
+ neg = 0;
+ ctx->so->immediates[idx].val[swiz] = val;
+ ctx->so->immediates_count = idx + 1;
+ ctx->immediate_idx++;
+ }
+
+ reg->File = TGSI_FILE_IMMEDIATE;
+ reg->Indirect = 0;
+ reg->Dimension = 0;
+ reg->Index = idx;
+ reg->Absolute = 0;
+ reg->Negate = neg;
+ reg->SwizzleX = swiz2tgsi[swiz];
+ reg->SwizzleY = swiz2tgsi[swiz];
+ reg->SwizzleZ = swiz2tgsi[swiz];
+ reg->SwizzleW = swiz2tgsi[swiz];
+}
+
+static void
+create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst,
+ struct tgsi_src_register *src)
+{
+ type_t type_mov = get_ftype(ctx);
+ unsigned i;
+
+ for (i = 0; i < 4; i++) {
+ /* move to destination: */
+ if (dst->WriteMask & (1 << i)) {
+ struct ir3_instruction *instr;
+
+ if (src->Absolute || src->Negate) {
+ /* can't have abs or neg on a mov instr, so use
+ * absneg.f instead to handle these cases:
+ */
+ instr = instr_create(ctx, 2, OPC_ABSNEG_F);
+ } else {
+ instr = instr_create(ctx, 1, 0);
+ instr->cat1.src_type = type_mov;
+ instr->cat1.dst_type = type_mov;
+ }
+
+ add_dst_reg(ctx, instr, dst, i);
+ add_src_reg(ctx, instr, src, src_swiz(src, i));
+ } else {
+ add_nop(ctx, 1);
+ }
+ }
+}
+
+static void
+create_clamp(struct ir3_compile_context *ctx,
+ struct tgsi_dst_register *dst, struct tgsi_src_register *val,
+ struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
+{
+ struct ir3_instruction *instr;
+
+ instr = instr_create(ctx, 2, OPC_MAX_F);
+ vectorize(ctx, instr, dst, 2, val, 0, minval, 0);
+
+ instr = instr_create(ctx, 2, OPC_MIN_F);
+ vectorize(ctx, instr, dst, 2, val, 0, maxval, 0);
+}
+
+static void
+create_clamp_imm(struct ir3_compile_context *ctx,
+ struct tgsi_dst_register *dst,
+ uint32_t minval, uint32_t maxval)
+{
+ struct tgsi_src_register minconst, maxconst;
+ struct tgsi_src_register src;
+
+ src_from_dst(&src, dst);
+
+ get_immediate(ctx, &minconst, minval);
+ get_immediate(ctx, &maxconst, maxval);
+
+ create_clamp(ctx, dst, &src, &minconst, &maxconst);
+}
+
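+/* If the dst register is also read as a src of the same instruction
+ * (with a non-identity swizzle), redirect the dst to an internal temp
+ * so the vectorized sequence doesn't clobber a src it still needs;
+ * put_dst() then mov's the temp back to the real dst:
+ */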
+static struct tgsi_dst_register *
+get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst)
+{
+ struct tgsi_dst_register *dst = &inst->Dst[0].Register;
+ unsigned i;
+ for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+ struct tgsi_src_register *src = &inst->Src[i].Register;
+ if ((src->File == dst->File) && (src->Index == dst->Index)) {
+ if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
+ (src->SwizzleX == TGSI_SWIZZLE_X) &&
+ (src->SwizzleY == TGSI_SWIZZLE_Y) &&
+ (src->SwizzleZ == TGSI_SWIZZLE_Z) &&
+ (src->SwizzleW == TGSI_SWIZZLE_W))
+ continue;
+ ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
+ ctx->tmp_dst.WriteMask = dst->WriteMask;
+ dst = &ctx->tmp_dst;
+ break;
+ }
+ }
+ return dst;
+}
+
+static void
+put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst,
+ struct tgsi_dst_register *dst)
+{
+ /* if necessary, add mov back into original dst: */
+ if (dst != &inst->Dst[0].Register) {
+ create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src);
+ }
+}
+
+/* helper to generate the necessary repeat and/or additional instructions
+ * to turn a scalar instruction into a vector operation:
+ */
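+/* (For example, a vec3 add becomes three scalar add.f instructions,
+ * one per channel enabled in the writemask, with src swizzles fixed
+ * up per channel, padded with nop's for the skipped channels.)
+ */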
+static void
+vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+ struct tgsi_dst_register *dst, int nsrcs, ...)
+{
+ va_list ap;
+ int i, j, n = 0;
+ bool indirect = dst->Indirect;
+
+ add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X);
+
+ va_start(ap, nsrcs);
+ for (j = 0; j < nsrcs; j++) {
+ struct tgsi_src_register *src =
+ va_arg(ap, struct tgsi_src_register *);
+ unsigned flags = va_arg(ap, unsigned);
+ struct ir3_register *reg;
+ if (flags & IR3_REG_IMMED) {
+ reg = ir3_reg_create(instr, 0, IR3_REG_IMMED);
+ /* this is an ugly cast.. should have put flags first! */
+ reg->iim_val = *(int *)&src;
+ } else {
+ reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X);
+ indirect |= src->Indirect;
+ }
+ reg->flags |= flags & ~IR3_REG_NEGATE;
+ if (flags & IR3_REG_NEGATE)
+ reg->flags ^= IR3_REG_NEGATE;
+ }
+ va_end(ap);
+
+ for (i = 0; i < 4; i++) {
+ if (dst->WriteMask & (1 << i)) {
+ struct ir3_instruction *cur;
+
+ if (n++ == 0) {
+ cur = instr;
+ } else {
+ cur = ir3_instr_clone(instr);
+ cur->flags &= ~(IR3_INSTR_SY | IR3_INSTR_SS | IR3_INSTR_JP);
+ }
+
+ /* fix-up dst register component: */
+ cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i);
+
+ /* fix-up src register component: */
+ va_start(ap, nsrcs);
+ for (j = 0; j < nsrcs; j++) {
+ struct tgsi_src_register *src =
+ va_arg(ap, struct tgsi_src_register *);
+ unsigned flags = va_arg(ap, unsigned);
+ if (!(flags & IR3_REG_IMMED)) {
+ cur->regs[j+1]->num =
+ regid(cur->regs[j+1]->num >> 2,
+ src_swiz(src, i));
+ cur->flags |= src_flags(ctx, cur->regs[j+1]);
+ }
+ }
+ va_end(ap);
+
+ if (indirect)
+ ctx->last_rel = cur;
+ }
+ }
+
+ /* pad w/ nop's.. at least until we are clever enough to
+ * figure out if we really need to..
+ */
+ add_nop(ctx, 4 - n);
+}
+
+/*
+ * Handlers for TGSI instructions which do not have a 1:1 mapping to
+ * native instructions:
+ */
+
+static void
+trans_clamp(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct tgsi_dst_register *dst = get_dst(ctx, inst);
+ struct tgsi_src_register *src0 = &inst->Src[0].Register;
+ struct tgsi_src_register *src1 = &inst->Src[1].Register;
+ struct tgsi_src_register *src2 = &inst->Src[2].Register;
+
+ create_clamp(ctx, dst, src0, src1, src2);
+
+ put_dst(ctx, inst, dst);
+}
+
+/* ARL(x) = x, but mova from hrN.x to a0.. */
+static void
+trans_arl(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct ir3_instruction *instr;
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register *tmp_src;
+ struct tgsi_dst_register *dst = &inst->Dst[0].Register;
+ struct tgsi_src_register *src = &inst->Src[0].Register;
+ unsigned chan = src->SwizzleX;
+ compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);
+
+ handle_last_rel(ctx);
+
+ tmp_src = get_internal_temp_hr(ctx, &tmp_dst);
+
+ /* cov.{f32,f16}s16 Rtmp, Rsrc */
+ instr = instr_create(ctx, 1, 0);
+ instr->cat1.src_type = get_ftype(ctx);
+ instr->cat1.dst_type = TYPE_S16;
+ add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
+ add_src_reg(ctx, instr, src, chan);
+
+ add_nop(ctx, 3);
+
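+	/* (the shift left by two is presumably because a0 indexes in
+	 * scalar components, while the TGSI address is in units of
+	 * vec4 registers)
+	 */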
+ /* shl.b Rtmp, Rtmp, 2 */
+ instr = instr_create(ctx, 2, OPC_SHL_B);
+ add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
+ add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
+ ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
+
+ add_nop(ctx, 3);
+
+ /* mova a0, Rtmp */
+ instr = instr_create(ctx, 1, 0);
+ instr->cat1.src_type = TYPE_S16;
+ instr->cat1.dst_type = TYPE_S16;
+ add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
+ add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
+
+ /* need to ensure 5 instr slots before a0 is used: */
+ add_nop(ctx, 6);
+}
+
+/* texture fetch/sample instructions: */
+static void
+trans_samp(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct ir3_register *r;
+ struct ir3_instruction *instr;
+ struct tgsi_src_register *coord = &inst->Src[0].Register;
+ struct tgsi_src_register *samp = &inst->Src[1].Register;
+ unsigned tex = inst->Texture.Texture;
+ int8_t *order;
+ unsigned i, flags = 0, src_wrmask;
+ bool needs_mov = false;
+
+ switch (t->arg) {
+ case TGSI_OPCODE_TEX:
+ if (tex == TGSI_TEXTURE_2D) {
+ order = (int8_t[4]){ 0, 1, -1, -1 };
+ src_wrmask = TGSI_WRITEMASK_XY;
+ } else {
+ order = (int8_t[4]){ 0, 1, 2, -1 };
+ src_wrmask = TGSI_WRITEMASK_XYZ;
+ }
+ break;
+ case TGSI_OPCODE_TXP:
+ if (tex == TGSI_TEXTURE_2D) {
+ order = (int8_t[4]){ 0, 1, 3, -1 };
+ src_wrmask = TGSI_WRITEMASK_XYZ;
+ } else {
+ order = (int8_t[4]){ 0, 1, 2, 3 };
+ src_wrmask = TGSI_WRITEMASK_XYZW;
+ }
+ flags |= IR3_INSTR_P;
+ break;
+ default:
+ compile_assert(ctx, 0);
+ break;
+ }
+
+ if ((tex == TGSI_TEXTURE_3D) || (tex == TGSI_TEXTURE_CUBE)) {
+ add_nop(ctx, 3);
+ flags |= IR3_INSTR_3D;
+ }
+
+ /* cat5 instruction cannot seem to handle const or relative: */
+ if (is_rel_or_const(coord))
+ needs_mov = true;
+
+	/* The texture sample instructions need the coord in successive
+ * registers/components (ie. src.xy but not src.yx). And TXP
+ * needs the .w component in .z for 2D.. so in some cases we
+ * might need to emit some mov instructions to shuffle things
+ * around:
+ */
+ for (i = 1; (i < 4) && (order[i] >= 0) && !needs_mov; i++)
+ if (src_swiz(coord, i) != (src_swiz(coord, 0) + order[i]))
+ needs_mov = true;
+
+ if (needs_mov) {
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register *tmp_src;
+ unsigned j;
+
+ type_t type_mov = get_ftype(ctx);
+
+ /* need to move things around: */
+ tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+ for (j = 0; (j < 4) && (order[j] >= 0); j++) {
+ instr = instr_create(ctx, 1, 0);
+ instr->cat1.src_type = type_mov;
+ instr->cat1.dst_type = type_mov;
+ add_dst_reg(ctx, instr, &tmp_dst, j);
+ add_src_reg(ctx, instr, coord,
+ src_swiz(coord, order[j]));
+ }
+
+ coord = tmp_src;
+
+ add_nop(ctx, 4 - j);
+ }
+
+ instr = instr_create(ctx, 5, t->opc);
+ instr->cat5.type = get_ftype(ctx);
+ instr->cat5.samp = samp->Index;
+ instr->cat5.tex = samp->Index;
+ instr->flags |= flags;
+
+ r = add_dst_reg(ctx, instr, &inst->Dst[0].Register, 0);
+ r->wrmask = inst->Dst[0].Register.WriteMask;
+
+ add_src_reg(ctx, instr, coord, coord->SwizzleX)->wrmask = src_wrmask;
+
+ /* after add_src_reg() so we don't set (sy) on sam instr itself! */
+ regmask_set(&ctx->needs_sy, r);
+}
+
+/*
+ * SEQ(a,b) = (a == b) ? 1.0 : 0.0
+ * cmps.f.eq tmp0, b, a
+ * cov.u16f16 dst, tmp0
+ *
+ * SNE(a,b) = (a != b) ? 1.0 : 0.0
+ * cmps.f.eq tmp0, b, a
+ * add.s tmp0, tmp0, -1
+ * sel.f16 dst, {0.0}, tmp0, {1.0}
+ *
+ * SGE(a,b) = (a >= b) ? 1.0 : 0.0
+ * cmps.f.ge tmp0, a, b
+ * cov.u16f16 dst, tmp0
+ *
+ * SLE(a,b) = (a <= b) ? 1.0 : 0.0
+ * cmps.f.ge tmp0, b, a
+ * cov.u16f16 dst, tmp0
+ *
+ * SGT(a,b) = (a > b) ? 1.0 : 0.0
+ * cmps.f.ge tmp0, b, a
+ * add.s tmp0, tmp0, -1
+ * sel.f16 dst, {0.0}, tmp0, {1.0}
+ *
+ * SLT(a,b) = (a < b) ? 1.0 : 0.0
+ * cmps.f.ge tmp0, a, b
+ * add.s tmp0, tmp0, -1
+ * sel.f16 dst, {0.0}, tmp0, {1.0}
+ *
+ * CMP(a,b,c) = (a < 0.0) ? b : c
+ * cmps.f.ge tmp0, a, {0.0}
+ * add.s tmp0, tmp0, -1
+ * sel.f16 dst, c, tmp0, b
+ */
+static void
+trans_cmp(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct ir3_instruction *instr;
+ struct tgsi_dst_register tmp_dst;
+ struct tgsi_src_register *tmp_src;
+ struct tgsi_src_register constval0, constval1;
+ /* final instruction for CMP() uses orig src1 and src2: */
+ struct tgsi_dst_register *dst = get_dst(ctx, inst);
+ struct tgsi_src_register *a0, *a1;
+ unsigned condition;
+
+ tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+ switch (t->tgsi_opc) {
+ case TGSI_OPCODE_SEQ:
+ case TGSI_OPCODE_SNE:
+ a0 = &inst->Src[1].Register; /* b */
+ a1 = &inst->Src[0].Register; /* a */
+ condition = IR3_COND_EQ;
+ break;
+ case TGSI_OPCODE_SGE:
+ case TGSI_OPCODE_SLT:
+ a0 = &inst->Src[0].Register; /* a */
+ a1 = &inst->Src[1].Register; /* b */
+ condition = IR3_COND_GE;
+ break;
+ case TGSI_OPCODE_SLE:
+ case TGSI_OPCODE_SGT:
+ a0 = &inst->Src[1].Register; /* b */
+ a1 = &inst->Src[0].Register; /* a */
+ condition = IR3_COND_GE;
+ break;
+ case TGSI_OPCODE_CMP:
+ get_immediate(ctx, &constval0, fui(0.0));
+ a0 = &inst->Src[0].Register; /* a */
+ a1 = &constval0; /* {0.0} */
+ condition = IR3_COND_GE;
+ break;
+ default:
+ compile_assert(ctx, 0);
+ return;
+ }
+
+ if (is_const(a0) && is_const(a1))
+ a0 = get_unconst(ctx, a0);
+
+ /* cmps.f.ge tmp, a0, a1 */
+ instr = instr_create(ctx, 2, OPC_CMPS_F);
+ instr->cat2.condition = condition;
+ vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
+
+ switch (t->tgsi_opc) {
+ case TGSI_OPCODE_SEQ:
+ case TGSI_OPCODE_SGE:
+ case TGSI_OPCODE_SLE:
+ /* cov.u16f16 dst, tmp0 */
+ instr = instr_create(ctx, 1, 0);
+ instr->cat1.src_type = get_utype(ctx);
+ instr->cat1.dst_type = get_ftype(ctx);
+ vectorize(ctx, instr, dst, 1, tmp_src, 0);
+ break;
+ case TGSI_OPCODE_SNE:
+ case TGSI_OPCODE_SGT:
+ case TGSI_OPCODE_SLT:
+ case TGSI_OPCODE_CMP:
+ /* add.s tmp, tmp, -1 */
+ instr = instr_create(ctx, 2, OPC_ADD_S);
+ vectorize(ctx, instr, &tmp_dst, 2, tmp_src, 0, -1, IR3_REG_IMMED);
+
+ if (t->tgsi_opc == TGSI_OPCODE_CMP) {
+ /* sel.{f32,f16} dst, src2, tmp, src1 */
+ instr = instr_create(ctx, 3,
+ ctx->so->key.half_precision ? OPC_SEL_F16 : OPC_SEL_F32);
+ vectorize(ctx, instr, dst, 3,
+ &inst->Src[2].Register, 0,
+ tmp_src, 0,
+ &inst->Src[1].Register, 0);
+ } else {
+ get_immediate(ctx, &constval0, fui(0.0));
+ get_immediate(ctx, &constval1, fui(1.0));
+ /* sel.{f32,f16} dst, {0.0}, tmp0, {1.0} */
+ instr = instr_create(ctx, 3,
+ ctx->so->key.half_precision ? OPC_SEL_F16 : OPC_SEL_F32);
+ vectorize(ctx, instr, dst, 3,
+ &constval0, 0, tmp_src, 0, &constval1, 0);
+ }
+
+ break;
+ }
+
+ put_dst(ctx, inst, dst);
+}
+
+/*
+ * Conditional / Flow control
+ */
+
+static unsigned
+find_instruction(struct ir3_compile_context *ctx, struct ir3_instruction *instr)
+{
+ unsigned i;
+ for (i = 0; i < ctx->ir->instrs_count; i++)
+ if (ctx->ir->instrs[i] == instr)
+ return i;
+ return ~0;
+}
+
+static void
+push_branch(struct ir3_compile_context *ctx, struct ir3_instruction *instr)
+{
+ ctx->branch[ctx->branch_count++] = instr;
+}
+
+static void
+pop_branch(struct ir3_compile_context *ctx)
+{
+ struct ir3_instruction *instr;
+
+ /* if we were clever enough, we'd patch this up after the fact,
+ * and set (jp) flag on whatever the next instruction was, rather
+ * than inserting an extra nop..
+ */
+ instr = instr_create(ctx, 0, OPC_NOP);
+ instr->flags |= IR3_INSTR_JP;
+
+ /* pop the branch instruction from the stack and fix up branch target: */
+ instr = ctx->branch[--ctx->branch_count];
+ instr->cat0.immed = ctx->ir->instrs_count - find_instruction(ctx, instr) - 1;
+}
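+
+/* To illustrate the fixup above (indices made up): if the br was
+ * emitted at instruction index 6 and instrs_count is now 10, then
+ * immed = 10 - 6 - 1 = 3.  Assuming the offset is taken relative to
+ * the instruction following the branch, that skips exactly the three
+ * instructions emitted in between.
+ */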
+
+/* We probably don't really want to translate if/else/endif into branches..
+ * the blob driver evaluates both legs of the if and then uses the sel
+ * instruction to pick which side of the branch to "keep".. but figuring
+ * that out will take somewhat more compiler smarts. So hopefully branches
+ * don't kill performance too badly.
+ */
+static void
+trans_if(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct ir3_instruction *instr;
+ struct tgsi_src_register *src = &inst->Src[0].Register;
+ struct tgsi_src_register constval;
+
+ get_immediate(ctx, &constval, fui(0.0));
+
+ if (is_const(src))
+ src = get_unconst(ctx, src);
+
+ instr = instr_create(ctx, 2, OPC_CMPS_F);
+ ir3_reg_create(instr, regid(REG_P0, 0), 0);
+ add_src_reg(ctx, instr, src, src->SwizzleX);
+ add_src_reg(ctx, instr, &constval, constval.SwizzleX);
+ instr->cat2.condition = IR3_COND_EQ;
+
+ instr = instr_create(ctx, 0, OPC_BR);
+ push_branch(ctx, instr);
+}
+
+static void
+trans_else(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct ir3_instruction *instr;
+
+ /* for first half of if/else/endif, generate a jump past the else: */
+ instr = instr_create(ctx, 0, OPC_JUMP);
+
+ pop_branch(ctx);
+ push_branch(ctx, instr);
+}
+
+static void
+trans_endif(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ pop_branch(ctx);
+}
+
+/*
+ * Handlers for TGSI instructions which do have 1:1 mapping to native
+ * instructions:
+ */
+
+static void
+instr_cat0(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ instr_create(ctx, 0, t->opc);
+}
+
+static void
+instr_cat1(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct tgsi_dst_register *dst = get_dst(ctx, inst);
+ struct tgsi_src_register *src = &inst->Src[0].Register;
+
+ /* mov instructions can't handle a negate on src: */
+ if (src->Negate) {
+ struct tgsi_src_register constval;
+ struct ir3_instruction *instr;
+
+ /* since right now, we are using uniformly either TYPE_F16 or
+ * TYPE_F32, and we don't utilize the conversion possibilities
+ * of mov instructions, we can get away with substituting an
+ * add.f which can handle negate. Might need to revisit this
+ * in the future if we start supporting widening/narrowing or
+ * conversion to/from integer..
+ */
+ instr = instr_create(ctx, 2, OPC_ADD_F);
+ get_immediate(ctx, &constval, fui(0.0));
+ vectorize(ctx, instr, dst, 2, src, 0, &constval, 0);
+ } else {
+ create_mov(ctx, dst, src);
+ /* create_mov() generates vector sequence, so no vectorize() */
+ }
+ put_dst(ctx, inst, dst);
+}
+
+static void
+instr_cat2(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct tgsi_dst_register *dst = get_dst(ctx, inst);
+ struct tgsi_src_register *src0 = &inst->Src[0].Register;
+ struct tgsi_src_register *src1 = &inst->Src[1].Register;
+ struct ir3_instruction *instr;
+ unsigned src0_flags = 0, src1_flags = 0;
+
+ switch (t->tgsi_opc) {
+ case TGSI_OPCODE_ABS:
+ src0_flags = IR3_REG_ABS;
+ break;
+ case TGSI_OPCODE_SUB:
+ src1_flags = IR3_REG_NEGATE;
+ break;
+ }
+
+ switch (t->opc) {
+ case OPC_ABSNEG_F:
+ case OPC_ABSNEG_S:
+ case OPC_CLZ_B:
+ case OPC_CLZ_S:
+ case OPC_SIGN_F:
+ case OPC_FLOOR_F:
+ case OPC_CEIL_F:
+ case OPC_RNDNE_F:
+ case OPC_RNDAZ_F:
+ case OPC_TRUNC_F:
+ case OPC_NOT_B:
+ case OPC_BFREV_B:
+ case OPC_SETRM:
+ case OPC_CBITS_B:
+ /* these only have one src reg */
+ instr = instr_create(ctx, 2, t->opc);
+ vectorize(ctx, instr, dst, 1, src0, src0_flags);
+ break;
+ default:
+ if (is_const(src0) && is_const(src1))
+ src0 = get_unconst(ctx, src0);
+
+ instr = instr_create(ctx, 2, t->opc);
+ vectorize(ctx, instr, dst, 2, src0, src0_flags,
+ src1, src1_flags);
+ break;
+ }
+
+ put_dst(ctx, inst, dst);
+}
+
+static void
+instr_cat3(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct tgsi_dst_register *dst = get_dst(ctx, inst);
+ struct tgsi_src_register *src0 = &inst->Src[0].Register;
+ struct tgsi_src_register *src1 = &inst->Src[1].Register;
+ struct ir3_instruction *instr;
+
+	/* in particular, can't handle const or relative for src1 for
+	 * cat3..  for mad, we can swap the first two src's if needed:
+	 */
+ if (is_rel_or_const(src1)) {
+ if (is_mad(t->opc) && !is_rel_or_const(src0)) {
+ struct tgsi_src_register *tmp;
+ tmp = src0;
+ src0 = src1;
+ src1 = tmp;
+ } else {
+ src1 = get_unconst(ctx, src1);
+ }
+ }
+
+ instr = instr_create(ctx, 3,
+ ctx->so->key.half_precision ? t->hopc : t->opc);
+ vectorize(ctx, instr, dst, 3, src0, 0, src1, 0,
+ &inst->Src[2].Register, 0);
+ put_dst(ctx, inst, dst);
+}
+
+static void
+instr_cat4(const struct instr_translater *t,
+ struct ir3_compile_context *ctx,
+ struct tgsi_full_instruction *inst)
+{
+ struct tgsi_dst_register *dst = get_dst(ctx, inst);
+ struct tgsi_src_register *src = &inst->Src[0].Register;
+ struct ir3_instruction *instr;
+ unsigned i, n;
+
+ /* seems like blob compiler avoids const as src.. */
+ if (is_const(src))
+ src = get_unconst(ctx, src);
+
+ /* worst case: */
+ add_nop(ctx, 6);
+
+ /* we need to replicate into each component: */
+ for (i = 0, n = 0; i < 4; i++) {
+ if (dst->WriteMask & (1 << i)) {
+ if (n++)
+ add_nop(ctx, 1);
+ instr = instr_create(ctx, 4, t->opc);
+ add_dst_reg(ctx, instr, dst, i);
+ add_src_reg(ctx, instr, src, src->SwizzleX);
+ }
+ }
+
+ regmask_set(&ctx->needs_ss, instr->regs[0]);
+ put_dst(ctx, inst, dst);
+}
+
+static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
+#define INSTR(n, f, ...) \
+ [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
+
+ INSTR(MOV, instr_cat1),
+ INSTR(RCP, instr_cat4, .opc = OPC_RCP),
+ INSTR(RSQ, instr_cat4, .opc = OPC_RSQ),
+ INSTR(SQRT, instr_cat4, .opc = OPC_SQRT),
+ INSTR(MUL, instr_cat2, .opc = OPC_MUL_F),
+ INSTR(ADD, instr_cat2, .opc = OPC_ADD_F),
+ INSTR(SUB, instr_cat2, .opc = OPC_ADD_F),
+ INSTR(MIN, instr_cat2, .opc = OPC_MIN_F),
+ INSTR(MAX, instr_cat2, .opc = OPC_MAX_F),
+ INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
+ INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F),
+ INSTR(CLAMP, trans_clamp),
+ INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F),
+ INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F),
+ INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F),
+ INSTR(ARL, trans_arl),
+ INSTR(EX2, instr_cat4, .opc = OPC_EXP2),
+ INSTR(LG2, instr_cat4, .opc = OPC_LOG2),
+ INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F),
+ INSTR(COS, instr_cat4, .opc = OPC_COS),
+ INSTR(SIN, instr_cat4, .opc = OPC_SIN),
+ INSTR(TEX, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX),
+ INSTR(TXP, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP),
+ INSTR(SGT, trans_cmp),
+ INSTR(SLT, trans_cmp),
+ INSTR(SGE, trans_cmp),
+ INSTR(SLE, trans_cmp),
+ INSTR(SNE, trans_cmp),
+ INSTR(SEQ, trans_cmp),
+ INSTR(CMP, trans_cmp),
+ INSTR(IF, trans_if),
+ INSTR(ELSE, trans_else),
+ INSTR(ENDIF, trans_endif),
+ INSTR(END, instr_cat0, .opc = OPC_END),
+ INSTR(KILL, instr_cat0, .opc = OPC_KILL),
+};
+
+static ir3_semantic
+decl_semantic(const struct tgsi_declaration_semantic *sem)
+{
+ return ir3_semantic_name(sem->Name, sem->Index);
+}
+
+static int
+decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
+{
+ struct ir3_shader_variant *so = ctx->so;
+ unsigned base = ctx->base_reg[TGSI_FILE_INPUT];
+ unsigned i, flags = 0;
+ int nop = 0;
+
+ /* I don't think we should get frag shader input without
+ * semantic info? Otherwise how do inputs get linked to
+ * vert outputs?
+ */
+ compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
+ decl->Declaration.Semantic);
+
+ if (ctx->so->key.half_precision)
+ flags |= IR3_REG_HALF;
+
+ for (i = decl->Range.First; i <= decl->Range.Last; i++) {
+ unsigned n = so->inputs_count++;
+ unsigned r = regid(i + base, 0);
+ unsigned ncomp;
+
+ /* TODO use ctx->info.input_usage_mask[decl->Range.n] to figure out ncomp: */
+ ncomp = 4;
+
+ DBG("decl in -> r%d", i + base); // XXX
+
+ compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
+
+ so->inputs[n].semantic = decl_semantic(&decl->Semantic);
+ so->inputs[n].compmask = (1 << ncomp) - 1;
+ so->inputs[n].ncomp = ncomp;
+ so->inputs[n].regid = r;
+ so->inputs[n].inloc = ctx->next_inloc;
+ so->inputs[n].bary = true; /* all that is supported */
+ ctx->next_inloc += ncomp;
+
+ so->total_in += ncomp;
+
+ /* for frag shaders, we need to generate the corresponding bary instr: */
+ if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+ unsigned j;
+
+ for (j = 0; j < ncomp; j++) {
+ struct ir3_instruction *instr;
+ struct ir3_register *dst;
+
+ instr = instr_create(ctx, 2, OPC_BARY_F);
+
+ /* dst register: */
+ dst = ir3_reg_create(instr, r + j, flags);
+ ctx->last_input = dst;
+
+ /* input position: */
+ ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val =
+ so->inputs[n].inloc + j - 8;
+
+ /* input base (always r0.xy): */
+ ir3_reg_create(instr, regid(0,0), 0)->wrmask = 0x3;
+ }
+
+ nop = 6;
+ }
+ }
+
+ return nop;
+}
+
+static void
+decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
+{
+ struct ir3_shader_variant *so = ctx->so;
+ unsigned base = ctx->base_reg[TGSI_FILE_OUTPUT];
+ unsigned comp = 0;
+ unsigned name = decl->Semantic.Name;
+ unsigned i;
+
+ compile_assert(ctx, decl->Declaration.Semantic); // TODO is this ever not true?
+
+ DBG("decl out[%d] -> r%d", name, decl->Range.First + base); // XXX
+
+ if (ctx->type == TGSI_PROCESSOR_VERTEX) {
+ switch (name) {
+ case TGSI_SEMANTIC_POSITION:
+ so->writes_pos = true;
+ break;
+ case TGSI_SEMANTIC_PSIZE:
+ so->writes_psize = true;
+ break;
+ case TGSI_SEMANTIC_COLOR:
+ case TGSI_SEMANTIC_BCOLOR:
+ case TGSI_SEMANTIC_GENERIC:
+ case TGSI_SEMANTIC_FOG:
+ case TGSI_SEMANTIC_TEXCOORD:
+ break;
+ default:
+ compile_error(ctx, "unknown VS semantic name: %s\n",
+ tgsi_semantic_names[name]);
+ }
+ } else {
+ switch (name) {
+ case TGSI_SEMANTIC_POSITION:
+ comp = 2; /* tgsi will write to .z component */
+ so->writes_pos = true;
+ break;
+ case TGSI_SEMANTIC_COLOR:
+ break;
+ default:
+ compile_error(ctx, "unknown FS semantic name: %s\n",
+ tgsi_semantic_names[name]);
+ }
+ }
+
+ for (i = decl->Range.First; i <= decl->Range.Last; i++) {
+ unsigned n = so->outputs_count++;
+ compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
+ so->outputs[n].semantic = decl_semantic(&decl->Semantic);
+ so->outputs[n].regid = regid(i + base, comp);
+ }
+}
+
+static void
+decl_samp(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
+{
+ ctx->so->has_samp = true;
+}
+
+static void
+compile_instructions(struct ir3_compile_context *ctx)
+{
+ struct ir3 *ir = ctx->ir;
+ int nop = 0;
+
+ while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
+ tgsi_parse_token(&ctx->parser);
+
+ switch (ctx->parser.FullToken.Token.Type) {
+ case TGSI_TOKEN_TYPE_DECLARATION: {
+ struct tgsi_full_declaration *decl =
+ &ctx->parser.FullToken.FullDeclaration;
+ if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
+ decl_out(ctx, decl);
+ } else if (decl->Declaration.File == TGSI_FILE_INPUT) {
+ nop = decl_in(ctx, decl);
+ } else if (decl->Declaration.File == TGSI_FILE_SAMPLER) {
+ decl_samp(ctx, decl);
+ }
+ break;
+ }
+ case TGSI_TOKEN_TYPE_IMMEDIATE: {
+ /* TODO: if we know the immediate is small enough, and only
+ * used with instructions that can embed an immediate, we
+ * can skip this:
+ */
+ struct tgsi_full_immediate *imm =
+ &ctx->parser.FullToken.FullImmediate;
+ unsigned n = ctx->so->immediates_count++;
+ memcpy(ctx->so->immediates[n].val, imm->u, 16);
+ break;
+ }
+ case TGSI_TOKEN_TYPE_INSTRUCTION: {
+ struct tgsi_full_instruction *inst =
+ &ctx->parser.FullToken.FullInstruction;
+ unsigned opc = inst->Instruction.Opcode;
+ const struct instr_translater *t = &translaters[opc];
+
+ add_nop(ctx, nop);
+ nop = 0;
+
+ if (t->fxn) {
+ t->fxn(t, ctx, inst);
+ ctx->num_internal_temps = 0;
+ } else {
+ compile_error(ctx, "unknown TGSI opc: %s\n",
+ tgsi_get_opcode_name(opc));
+ }
+
+ switch (inst->Instruction.Saturate) {
+ case TGSI_SAT_ZERO_ONE:
+ create_clamp_imm(ctx, &inst->Dst[0].Register,
+ fui(0.0), fui(1.0));
+ break;
+ case TGSI_SAT_MINUS_PLUS_ONE:
+ create_clamp_imm(ctx, &inst->Dst[0].Register,
+ fui(-1.0), fui(1.0));
+ break;
+ }
+
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ if (ir->instrs_count > 0)
+ ir->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+
+ if (ctx->last_input)
+ ctx->last_input->flags |= IR3_REG_EI;
+
+ handle_last_rel(ctx);
+}
+
+int
+ir3_compile_shader_old(struct ir3_shader_variant *so,
+ const struct tgsi_token *tokens, struct ir3_shader_key key)
+{
+ struct ir3_compile_context ctx;
+
+ assert(!so->ir);
+
+ so->ir = ir3_create();
+
+ assert(so->ir);
+
+ if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK)
+ return -1;
+
+ compile_instructions(&ctx);
+
+ compile_free(&ctx);
+
+ return 0;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
new file mode 100644
index 00000000000..73c2a27c6eb
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -0,0 +1,158 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include "ir3.h"
+
+/*
+ * Copy Propagate:
+ *
+ * TODO probably want some sort of visitor interface to
+ * avoid duplicating the same graph traversal logic everywhere..
+ *
+ */
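+
+/* A rough sketch of the transform (SSA value names illustrative):
+ *
+ *    add.f ssa_1, ssa_2, ssa_3
+ *    mov   ssa_4, ssa_1        <- eligible: same src/dst type, no
+ *                                 abs/neg/relative modifiers
+ *    mul.f ssa_5, ssa_4, ssa_6
+ *
+ * collapses to:
+ *
+ *    add.f ssa_1, ssa_2, ssa_3
+ *    mul.f ssa_5, ssa_1, ssa_6
+ */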
+
+static void block_cp(struct ir3_block *block);
+static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, bool keep);
+
+static bool is_eligible_mov(struct ir3_instruction *instr)
+{
+ if ((instr->category == 1) &&
+ (instr->cat1.src_type == instr->cat1.dst_type)) {
+ struct ir3_register *dst = instr->regs[0];
+ struct ir3_register *src = instr->regs[1];
+ if (dst->flags & IR3_REG_ADDR)
+ return false;
+ if ((src->flags & IR3_REG_SSA) &&
+ /* TODO: propagate abs/neg modifiers if possible */
+ !(src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV)))
+ return true;
+ }
+ return false;
+}
+
+static void walk_children(struct ir3_instruction *instr, bool keep)
+{
+ unsigned i;
+
+ /* walk down the graph from each src: */
+ for (i = 1; i < instr->regs_count; i++) {
+ struct ir3_register *src = instr->regs[i];
+ if (src->flags & IR3_REG_SSA)
+ src->instr = instr_cp(src->instr, keep);
+ }
+}
+
+static struct ir3_instruction *
+instr_cp_fanin(struct ir3_instruction *instr)
+{
+ unsigned i;
+
+ /* we need to handle fanin specially, to detect cases
+ * when we need to keep a mov
+ */
+
+ for (i = 1; i < instr->regs_count; i++) {
+ struct ir3_register *src = instr->regs[i];
+ if (src->flags & IR3_REG_SSA) {
+ struct ir3_instruction *cand =
+ instr_cp(src->instr, false);
+
+ /* if the candidate is a fanout, then keep
+ * the move.
+ *
+ * This is a bit, um, fragile, but it should
+ * catch the extra mov's that the front-end
+ * puts in for us already in these cases.
+ */
+ if (is_meta(cand) && (cand->opc == OPC_META_FO))
+ cand = instr_cp(src->instr, true);
+
+ src->instr = cand;
+ }
+ }
+
+ walk_children(instr, false);
+
+ return instr;
+}
+
+static struct ir3_instruction *
+instr_cp(struct ir3_instruction *instr, bool keep)
+{
+ /* if we've already visited this instruction, bail now: */
+ if (ir3_instr_check_mark(instr))
+ return instr;
+
+ if (is_meta(instr) && (instr->opc == OPC_META_FI))
+ return instr_cp_fanin(instr);
+
+ if (is_eligible_mov(instr) && !keep) {
+ struct ir3_register *src = instr->regs[1];
+ return instr_cp(src->instr, false);
+ }
+
+ walk_children(instr, false);
+
+ return instr;
+}
+
+static void block_cp(struct ir3_block *block)
+{
+ unsigned i, j;
+
+ for (i = 0; i < block->noutputs; i++) {
+ if (block->outputs[i]) {
+ struct ir3_instruction *out =
+ instr_cp(block->outputs[i], false);
+
+ /* To deal with things like this:
+ *
+ * 43: MOV OUT[2], TEMP[5]
+ * 44: MOV OUT[0], TEMP[5]
+ *
+ * we need to ensure that no two outputs point to
+ * the same instruction
+ */
+ for (j = 0; j < i; j++) {
+ if (block->outputs[j] == out) {
+ out = instr_cp(block->outputs[i], true);
+ break;
+ }
+ }
+
+ block->outputs[i] = out;
+ }
+ }
+}
+
+void ir3_block_cp(struct ir3_block *block)
+{
+ ir3_clear_mark(block->shader);
+ block_cp(block);
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
new file mode 100644
index 00000000000..dcc0362f0c8
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -0,0 +1,159 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+/*
+ * Instruction Depth:
+ *
+ * Calculates weighted instruction depth, ie. the sum of # of needed
+ * instructions plus delay slots back to the original input (ie. INPUT
+ * or CONST).  That is to say, an instruction's depth is:
+ *
+ * depth(instr) {
+ * d = 0;
+ * // for each src register:
+ * foreach (src in instr->regs[1..n])
+ * d = max(d, delayslots(src->instr, n) + depth(src->instr));
+ * return d + 1;
+ * }
+ *
+ * After an instruction's depth is calculated, it is inserted into the
+ * block's depth-sorted list, which is used by the scheduling pass.
+ */
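+
+/* A small worked example (values illustrative): an alu add fed by
+ * INPUTs and feeding an alu mul:
+ *
+ *    depth(add) = 0 + 1 = 1      (meta INPUT srcs contribute nothing)
+ *    depth(mul) = delayslots(add, mul, 0) + depth(add) + 1
+ *               = 3 + 1 + 1 = 5
+ *
+ * using the alu -> alu delay of 3 cycles from ir3_delayslots() below.
+ */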
+
+/* calculate required # of delay slots between the instruction that
+ * assigns a value and the one that consumes it
+ */
+int ir3_delayslots(struct ir3_instruction *assigner,
+ struct ir3_instruction *consumer, unsigned n)
+{
+ /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
+	 * alu -> alu needs 3 cycles, and cat4 -> alu and texture fetch
+	 * are handled with sync bits
+ */
+
+ if (is_meta(assigner))
+ return 0;
+
+ if (writes_addr(assigner))
+ return 6;
+
+ /* handled via sync flags: */
+ if (is_sfu(assigner) || is_tex(assigner))
+ return 0;
+
+ /* assigner must be alu: */
+ if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer)) {
+ return 6;
+ } else if ((consumer->category == 3) &&
+ is_mad(consumer->opc) && (n == 2)) {
+ /* special case, 3rd src to cat3 not required on first cycle */
+ return 1;
+ } else {
+ return 3;
+ }
+}
+
+static void insert_by_depth(struct ir3_instruction *instr)
+{
+ struct ir3_block *block = instr->block;
+ struct ir3_instruction *n = block->head;
+ struct ir3_instruction *p = NULL;
+
+ while (n && (n != instr) && (n->depth > instr->depth)) {
+ p = n;
+ n = n->next;
+ }
+
+ instr->next = n;
+ if (p)
+ p->next = instr;
+ else
+ block->head = instr;
+}
+
+static void ir3_instr_depth(struct ir3_instruction *instr)
+{
+ unsigned i;
+
+ /* if we've already visited this instruction, bail now: */
+ if (ir3_instr_check_mark(instr))
+ return;
+
+ instr->depth = 0;
+
+ for (i = 1; i < instr->regs_count; i++) {
+ struct ir3_register *src = instr->regs[i];
+ if (src->flags & IR3_REG_SSA) {
+ unsigned sd;
+
+			/* visit child to compute its depth: */
+ ir3_instr_depth(src->instr);
+
+ sd = ir3_delayslots(src->instr, instr, i-1) +
+ src->instr->depth;
+
+ instr->depth = MAX2(instr->depth, sd);
+ }
+ }
+
+ /* meta-instructions don't add cycles, other than PHI.. which
+ * might translate to a real instruction..
+ *
+	 * well, not entirely true, fan-in/out, etc might need to
+	 * generate some extra mov's in edge cases, etc.. probably
+ * we might want to do depth calculation considering the worst
+ * case for these??
+ */
+ if (!is_meta(instr))
+ instr->depth++;
+
+ insert_by_depth(instr);
+}
+
+void ir3_block_depth(struct ir3_block *block)
+{
+ unsigned i;
+
+ block->head = NULL;
+
+ ir3_clear_mark(block->shader);
+ for (i = 0; i < block->noutputs; i++)
+ if (block->outputs[i])
+ ir3_instr_depth(block->outputs[i]);
+
+ /* at this point, any unvisited input is unused: */
+ for (i = 0; i < block->ninputs; i++) {
+ struct ir3_instruction *in = block->inputs[i];
+ if (in && !ir3_instr_check_mark(in))
+ block->inputs[i] = NULL;
+ }
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_dump.c b/src/gallium/drivers/freedreno/ir3/ir3_dump.c
new file mode 100644
index 00000000000..1a6f49d51cd
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_dump.c
@@ -0,0 +1,425 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include <stdarg.h>
+
+#include "ir3.h"
+
+#define PTRID(x) ((unsigned long)(x))
+
+struct ir3_dump_ctx {
+ FILE *f;
+ bool verbose;
+};
+
+static void dump_instr_name(struct ir3_dump_ctx *ctx,
+ struct ir3_instruction *instr)
+{
+ /* for debugging: */
+ if (ctx->verbose) {
+#ifdef DEBUG
+ fprintf(ctx->f, "%04u:", instr->serialno);
+#endif
+ fprintf(ctx->f, "%03u: ", instr->depth);
+ }
+
+ if (instr->flags & IR3_INSTR_SY)
+ fprintf(ctx->f, "(sy)");
+ if (instr->flags & IR3_INSTR_SS)
+ fprintf(ctx->f, "(ss)");
+
+ if (is_meta(instr)) {
+ switch(instr->opc) {
+ case OPC_META_PHI:
+ fprintf(ctx->f, "&#934;");
+ break;
+ case OPC_META_DEREF:
+ fprintf(ctx->f, "(*)");
+ break;
+ default:
+ /* shouldn't hit here.. just for debugging: */
+ switch (instr->opc) {
+ case OPC_META_INPUT: fprintf(ctx->f, "_meta:in"); break;
+ case OPC_META_OUTPUT: fprintf(ctx->f, "_meta:out"); break;
+ case OPC_META_FO: fprintf(ctx->f, "_meta:fo"); break;
+ case OPC_META_FI: fprintf(ctx->f, "_meta:fi"); break;
+ case OPC_META_FLOW: fprintf(ctx->f, "_meta:flow"); break;
+
+ default: fprintf(ctx->f, "_meta:%d", instr->opc); break;
+ }
+ break;
+ }
+ } else if (instr->category == 1) {
+ static const char *type[] = {
+ [TYPE_F16] = "f16",
+ [TYPE_F32] = "f32",
+ [TYPE_U16] = "u16",
+ [TYPE_U32] = "u32",
+ [TYPE_S16] = "s16",
+ [TYPE_S32] = "s32",
+ [TYPE_U8] = "u8",
+ [TYPE_S8] = "s8",
+ };
+ if (instr->cat1.src_type == instr->cat1.dst_type)
+ fprintf(ctx->f, "mov");
+ else
+ fprintf(ctx->f, "cov");
+ fprintf(ctx->f, ".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]);
+ } else {
+ fprintf(ctx->f, "%s", ir3_instr_name(instr));
+ if (instr->flags & IR3_INSTR_3D)
+ fprintf(ctx->f, ".3d");
+ if (instr->flags & IR3_INSTR_A)
+ fprintf(ctx->f, ".a");
+ if (instr->flags & IR3_INSTR_O)
+ fprintf(ctx->f, ".o");
+ if (instr->flags & IR3_INSTR_P)
+ fprintf(ctx->f, ".p");
+ if (instr->flags & IR3_INSTR_S)
+ fprintf(ctx->f, ".s");
+ if (instr->flags & IR3_INSTR_S2EN)
+ fprintf(ctx->f, ".s2en");
+ }
+}
+
+static void dump_reg_name(struct ir3_dump_ctx *ctx,
+ struct ir3_register *reg)
+{
+ if ((reg->flags & IR3_REG_ABS) && (reg->flags & IR3_REG_NEGATE))
+ fprintf(ctx->f, "(absneg)");
+ else if (reg->flags & IR3_REG_NEGATE)
+ fprintf(ctx->f, "(neg)");
+ else if (reg->flags & IR3_REG_ABS)
+ fprintf(ctx->f, "(abs)");
+
+ if (reg->flags & IR3_REG_IMMED) {
+ fprintf(ctx->f, "imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
+ } else if (reg->flags & IR3_REG_SSA) {
+ if (ctx->verbose) {
+ fprintf(ctx->f, "_[");
+ dump_instr_name(ctx, reg->instr);
+ fprintf(ctx->f, "]");
+ }
+ } else {
+ if (reg->flags & IR3_REG_HALF)
+ fprintf(ctx->f, "h");
+ if (reg->flags & IR3_REG_CONST)
+ fprintf(ctx->f, "c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]);
+ else
+ fprintf(ctx->f, "r%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]);
+ }
+}
+
+static void ir3_instr_dump(struct ir3_dump_ctx *ctx,
+ struct ir3_instruction *instr);
+static void ir3_block_dump(struct ir3_dump_ctx *ctx,
+ struct ir3_block *block, const char *name);
+
+static void dump_instr(struct ir3_dump_ctx *ctx,
+ struct ir3_instruction *instr)
+{
+ /* if we've already visited this instruction, bail now: */
+ if (ir3_instr_check_mark(instr))
+ return;
+
+ /* some meta-instructions need to be handled specially: */
+ if (is_meta(instr)) {
+ if ((instr->opc == OPC_META_FO) ||
+ (instr->opc == OPC_META_FI)) {
+ unsigned i;
+ for (i = 1; i < instr->regs_count; i++) {
+ struct ir3_register *reg = instr->regs[i];
+ if (reg->flags & IR3_REG_SSA)
+ dump_instr(ctx, reg->instr);
+ }
+ } else if (instr->opc == OPC_META_FLOW) {
+ struct ir3_register *reg = instr->regs[1];
+ ir3_block_dump(ctx, instr->flow.if_block, "if");
+ if (instr->flow.else_block)
+ ir3_block_dump(ctx, instr->flow.else_block, "else");
+ if (reg->flags & IR3_REG_SSA)
+ dump_instr(ctx, reg->instr);
+ } else if ((instr->opc == OPC_META_PHI) ||
+ (instr->opc == OPC_META_DEREF)) {
+ /* treat like a normal instruction: */
+ ir3_instr_dump(ctx, instr);
+ }
+ } else {
+ ir3_instr_dump(ctx, instr);
+ }
+}
+
+/* arrarraggh! if link is to something outside of the current block, we
+ * need to defer emitting the link until the end of the block, since the
+ * edge triggers pre-creation of the node it links to inside the cluster,
+ * even though it is meant to be outside..
+ */
+static struct {
+ char buf[40960];
+ unsigned n;
+} edge_buf;
+
+/* helper to print or defer: */
+static void printdef(struct ir3_dump_ctx *ctx,
+ bool defer, const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ if (defer) {
+ unsigned n = edge_buf.n;
+ n += vsnprintf(&edge_buf.buf[n], sizeof(edge_buf.buf) - n,
+ fmt, ap);
+ edge_buf.n = n;
+ } else {
+ vfprintf(ctx->f, fmt, ap);
+ }
+ va_end(ap);
+}
+
+static void dump_link2(struct ir3_dump_ctx *ctx,
+ struct ir3_instruction *instr, const char *target, bool defer)
+{
+ /* some meta-instructions need to be handled specially: */
+ if (is_meta(instr)) {
+ if (instr->opc == OPC_META_INPUT) {
+ printdef(ctx, defer, "input%lx:<in%u>:w -> %s",
+ PTRID(instr->inout.block),
+ instr->regs[0]->num, target);
+ } else if (instr->opc == OPC_META_FO) {
+ struct ir3_register *reg = instr->regs[1];
+ dump_link2(ctx, reg->instr, target, defer);
+ printdef(ctx, defer, "[label=\".%c\"]",
+ "xyzw"[instr->fo.off & 0x3]);
+ } else if (instr->opc == OPC_META_FI) {
+ unsigned i;
+
+ /* recursively dump all parents and links */
+ for (i = 1; i < instr->regs_count; i++) {
+ struct ir3_register *reg = instr->regs[i];
+ if (reg->flags & IR3_REG_SSA) {
+ dump_link2(ctx, reg->instr, target, defer);
+ printdef(ctx, defer, "[label=\".%c\"]",
+ "xyzw"[(i - 1) & 0x3]);
+ }
+ }
+ } else if (instr->opc == OPC_META_OUTPUT) {
+ printdef(ctx, defer, "output%lx:<out%u>:w -> %s",
+ PTRID(instr->inout.block),
+ instr->regs[0]->num, target);
+ } else if ((instr->opc == OPC_META_PHI) ||
+ (instr->opc == OPC_META_DEREF)) {
+ /* treat like a normal instruction: */
+ printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target);
+ }
+ } else {
+ printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target);
+ }
+}
+
+static void dump_link(struct ir3_dump_ctx *ctx,
+ struct ir3_instruction *instr,
+ struct ir3_block *block, const char *target)
+{
+ bool defer = instr->block != block;
+ dump_link2(ctx, instr, target, defer);
+ printdef(ctx, defer, "\n");
+}
+
+static struct ir3_register *follow_flow(struct ir3_register *reg)
+{
+ if (reg->flags & IR3_REG_SSA) {
+ struct ir3_instruction *instr = reg->instr;
+ /* go with the flow.. */
+ if (is_meta(instr) && (instr->opc == OPC_META_FLOW))
+ return instr->regs[1];
+ }
+ return reg;
+}
+
+static void ir3_instr_dump(struct ir3_dump_ctx *ctx,
+ struct ir3_instruction *instr)
+{
+ unsigned i;
+
+ fprintf(ctx->f, "instr%lx [shape=record,style=filled,fillcolor=lightgrey,label=\"{",
+ PTRID(instr));
+ dump_instr_name(ctx, instr);
+
+ /* destination register: */
+ fprintf(ctx->f, "|<dst0>");
+
+ /* source register(s): */
+ for (i = 1; i < instr->regs_count; i++) {
+ struct ir3_register *reg = follow_flow(instr->regs[i]);
+
+ fprintf(ctx->f, "|");
+
+ if (reg->flags & IR3_REG_SSA)
+ fprintf(ctx->f, "<src%u> ", (i - 1));
+
+ dump_reg_name(ctx, reg);
+ }
+
+ fprintf(ctx->f, "}\"];\n");
+
+ /* and recursively dump dependent instructions: */
+ for (i = 1; i < instr->regs_count; i++) {
+ struct ir3_register *reg = instr->regs[i];
+ char target[32]; /* link target */
+
+ if (!(reg->flags & IR3_REG_SSA))
+ continue;
+
+ snprintf(target, sizeof(target), "instr%lx:<src%u>",
+ PTRID(instr), (i - 1));
+
+ dump_instr(ctx, reg->instr);
+ dump_link(ctx, follow_flow(reg)->instr, instr->block, target);
+ }
+}
+
+static void ir3_block_dump(struct ir3_dump_ctx *ctx,
+ struct ir3_block *block, const char *name)
+{
+ unsigned i, n;
+
+ n = edge_buf.n;
+
+ fprintf(ctx->f, "subgraph cluster%lx {\n", PTRID(block));
+ fprintf(ctx->f, "label=\"%s\";\n", name);
+
+ /* draw inputs: */
+ fprintf(ctx->f, "input%lx [shape=record,label=\"inputs", PTRID(block));
+ for (i = 0; i < block->ninputs; i++)
+ if (block->inputs[i])
+ fprintf(ctx->f, "|<in%u> i%u.%c", i, (i >> 2), "xyzw"[i & 0x3]);
+ fprintf(ctx->f, "\"];\n");
+
+ /* draw instruction graph: */
+ for (i = 0; i < block->noutputs; i++)
+ dump_instr(ctx, block->outputs[i]);
+
+ /* draw outputs: */
+ fprintf(ctx->f, "output%lx [shape=record,label=\"outputs", PTRID(block));
+ for (i = 0; i < block->noutputs; i++)
+ fprintf(ctx->f, "|<out%u> o%u.%c", i, (i >> 2), "xyzw"[i & 0x3]);
+ fprintf(ctx->f, "\"];\n");
+
+ /* and links to outputs: */
+ for (i = 0; i < block->noutputs; i++) {
+ char target[32]; /* link target */
+
+ /* NOTE: there could be outputs that are never assigned,
+ * so skip them
+ */
+ if (!block->outputs[i])
+ continue;
+
+ snprintf(target, sizeof(target), "output%lx:<out%u>:e",
+ PTRID(block), i);
+
+ dump_link(ctx, block->outputs[i], block, target);
+ }
+
+ fprintf(ctx->f, "}\n");
+
+ /* and links to inputs: */
+ if (block->parent) {
+ for (i = 0; i < block->ninputs; i++) {
+ char target[32]; /* link target */
+
+ if (!block->inputs[i])
+ continue;
+
+ dump_instr(ctx, block->inputs[i]);
+
+ snprintf(target, sizeof(target), "input%lx:<in%u>:e",
+ PTRID(block), i);
+
+ dump_link(ctx, block->inputs[i], block, target);
+ }
+ }
+
+ /* dump deferred edges: */
+ if (edge_buf.n > n) {
+ fprintf(ctx->f, "%*s", edge_buf.n - n, &edge_buf.buf[n]);
+ edge_buf.n = n;
+ }
+}
+
+void ir3_dump(struct ir3 *shader, const char *name,
+ struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? */,
+ FILE *f)
+{
+ struct ir3_dump_ctx ctx = {
+ .f = f,
+ };
+ ir3_clear_mark(shader);
+ fprintf(ctx.f, "digraph G {\n");
+ fprintf(ctx.f, "rankdir=RL;\n");
+ fprintf(ctx.f, "nodesep=0.25;\n");
+ fprintf(ctx.f, "ranksep=1.5;\n");
+ ir3_block_dump(&ctx, block, name);
+ fprintf(ctx.f, "}\n");
+}
+
+/*
+ * For Debugging:
+ */
+
+void
+ir3_dump_instr_single(struct ir3_instruction *instr)
+{
+ struct ir3_dump_ctx ctx = {
+ .f = stdout,
+ .verbose = true,
+ };
+ unsigned i;
+
+ dump_instr_name(&ctx, instr);
+ for (i = 0; i < instr->regs_count; i++) {
+ struct ir3_register *reg = instr->regs[i];
+ printf(i ? ", " : " ");
+ dump_reg_name(&ctx, reg);
+ }
+ printf("\n");
+}
+
+void
+ir3_dump_instr_list(struct ir3_instruction *instr)
+{
+ unsigned n = 0;
+
+ while (instr) {
+ ir3_dump_instr_single(instr);
+ if (!is_meta(instr))
+ n++;
+ instr = instr->next;
+ }
+ printf("%u instructions\n", n);
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_flatten.c b/src/gallium/drivers/freedreno/ir3/ir3_flatten.c
new file mode 100644
index 00000000000..9389227034c
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_flatten.c
@@ -0,0 +1,155 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include <stdarg.h>
+
+#include "ir3.h"
+
+/*
+ * Flatten: flatten out legs of if/else, etc
+ *
+ * TODO probably should use some heuristic to decide to not flatten
+ * if one side or the other is too large / deeply nested / whatever?
+ */
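+
+/* A sketch of the basic transform (operand names illustrative): a phi
+ * fed by both legs of an if/else,
+ *
+ *    phi dst, cond, t, f
+ *
+ * becomes, once both legs live in the same block,
+ *
+ *    sel.b32 dst, t, cond, f
+ *
+ * matching the operand order set up in ir3_instr_flatten() below.
+ */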
+
+struct ir3_flatten_ctx {
+ struct ir3_block *block;
+ unsigned cnt;
+};
+
+static struct ir3_register *unwrap(struct ir3_register *reg)
+{
+ if (reg->flags & IR3_REG_SSA) {
+ struct ir3_instruction *instr = reg->instr;
+ if (is_meta(instr)) {
+ switch (instr->opc) {
+ case OPC_META_OUTPUT:
+ case OPC_META_FLOW:
+ if (instr->regs_count > 1)
+ return instr->regs[1];
+ return NULL;
+ default:
+ break;
+ }
+ }
+ }
+ return reg;
+}
+
+static void ir3_instr_flatten(struct ir3_flatten_ctx *ctx,
+ struct ir3_instruction *instr)
+{
+ unsigned i;
+
+ /* if we've already visited this instruction, bail now: */
+ if (ir3_instr_check_mark(instr))
+ return;
+
+ instr->block = ctx->block;
+
+ /* TODO: maybe some threshold to decide whether to
+ * flatten or not??
+ */
+ if (is_meta(instr)) {
+ if (instr->opc == OPC_META_PHI) {
+ struct ir3_register *cond, *t, *f;
+
+ cond = unwrap(instr->regs[1]);
+ t = unwrap(instr->regs[2]); /* true val */
+ f = unwrap(instr->regs[3]); /* false val */
+
+ /* must have cond, but t or f may be null if only written
+			 * on one side of the if/else (in which case we can just
+ * convert the PHI to a simple move).
+ */
+ assert(cond);
+ assert(t || f);
+
+ if (t && f) {
+ /* convert the PHI instruction to sel.{b16,b32} */
+ instr->category = 3;
+
+ /* instruction type based on dst size: */
+ if (instr->regs[0]->flags & IR3_REG_HALF)
+ instr->opc = OPC_SEL_B16;
+ else
+ instr->opc = OPC_SEL_B32;
+
+ instr->regs[1] = t;
+ instr->regs[2] = cond;
+ instr->regs[3] = f;
+ } else {
+ /* convert to simple mov: */
+ instr->category = 1;
+ instr->cat1.dst_type = TYPE_F32;
+ instr->cat1.src_type = TYPE_F32;
+ instr->regs_count = 2;
+ instr->regs[1] = t ? t : f;
+ }
+
+ ctx->cnt++;
+ } else if ((instr->opc == OPC_META_INPUT) &&
+ (instr->regs_count == 2)) {
+ type_t ftype;
+
+ if (instr->regs[0]->flags & IR3_REG_HALF)
+ ftype = TYPE_F16;
+ else
+ ftype = TYPE_F32;
+
+ /* convert meta:input to mov: */
+ instr->category = 1;
+ instr->cat1.src_type = ftype;
+ instr->cat1.dst_type = ftype;
+ }
+ }
+
+ /* recursively visit children: */
+ for (i = 1; i < instr->regs_count; i++) {
+ struct ir3_register *src = instr->regs[i];
+ if (src->flags & IR3_REG_SSA)
+ ir3_instr_flatten(ctx, src->instr);
+ }
+}
+
+/* return >= 0 is # of phi's flattened, < 0 is error */
+int ir3_block_flatten(struct ir3_block *block)
+{
+ struct ir3_flatten_ctx ctx = {
+ .block = block,
+ };
+ unsigned i;
+
+ ir3_clear_mark(block->shader);
+ for(i = 0; i < block->noutputs; i++)
+ if (block->outputs[i])
+ ir3_instr_flatten(&ctx, block->outputs[i]);
+
+ return ctx.cnt;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
new file mode 100644
index 00000000000..b916dd51393
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -0,0 +1,790 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+
+#include "ir3.h"
+#include "ir3_visitor.h"
+
+/*
+ * Register Assignment:
+ *
+ * NOTE: currently only works on a single basic block.. need to think
+ * about how multiple basic blocks are going to get scheduled. But
+ * I think I want to re-arrange how blocks work, ie. get rid of the
+ * block nesting thing..
+ *
+ * NOTE: we could do register coalescing (eliminate moves) as part of
+ * the RA step.. OTOH I think we need to do scheduling before register
+ * assignment.  And removing a mov affects scheduling (unless we leave
+ * a placeholder nop, which seems lame), so I'm not really sure how
+ * practical it is to do both in a single stage.  But OTOH I'm not
+ * really sure of a sane way for the CP stage to realize when it
+ * cannot remove a mov due to multi-register constraints..
+ *
+ */
+
+struct ir3_ra_ctx {
+ struct ir3_block *block;
+ enum shader_t type;
+ bool half_precision;
+ bool frag_coord;
+ bool frag_face;
+ bool has_samp;
+ int cnt;
+ bool error;
+};
+
+/* sorta ugly way to retrofit half-precision support.. rather than
+ * passing extra param around, just OR in a high bit. All the low
+ * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
+ * will continue to work as long as you don't underflow (and that
+ * would go badly anyways).
+ */
+#define REG_HALF 0x8000
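+
+/* eg. (REG_HALF | 5) would name hr5 during assignment; masking with
+ * ~REG_HALF recovers the plain register number again.
+ */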
+
+struct ir3_ra_assignment {
+ int8_t off; /* offset of instruction dst within range */
+ uint8_t num; /* number of components for the range */
+};
+
+static void ra_assign(struct ir3_ra_ctx *ctx,
+ struct ir3_instruction *assigner, int num);
+static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr);
+
+/*
+ * Register Allocation:
+ */
+
+#define REG(n, wm, f) (struct ir3_register){ \
+ .flags = (f), \
+ .num = (n), \
+ .wrmask = TGSI_WRITEMASK_ ## wm, \
+ }
+
+/* check that the register exists, is a GPR and is not special (a0/p0) */
+static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n)
+{
+ if ((n < instr->regs_count) && reg_gpr(instr->regs[n]))
+ return instr->regs[n];
+ return NULL;
+}
+
+static int output_base(struct ir3_ra_ctx *ctx)
+{
+ /* ugg, for fragment shader we need to have input at r0.x
+	 * (or at least, if there is a way to configure it otherwise,
+	 * I can't see how, because the blob driver always uses r0.x
+	 * (ie. all zeros))
+ */
+ if (ctx->type == SHADER_FRAGMENT) {
+ if (ctx->half_precision)
+ return ctx->frag_face ? 4 : 3;
+ return ctx->frag_coord ? 8 : 4;
+ }
+ return 0;
+}
+
+/* live means read before written */
+static void compute_liveregs(struct ir3_ra_ctx *ctx,
+ struct ir3_instruction *instr, regmask_t *liveregs)
+{
+ struct ir3_block *block = instr->block;
+ regmask_t written;
+ unsigned i, j;
+
+ regmask_init(liveregs);
+ regmask_init(&written);
+
+ for (instr = instr->next; instr; instr = instr->next) {
+ struct ir3_register *r;
+
+ if (is_meta(instr))
+ continue;
+
+ /* check first src's read: */
+ for (j = 1; j < instr->regs_count; j++) {
+ r = reg_check(instr, j);
+ if (r)
+ regmask_set_if_not(liveregs, r, &written);
+ }
+
+ /* then dst written (if assigned already): */
+ if (instr->flags & IR3_INSTR_MARK) {
+ r = reg_check(instr, 0);
+ if (r)
+ regmask_set(&written, r);
+ }
+ }
+
+ /* be sure to account for output registers too: */
+ for (i = 0; i < block->noutputs; i++) {
+ struct ir3_register reg = REG(output_base(ctx) + i, X, 0);
+ regmask_set_if_not(liveregs, &reg, &written);
+ }
+}
+
+/* calculate registers that are clobbered before last use of 'assigner'.
+ * This needs to be done backwards, although it could possibly be
+ * combined into compute_liveregs(). (Ie. compute_liveregs() could
+ * reverse the list, then do this part backwards reversing the list
+ * again back to original order.) Otoh, probably I should try to
+ * construct a proper interference graph instead.
+ *
+ * XXX this needs to follow the same recursion path that is used
+ * to rename/assign registers (ie. ra_assign_src()).. this is a bit
+ * ugly right now, maybe refactor into some sort of node iterator
+ * that iterates nodes in the correct order?
+ */
+static bool compute_clobbers(struct ir3_ra_ctx *ctx,
+ struct ir3_instruction *instr, struct ir3_instruction *assigner,
+ regmask_t *liveregs)
+{
+ unsigned i;
+ bool live = false, was_live = false;
+
+ if (instr == NULL) {
+ struct ir3_block *block = ctx->block;
+
+ /* if at the end, check outputs: */
+ for (i = 0; i < block->noutputs; i++)
+ if (block->outputs[i] == assigner)
+ return true;
+ return false;
+ }
+
+ for (i = 1; i < instr->regs_count; i++) {
+ struct ir3_register *reg = instr->regs[i];
+ if ((reg->flags & IR3_REG_SSA) && (reg->instr == assigner)) {
+ if (is_meta(instr)) {
+ switch (instr->opc) {
+ case OPC_META_INPUT:
+ // TODO
+ assert(0);
+ break;
+ case OPC_META_FO:
+ case OPC_META_FI:
+ was_live |= compute_clobbers(ctx, instr->next,
+ instr, liveregs);
+ break;
+ default:
+ break;
+ }
+ }
+ live = true;
+ break;
+ }
+ }
+
+ was_live |= compute_clobbers(ctx, instr->next, assigner, liveregs);
+
+ if (was_live && (instr->regs_count > 0) &&
+ (instr->flags & IR3_INSTR_MARK) &&
+ !is_meta(instr))
+ regmask_set(liveregs, instr->regs[0]);
+
+ return live || was_live;
+}
+
+static int find_available(regmask_t *liveregs, int size, bool half)
+{
+ unsigned i;
+ unsigned f = half ? IR3_REG_HALF : 0;
+ for (i = 0; i < MAX_REG - size; i++) {
+ if (!regmask_get(liveregs, &REG(i, X, f))) {
+ unsigned start = i++;
+ for (; (i < MAX_REG) && ((i - start) < size); i++)
+ if (regmask_get(liveregs, &REG(i, X, f)))
+ break;
+ if ((i - start) >= size)
+ return start;
+ }
+ }
+ assert(0);
+ return -1;
+}
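+
+/* eg. (illustrative) with r0 and r2 live and size == 2, the scan in
+ * find_available() rejects the single-register gap at r1 and returns
+ * 3, the start of the first free run r3..r4.
+ */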
+
+static int alloc_block(struct ir3_ra_ctx *ctx,
+ struct ir3_instruction *instr, int size)
+{
+ if (!instr) {
+ /* special case, allocating shader outputs. At this
+ * point, nothing is allocated, just start the shader
+ * outputs at r0.x and let compute_liveregs() take
+ * care of the rest from here:
+ */
+ return 0;
+ } else {
+ struct ir3_register *dst = instr->regs[0];
+ regmask_t liveregs;
+
+ compute_liveregs(ctx, instr, &liveregs);
+
+ // XXX XXX XXX XXX XXX XXX XXX XXX XXX
+ // XXX hack.. maybe ra_calc should give us a list of
+ // instrs to compute_clobbers() on?
+ if (is_meta(instr) && (instr->opc == OPC_META_INPUT) &&
+ (instr->regs_count == 1)) {
+ unsigned i, base = instr->regs[0]->num & ~0x3;
+ for (i = 0; i < 4; i++) {
+ struct ir3_instruction *in = ctx->block->inputs[base + i];
+ if (in)
+ compute_clobbers(ctx, in->next, in, &liveregs);
+ }
+ } else
+ // XXX XXX XXX XXX XXX XXX XXX XXX XXX
+ compute_clobbers(ctx, instr->next, instr, &liveregs);
+
+ return find_available(&liveregs, size,
+ !!(dst->flags & IR3_REG_HALF));
+ }
+}
+
+/*
+ * Constraint Calculation:
+ */
+
+struct ra_calc_visitor {
+ struct ir3_visitor base;
+ struct ir3_ra_assignment a;
+};
+
+static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v)
+{
+ return (struct ra_calc_visitor *)v;
+}
+
+/* calculate register assignment for the instruction. If the register
+ * written by this instruction is required to be part of a range, to
+ * handle other (input/output/sam/bary.f/etc) contiguous register range
+ * constraints, that is calculated here.
+ */
+static void ra_calc_dst(struct ir3_visitor *v,
+ struct ir3_instruction *instr, struct ir3_register *reg)
+{
+ struct ra_calc_visitor *c = ra_calc_visitor(v);
+ if (is_tex(instr)) {
+ c->a.off = 0;
+ c->a.num = 4;
+ } else {
+ c->a.off = 0;
+ c->a.num = 1;
+ }
+}
+
+static void
+ra_calc_dst_shader_input(struct ir3_visitor *v,
+ struct ir3_instruction *instr, struct ir3_register *reg)
+{
+ struct ra_calc_visitor *c = ra_calc_visitor(v);
+ struct ir3_block *block = instr->block;
+ struct ir3_register *dst = instr->regs[0];
+ unsigned base = dst->num & ~0x3;
+ unsigned i, num = 0;
+
+ assert(!(dst->flags & IR3_REG_IA));
+
+ /* check what input components we need: */
+ for (i = 0; i < 4; i++) {
+ unsigned idx = base + i;
+ if ((idx < block->ninputs) && block->inputs[idx])
+ num = i + 1;
+ }
+
+ c->a.off = dst->num - base;
+ c->a.num = num;
+}
+
+static void ra_calc_src_fanin(struct ir3_visitor *v,
+ struct ir3_instruction *instr, struct ir3_register *reg)
+{
+ struct ra_calc_visitor *c = ra_calc_visitor(v);
+ unsigned srcn = ir3_instr_regno(instr, reg) - 1;
+ c->a.off += srcn;
+ c->a.num += srcn;
+ c->a.num = MAX2(c->a.num, instr->regs_count - 1);
+}
+
+static const struct ir3_visitor_funcs calc_visitor_funcs = {
+ .instr = ir3_visit_instr,
+ .dst_shader_input = ra_calc_dst_shader_input,
+ .dst_fanout = ra_calc_dst,
+ .dst_fanin = ra_calc_dst,
+ .dst = ra_calc_dst,
+ .src_fanout = ir3_visit_reg,
+ .src_fanin = ra_calc_src_fanin,
+ .src = ir3_visit_reg,
+};
+
+static struct ir3_ra_assignment ra_calc(struct ir3_instruction *assigner)
+{
+ struct ra_calc_visitor v = {
+ .base.funcs = &calc_visitor_funcs,
+ };
+
+ ir3_visit_instr(&v.base, assigner);
+
+ return v.a;
+}
+
+/*
+ * Register Assignment:
+ */
+
+struct ra_assign_visitor {
+ struct ir3_visitor base;
+ struct ir3_ra_ctx *ctx;
+ int num;
+};
+
+static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
+{
+ return (struct ra_assign_visitor *)v;
+}
+
+static type_t half_type(type_t type)
+{
+ switch (type) {
+ case TYPE_F32: return TYPE_F16;
+ case TYPE_U32: return TYPE_U16;
+ case TYPE_S32: return TYPE_S16;
+ /* instructions may already be fixed up: */
+ case TYPE_F16:
+ case TYPE_U16:
+ case TYPE_S16:
+ return type;
+ default:
+ assert(0);
+ return ~0;
+ }
+}
+
+/* some instructions need fix-up if dst register is half precision: */
+static void fixup_half_instr_dst(struct ir3_instruction *instr)
+{
+ switch (instr->category) {
+ case 1: /* move instructions */
+ instr->cat1.dst_type = half_type(instr->cat1.dst_type);
+ break;
+ case 3:
+ switch (instr->opc) {
+ case OPC_MAD_F32:
+ instr->opc = OPC_MAD_F16;
+ break;
+ case OPC_SEL_B32:
+ instr->opc = OPC_SEL_B16;
+ break;
+ case OPC_SEL_S32:
+ instr->opc = OPC_SEL_S16;
+ break;
+ case OPC_SEL_F32:
+ instr->opc = OPC_SEL_F16;
+ break;
+ case OPC_SAD_S32:
+ instr->opc = OPC_SAD_S16;
+ break;
+ /* instructions may already be fixed up: */
+ case OPC_MAD_F16:
+ case OPC_SEL_B16:
+ case OPC_SEL_S16:
+ case OPC_SEL_F16:
+ case OPC_SAD_S16:
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ break;
+ case 5:
+ instr->cat5.type = half_type(instr->cat5.type);
+ break;
+ }
+}
+
+/* some instructions need fix-up if src register is half precision: */
+static void fixup_half_instr_src(struct ir3_instruction *instr)
+{
+ switch (instr->category) {
+ case 1: /* move instructions */
+ instr->cat1.src_type = half_type(instr->cat1.src_type);
+ break;
+ }
+}
+
+static void ra_assign_reg(struct ir3_visitor *v,
+ struct ir3_instruction *instr, struct ir3_register *reg)
+{
+ struct ra_assign_visitor *a = ra_assign_visitor(v);
+
+ if (is_flow(instr) && (instr->opc == OPC_KILL))
+ return;
+
+ reg->flags &= ~IR3_REG_SSA;
+ reg->num = a->num & ~REG_HALF;
+
+ assert(reg->num >= 0);
+
+ if (a->num & REG_HALF) {
+ reg->flags |= IR3_REG_HALF;
+ /* if dst reg being assigned, patch up the instr: */
+ if (reg == instr->regs[0])
+ fixup_half_instr_dst(instr);
+ else
+ fixup_half_instr_src(instr);
+ }
+}
+
+static void ra_assign_dst_shader_input(struct ir3_visitor *v,
+ struct ir3_instruction *instr, struct ir3_register *reg)
+{
+ struct ra_assign_visitor *a = ra_assign_visitor(v);
+ unsigned i, base = reg->num & ~0x3;
+ int off = base - reg->num;
+
+ ra_assign_reg(v, instr, reg);
+ reg->flags |= IR3_REG_IA;
+
+ /* trigger assignment of all our companion input components: */
+ for (i = 0; i < 4; i++) {
+ struct ir3_instruction *in = instr->block->inputs[i+base];
+ if (in && is_meta(in) && (in->opc == OPC_META_INPUT))
+ ra_assign(a->ctx, in, a->num + off + i);
+ }
+}
+
+static void ra_assign_dst_fanout(struct ir3_visitor *v,
+ struct ir3_instruction *instr, struct ir3_register *reg)
+{
+ struct ra_assign_visitor *a = ra_assign_visitor(v);
+ struct ir3_register *src = instr->regs[1];
+ ra_assign_reg(v, instr, reg);
+ if (src->flags & IR3_REG_SSA)
+ ra_assign(a->ctx, src->instr, a->num - instr->fo.off);
+}
+
+static void ra_assign_src_fanout(struct ir3_visitor *v,
+ struct ir3_instruction *instr, struct ir3_register *reg)
+{
+ struct ra_assign_visitor *a = ra_assign_visitor(v);
+ ra_assign_reg(v, instr, reg);
+ ra_assign(a->ctx, instr, a->num + instr->fo.off);
+}
+
+static void ra_assign_src_fanin(struct ir3_visitor *v,
+ struct ir3_instruction *instr, struct ir3_register *reg)
+{
+ struct ra_assign_visitor *a = ra_assign_visitor(v);
+ unsigned j, srcn = ir3_instr_regno(instr, reg) - 1;
+ ra_assign_reg(v, instr, reg);
+ ra_assign(a->ctx, instr, a->num - srcn);
+ for (j = 1; j < instr->regs_count; j++) {
+ struct ir3_register *reg = instr->regs[j];
+ if (reg->flags & IR3_REG_SSA) /* could be renamed already */
+ ra_assign(a->ctx, reg->instr, a->num - srcn + j - 1);
+ }
+}
+
+static const struct ir3_visitor_funcs assign_visitor_funcs = {
+ .instr = ir3_visit_instr,
+ .dst_shader_input = ra_assign_dst_shader_input,
+ .dst_fanout = ra_assign_dst_fanout,
+ .dst_fanin = ra_assign_reg,
+ .dst = ra_assign_reg,
+ .src_fanout = ra_assign_src_fanout,
+ .src_fanin = ra_assign_src_fanin,
+ .src = ra_assign_reg,
+};
+
+static void ra_assign(struct ir3_ra_ctx *ctx,
+ struct ir3_instruction *assigner, int num)
+{
+ struct ra_assign_visitor v = {
+ .base.funcs = &assign_visitor_funcs,
+ .ctx = ctx,
+ .num = num,
+ };
+
+ /* if we've already visited this instruction, bail now: */
+ if (ir3_instr_check_mark(assigner)) {
+ debug_assert(assigner->regs[0]->num == (num & ~REG_HALF));
+ if (assigner->regs[0]->num != (num & ~REG_HALF)) {
+ /* impossible situation, should have been resolved
+ * at an earlier stage by inserting extra mov's:
+ */
+ ctx->error = true;
+ }
+ return;
+ }
+
+ ir3_visit_instr(&v.base, assigner);
+}
+
+/*
+ *
+ */
+
+static void ir3_instr_ra(struct ir3_ra_ctx *ctx,
+ struct ir3_instruction *instr)
+{
+ struct ir3_register *dst;
+ unsigned num;
+
+ /* skip over nop's */
+ if (instr->regs_count == 0)
+ return;
+
+ dst = instr->regs[0];
+
+ /* if we've already visited this instruction, bail now: */
+ if (instr->flags & IR3_INSTR_MARK)
+ return;
+
+ /* allocate register(s): */
+ if (is_addr(instr)) {
+ num = instr->regs[2]->num;
+ } else if (reg_gpr(dst)) {
+ struct ir3_ra_assignment a;
+ a = ra_calc(instr);
+ num = alloc_block(ctx, instr, a.num) + a.off;
+ } else if (dst->flags & IR3_REG_ADDR) {
+ dst->flags &= ~IR3_REG_ADDR;
+ num = regid(REG_A0, 0) | REG_HALF;
+ } else {
+ /* predicate register (p0).. etc */
+ return;
+ }
+
+ ra_assign(ctx, instr, num);
+}
+
+/* flatten the block's instruction list into the shader, inserting the
+ * required sync flags and nop's along the way:
+ */
+// XXX this should probably be somewhere else:
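+/* A sketch of the sync rules encoded below: an instruction consuming an
+ * sfu result gets (ss), one consuming a tex fetch result gets (sy), and
+ * a write to a reg which a still-in-flight tex/sfu might yet read gets
+ * (ss) for the write-after-read hazard.
+ */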
+static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+ struct ir3_instruction *n;
+ struct ir3 *shader = block->shader;
+ struct ir3_instruction *end =
+ ir3_instr_create(block, 0, OPC_END);
+ struct ir3_instruction *last_input = NULL;
+ struct ir3_instruction *last_rel = NULL;
+ regmask_t needs_ss_war; /* write after read */
+ regmask_t needs_ss;
+ regmask_t needs_sy;
+
+ regmask_init(&needs_ss_war);
+ regmask_init(&needs_ss);
+ regmask_init(&needs_sy);
+
+ shader->instrs_count = 0;
+
+ for (n = block->head; n; n = n->next) {
+ struct ir3_register *reg;
+ unsigned i;
+
+ if (is_meta(n))
+ continue;
+
+ for (i = 1; i < n->regs_count; i++) {
+ reg = n->regs[i];
+
+ if (reg_gpr(reg)) {
+
+ /* TODO: we probably only need (ss) for alu
+ * instr consuming sfu result.. need to make
+ * some tests for both this and (sy)..
+ */
+ if (regmask_get(&needs_ss, reg)) {
+ n->flags |= IR3_INSTR_SS;
+ regmask_init(&needs_ss);
+ }
+
+ if (regmask_get(&needs_sy, reg)) {
+ n->flags |= IR3_INSTR_SY;
+ regmask_init(&needs_sy);
+ }
+ }
+
+ /* TODO: is it valid to have address reg loaded from a
+ * relative src (ie. mova a0, c<a0.x+4>)? If so, the
+ * last_rel check below should be moved ahead of this:
+ */
+ if (reg->flags & IR3_REG_RELATIV)
+ last_rel = n;
+ }
+
+ if (n->regs_count > 0) {
+ reg = n->regs[0];
+ if (regmask_get(&needs_ss_war, reg)) {
+ n->flags |= IR3_INSTR_SS;
+ regmask_init(&needs_ss_war); /* the (ss) presumably clears all pending WAR hazards */
+ }
+
+ if (last_rel && (reg->num == regid(REG_A0, 0))) {
+ last_rel->flags |= IR3_INSTR_UL;
+ last_rel = NULL;
+ }
+ }
+
+ /* cat5+ does not have an (ss) bit, if needed we need to
+ * insert a nop to carry the sync flag. Would be kinda
+ * clever if we were aware of this during scheduling, but
+ * this should be a pretty rare case:
+ */
+ if ((n->flags & IR3_INSTR_SS) && (n->category >= 5)) {
+ struct ir3_instruction *nop;
+ nop = ir3_instr_create(block, 0, OPC_NOP);
+ nop->flags |= IR3_INSTR_SS;
+ n->flags &= ~IR3_INSTR_SS;
+ }
+
+ /* need to be able to set (ss) on first instruction: */
+ if ((shader->instrs_count == 0) && (n->category >= 5))
+ ir3_instr_create(block, 0, OPC_NOP);
+
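+ /* fold successive nop's into the previous nop's repeat count
+ * ((rpt)N, up to rpt5), so a run of nop's only costs a single
+ * instruction slot:
+ */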
+ if (is_nop(n) && shader->instrs_count) {
+ struct ir3_instruction *last =
+ shader->instrs[shader->instrs_count-1];
+ if (is_nop(last) && (last->repeat < 5)) {
+ last->repeat++;
+ last->flags |= n->flags;
+ continue;
+ }
+ }
+
+ shader->instrs[shader->instrs_count++] = n;
+
+ if (is_sfu(n))
+ regmask_set(&needs_ss, n->regs[0]);
+
+ if (is_tex(n)) {
+ /* this ends up being the # of samp instructions.. but that
+ * is ok, everything else only cares whether it is zero or
+ * not. We do this here, rather than when we encounter a
+ * SAMP decl, because (especially in binning pass shader)
+ * the samp instruction(s) could get eliminated if the
+ * result is not used.
+ */
+ ctx->has_samp = true;
+ regmask_set(&needs_sy, n->regs[0]);
+ }
+
+ /* both tex/sfu appear to not always immediately consume
+ * their src register(s):
+ */
+ if (is_tex(n) || is_sfu(n)) {
+ for (i = 1; i < n->regs_count; i++) {
+ reg = n->regs[i];
+ if (reg_gpr(reg))
+ regmask_set(&needs_ss_war, reg);
+ }
+ }
+
+ if (is_input(n))
+ last_input = n;
+ }
+
+ if (last_input)
+ last_input->regs[0]->flags |= IR3_REG_EI;
+
+ if (last_rel)
+ last_rel->flags |= IR3_INSTR_UL;
+
+ shader->instrs[shader->instrs_count++] = end;
+
+ shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+}
+
+static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+ struct ir3_instruction *n;
+
+ if (!block->parent) {
+ unsigned i, j;
+ int base, off = output_base(ctx);
+
+ base = alloc_block(ctx, NULL, block->noutputs + off);
+
+ if (ctx->half_precision)
+ base |= REG_HALF;
+
+ for (i = 0; i < block->noutputs; i++)
+ if (block->outputs[i] && !is_kill(block->outputs[i]))
+ ra_assign(ctx, block->outputs[i], base + i + off);
+
+ if (ctx->type == SHADER_FRAGMENT) {
+ i = 0;
+ if (ctx->frag_face) {
+ /* if we have frag_face, it gets hr0.x */
+ ra_assign(ctx, block->inputs[i], REG_HALF | 0);
+ i += 4;
+ }
+ for (j = 0; i < block->ninputs; i++, j++)
+ if (block->inputs[i])
+ ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + j);
+ } else {
+ for (i = 0; i < block->ninputs; i++)
+ if (block->inputs[i])
+ ir3_instr_ra(ctx, block->inputs[i]);
+ }
+ }
+
+ /* then loop over instruction list and assign registers:
+ */
+ n = block->head;
+ while (n) {
+ ir3_instr_ra(ctx, n);
+ if (ctx->error)
+ return -1;
+ n = n->next;
+ }
+
+ legalize(ctx, block);
+
+ return 0;
+}
+
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+ bool half_precision, bool frag_coord, bool frag_face,
+ bool *has_samp)
+{
+ struct ir3_ra_ctx ctx = {
+ .block = block,
+ .type = type,
+ .half_precision = half_precision,
+ .frag_coord = frag_coord,
+ .frag_face = frag_face,
+ };
+ int ret;
+
+ ir3_clear_mark(block->shader);
+ ret = block_ra(&ctx, block);
+ *has_samp = ctx.has_samp;
+
+ return ret;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
new file mode 100644
index 00000000000..3ef67731926
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -0,0 +1,401 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+enum {
+ SCHEDULED = -1,
+ DELAYED = -2,
+};
+
+/*
+ * Instruction Scheduling:
+ *
+ * Using the depth-sorted list from the depth pass, attempt to
+ * recursively schedule the deepest unscheduled path.  The first
+ * instruction that cannot be scheduled returns the number of delay
+ * slots it still requires, at which point we return back up to the
+ * top and attempt to schedule the next highest depth.  After a
+ * sufficient number of instructions have been scheduled, return back
+ * to the beginning of the list and start again.  If we reach the end
+ * of the depth-sorted list without being able to insert any
+ * instruction, insert nop's.  Repeat until there are no more
+ * unscheduled instructions.
+ *
+ * There are a few special cases that need to be handled, since sched
+ * is currently independent of register allocation.  Usages of the
+ * address register (a0.x) or predicate register (p0.x) must be
+ * serialized, ie. if you have two pairs of instructions that write
+ * and then read the same special register, those pairs cannot be
+ * interleaved.  To solve this, when we are in such a scheduling
+ * "critical section" and we encounter a conflicting write to a
+ * special register, we try to schedule any remaining instructions
+ * that use that value first.
+ */
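+
+/* A sketch of the delay arithmetic (hypothetical instructions): say
+ * "add.f r0.x, ..." has been scheduled, followed by one unrelated alu
+ * instruction, and we now consider a consumer of r0.x for which
+ * ir3_delayslots() reports three slots.  distance() counts the one
+ * intervening instruction, so delay_calc() returns 3 - 1 = 2, ie. two
+ * more cycles to fill with other instructions or nop's.
+ */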
+
+struct ir3_sched_ctx {
+ struct ir3_instruction *scheduled; /* last scheduled instr */
+ struct ir3_instruction *addr; /* current a0.x user, if any */
+ struct ir3_instruction *pred; /* current p0.x user, if any */
+ unsigned cnt;
+};
+
+static struct ir3_instruction *
+deepest(struct ir3_instruction **srcs, unsigned nsrcs)
+{
+ struct ir3_instruction *d = NULL;
+ unsigned i = 0, id = 0;
+
+ while ((i < nsrcs) && !(d = srcs[id = i]))
+ i++;
+
+ if (!d)
+ return NULL;
+
+ for (; i < nsrcs; i++)
+ if (srcs[i] && (srcs[i]->depth > d->depth))
+ d = srcs[id = i];
+
+ srcs[id] = NULL;
+
+ return d;
+}
+
+static unsigned distance(struct ir3_sched_ctx *ctx,
+ struct ir3_instruction *instr, unsigned maxd)
+{
+ struct ir3_instruction *n = ctx->scheduled;
+ unsigned d = 0;
+ while (n && (n != instr) && (d < maxd)) {
+ if (is_alu(n) || is_flow(n))
+ d++;
+ n = n->next;
+ }
+ return d;
+}
+
+/* TODO maybe we want a doubly-linked list? */
+static struct ir3_instruction * prev(struct ir3_instruction *instr)
+{
+ struct ir3_instruction *p = instr->block->head;
+ while (p && (p->next != instr))
+ p = p->next;
+ return p;
+}
+
+static void schedule(struct ir3_sched_ctx *ctx,
+ struct ir3_instruction *instr, bool remove)
+{
+ struct ir3_block *block = instr->block;
+
+ /* maybe there is a better way to handle this than just stuffing
+ * a nop.. ideally we'd know about this constraint in the
+ * scheduling and depth calculation..
+ */
+ if (ctx->scheduled && is_sfu(ctx->scheduled) && is_sfu(instr))
+ schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
+
+ /* remove from depth list:
+ */
+ if (remove) {
+ struct ir3_instruction *p = prev(instr);
+
+ /* NOTE: this can happen for inputs which are not
+ * read.. in that case there is no need to schedule
+ * the input, so just bail:
+ */
+ if (instr != (p ? p->next : block->head))
+ return;
+
+ if (p)
+ p->next = instr->next;
+ else
+ block->head = instr->next;
+ }
+
+ if (writes_addr(instr)) {
+ assert(ctx->addr == NULL);
+ ctx->addr = instr;
+ }
+
+ if (writes_pred(instr)) {
+ assert(ctx->pred == NULL);
+ ctx->pred = instr;
+ }
+
+ instr->flags |= IR3_INSTR_MARK;
+
+ instr->next = ctx->scheduled;
+ ctx->scheduled = instr;
+
+ ctx->cnt++;
+}
+
+/*
+ * Delay-slot calculation. Follows fanin/fanout.
+ */
+
+static unsigned delay_calc2(struct ir3_sched_ctx *ctx,
+ struct ir3_instruction *assigner,
+ struct ir3_instruction *consumer, unsigned srcn)
+{
+ unsigned delay = 0;
+
+ if (is_meta(assigner)) {
+ unsigned i;
+ for (i = 1; i < assigner->regs_count; i++) {
+ struct ir3_register *reg = assigner->regs[i];
+ if (reg->flags & IR3_REG_SSA) {
+ unsigned d = delay_calc2(ctx, reg->instr,
+ consumer, srcn);
+ delay = MAX2(delay, d);
+ }
+ }
+ } else {
+ delay = ir3_delayslots(assigner, consumer, srcn);
+ delay -= distance(ctx, assigner, delay);
+ }
+
+ return delay;
+}
+
+static unsigned delay_calc(struct ir3_sched_ctx *ctx,
+ struct ir3_instruction *instr)
+{
+ unsigned i, delay = 0;
+
+ for (i = 1; i < instr->regs_count; i++) {
+ struct ir3_register *reg = instr->regs[i];
+ if (reg->flags & IR3_REG_SSA) {
+ unsigned d = delay_calc2(ctx, reg->instr,
+ instr, i - 1);
+ delay = MAX2(delay, d);
+ }
+ }
+
+ return delay;
+}
+
+/* A negative return value signals that an instruction has been newly
+ * scheduled (or deferred): return back up to the top of the stack, to
+ * block_sched().
+ */
+static int trysched(struct ir3_sched_ctx *ctx,
+ struct ir3_instruction *instr)
+{
+ struct ir3_instruction *srcs[ARRAY_SIZE(instr->regs) - 1];
+ struct ir3_instruction *src;
+ unsigned i, delay, nsrcs = 0;
+
+ /* if already scheduled: */
+ if (instr->flags & IR3_INSTR_MARK)
+ return 0;
+
+ /* figure out our src's: */
+ for (i = 1; i < instr->regs_count; i++) {
+ struct ir3_register *reg = instr->regs[i];
+ if (reg->flags & IR3_REG_SSA)
+ srcs[nsrcs++] = reg->instr;
+ }
+
+ /* try to schedule each src instruction, in order of decreasing
+ * depth:
+ */
+ delay = 0;
+ while ((src = deepest(srcs, nsrcs))) {
+ delay = trysched(ctx, src);
+ if (delay)
+ return delay;
+ }
+
+ /* all of our dependencies are scheduled, figure out if we have
+ * enough delay slots to schedule ourself:
+ */
+ delay = delay_calc(ctx, instr);
+ if (delay)
+ return delay;
+
+ /* if this is a write to address/predicate register, and that
+ * register is currently in use, we need to defer until it is
+ * free:
+ */
+ if (writes_addr(instr) && ctx->addr) {
+ assert(ctx->addr != instr);
+ return DELAYED;
+ }
+ if (writes_pred(instr) && ctx->pred) {
+ assert(ctx->pred != instr);
+ return DELAYED;
+ }
+
+ schedule(ctx, instr, true);
+ return SCHEDULED;
+}
+
+static struct ir3_instruction * reverse(struct ir3_instruction *instr)
+{
+ struct ir3_instruction *reversed = NULL;
+ while (instr) {
+ struct ir3_instruction *next = instr->next;
+ instr->next = reversed;
+ reversed = instr;
+ instr = next;
+ }
+ return reversed;
+}
+
+static bool uses_current_addr(struct ir3_sched_ctx *ctx,
+ struct ir3_instruction *instr)
+{
+ unsigned i;
+ for (i = 1; i < instr->regs_count; i++) {
+ struct ir3_register *reg = instr->regs[i];
+ if (reg->flags & IR3_REG_SSA) {
+ if (is_addr(reg->instr)) {
+ struct ir3_instruction *addr;
+ addr = reg->instr->regs[1]->instr; /* the mova */
+ if (ctx->addr == addr)
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+static bool uses_current_pred(struct ir3_sched_ctx *ctx,
+ struct ir3_instruction *instr)
+{
+ unsigned i;
+ for (i = 1; i < instr->regs_count; i++) {
+ struct ir3_register *reg = instr->regs[i];
+ if ((reg->flags & IR3_REG_SSA) && (ctx->pred == reg->instr))
+ return true;
+ }
+ return false;
+}
+
+/* when we encounter an instruction that writes to the address register
+ * while that register is still in use, we delay the write and try to
+ * schedule all other instructions using the current address register
+ * value first:
+ */
+static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
+ struct ir3_block *block)
+{
+ struct ir3_instruction *instr = block->head;
+ bool addr_in_use = false;
+ bool pred_in_use = false;
+ unsigned cnt = ~0;
+
+ while (instr) {
+ struct ir3_instruction *next = instr->next;
+ bool addr = uses_current_addr(ctx, instr);
+ bool pred = uses_current_pred(ctx, instr);
+
+ if (addr || pred) {
+ int ret = trysched(ctx, instr);
+ if (ret == SCHEDULED)
+ cnt = 0;
+ else if (ret > 0)
+ cnt = MIN2(cnt, ret);
+ if (addr)
+ addr_in_use = true;
+ if (pred)
+ pred_in_use = true;
+ }
+
+ instr = next;
+ }
+
+ if (!addr_in_use)
+ ctx->addr = NULL;
+
+ if (!pred_in_use)
+ ctx->pred = NULL;
+
+ return cnt;
+}
+
+static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+ struct ir3_instruction *instr;
+
+ /* schedule all of the shader inputs (meta-instr) first so that
+ * the RA step sees that the input registers contain a value
+ * from the start of the shader:
+ */
+ if (!block->parent) {
+ unsigned i;
+ for (i = 0; i < block->ninputs; i++) {
+ struct ir3_instruction *in = block->inputs[i];
+ if (in)
+ schedule(ctx, in, true);
+ }
+ }
+
+ while ((instr = block->head)) {
+ /* NOTE: always grab next *before* trysched(), in case the
+ * instruction is actually scheduled (and therefore moved
+ * from depth list into scheduled list)
+ */
+ struct ir3_instruction *next = instr->next;
+ int cnt = trysched(ctx, instr);
+
+ if (cnt == DELAYED)
+ cnt = block_sched_undelayed(ctx, block);
+
+ /* -1 (SCHEDULED) signals to return up the stack, but to us it means the same as 0: */
+ cnt = MAX2(0, cnt);
+ cnt += ctx->cnt;
+ instr = next;
+
+ /* if deepest remaining instruction cannot be scheduled, try
+ * the increasingly more shallow instructions until needed
+ * number of delay slots is filled:
+ */
+ while (instr && (cnt > ctx->cnt)) {
+ next = instr->next;
+ trysched(ctx, instr);
+ instr = next;
+ }
+
+ /* and if we run out of instructions that can be scheduled,
+ * then it is time for nop's:
+ */
+ while (cnt > ctx->cnt)
+ schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
+ }
+
+ /* at this point, scheduled list is in reverse order, so fix that: */
+ block->head = reverse(ctx->scheduled);
+}
+
+void ir3_block_sched(struct ir3_block *block)
+{
+ struct ir3_sched_ctx ctx = {0};
+ ir3_clear_mark(block->shader);
+ block_sched(&ctx, block);
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
new file mode 100644
index 00000000000..ddf99dbc46e
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -0,0 +1,211 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#include "pipe/p_state.h"
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "freedreno_context.h"
+#include "freedreno_lowering.h"
+#include "freedreno_util.h"
+
+#include "ir3_shader.h"
+#include "ir3_compiler.h"
+
+
+static void
+delete_variant(struct ir3_shader_variant *v)
+{
+ ir3_destroy(v->ir);
+ fd_bo_del(v->bo);
+ free(v);
+}
+
+static void
+assemble_variant(struct ir3_shader_variant *v)
+{
+ struct fd_context *ctx = fd_context(v->shader->pctx);
+ uint32_t sz, *bin;
+
+ bin = ir3_assemble(v->ir, &v->info);
+ sz = v->info.sizedwords * 4;
+
+ v->bo = fd_bo_new(ctx->dev, sz,
+ DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
+ DRM_FREEDRENO_GEM_TYPE_KMEM);
+
+ memcpy(fd_bo_map(v->bo), bin, sz);
+
+ free(bin);
+
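+ /* instrlen is in units of instruction groups (4 instructions, ie.
+ * 8 dwords), and constlen in units of vec4's; max_const appears to
+ * be the highest const (vec4) index referenced, hence the +1:
+ */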
+ v->instrlen = v->info.sizedwords / 8;
+ v->constlen = v->info.max_const + 1;
+}
+
+/* for vertex shaders, the inputs are loaded into registers before the
+ * shader is executed, so max_reg from the shader instructions might
+ * not properly reflect the # of registers actually used:
+ */
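+/* For example (hypothetical numbers): an input at scalar regid 4
+ * (r1.x) may be loaded as a full vec4 touching r1.x..r1.w, so the last
+ * vec4 register written is (4 + 3) >> 2 == 1, which is what gets
+ * folded into max_reg below:
+ */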
+static void
+fixup_vp_regfootprint(struct ir3_shader_variant *v)
+{
+ unsigned i;
+ for (i = 0; i < v->inputs_count; i++) {
+ if (v->inputs[i].compmask) {
+ uint32_t regid = (v->inputs[i].regid + 3) >> 2;
+ v->info.max_reg = MAX2(v->info.max_reg, regid);
+ }
+ }
+ for (i = 0; i < v->outputs_count; i++) {
+ uint32_t regid = (v->outputs[i].regid + 3) >> 2;
+ v->info.max_reg = MAX2(v->info.max_reg, regid);
+ }
+}
+
+static struct ir3_shader_variant *
+create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
+{
+ struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant);
+ const struct tgsi_token *tokens = shader->tokens;
+ int ret;
+
+ if (!v)
+ return NULL;
+
+ v->shader = shader;
+ v->key = key;
+ v->type = shader->type;
+
+ if (fd_mesa_debug & FD_DBG_DISASM) {
+ DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", shader->type,
+ key.binning_pass, key.color_two_side, key.half_precision);
+ tgsi_dump(tokens, 0);
+ }
+
+ if (!(fd_mesa_debug & FD_DBG_NOOPT)) {
+ ret = ir3_compile_shader(v, tokens, key);
+ if (ret) {
+ debug_error("new compiler failed, trying fallback!");
+
+ v->inputs_count = 0;
+ v->outputs_count = 0;
+ v->total_in = 0;
+ v->has_samp = false;
+ v->immediates_count = 0;
+ }
+ } else {
+ ret = -1; /* force fallback to old compiler */
+ }
+
+ if (ret)
+ ret = ir3_compile_shader_old(v, tokens, key);
+
+ if (ret) {
+ debug_error("compile failed!");
+ goto fail;
+ }
+
+ assemble_variant(v);
+ if (!v->bo) {
+ debug_error("assemble failed!");
+ goto fail;
+ }
+
+ if (shader->type == SHADER_VERTEX)
+ fixup_vp_regfootprint(v);
+
+ if (fd_mesa_debug & FD_DBG_DISASM) {
+ DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
+ key.binning_pass, key.color_two_side, key.half_precision);
+ disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type);
+ }
+
+ return v;
+
+fail:
+ delete_variant(v);
+ return NULL;
+}
+
+struct ir3_shader_variant *
+ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key)
+{
+ struct ir3_shader_variant *v;
+
+ /* some shader key values only apply to vertex or frag shader,
+ * so normalize the key to avoid constructing multiple identical
+ * variants:
+ */
+ if (shader->type == SHADER_FRAGMENT) {
+ key.binning_pass = false;
+ }
+ if (shader->type == SHADER_VERTEX) {
+ key.color_two_side = false;
+ key.half_precision = false;
+ }
+
+ for (v = shader->variants; v; v = v->next)
+ if (!memcmp(&key, &v->key, sizeof(key)))
+ return v;
+
+ /* not found above, so compile a new variant: */
+ v = create_variant(shader, key);
+ if (v) {
+ v->next = shader->variants;
+ shader->variants = v;
+ }
+
+ return v;
+}
+
+
+void
+ir3_shader_destroy(struct ir3_shader *shader)
+{
+ struct ir3_shader_variant *v, *t;
+ for (v = shader->variants; v; ) {
+ t = v;
+ v = v->next;
+ delete_variant(t);
+ }
+ free((void *)shader->tokens);
+ free(shader);
+}
+
+struct ir3_shader *
+ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens,
+ enum shader_t type)
+{
+ struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
+ shader->pctx = pctx;
+ shader->type = type;
+ shader->tokens = tgsi_dup_tokens(tokens);
+ return shader;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
new file mode 100644
index 00000000000..1a91fcbcb13
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -0,0 +1,163 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#ifndef IR3_SHADER_H_
+#define IR3_SHADER_H_
+
+#include "ir3.h"
+#include "disasm.h"
+
+typedef uint16_t ir3_semantic; /* semantic name + index */
+static inline ir3_semantic
+ir3_semantic_name(uint8_t name, uint16_t index)
+{
+ return (name << 8) | (index & 0xff);
+}
+
+static inline uint8_t sem2name(ir3_semantic sem)
+{
+ return sem >> 8;
+}
+
+static inline uint16_t sem2idx(ir3_semantic sem)
+{
+ return sem & 0xff;
+}
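+
+/* sketch: ir3_semantic_name(TGSI_SEMANTIC_COLOR, 1) packs to
+ * ((TGSI_SEMANTIC_COLOR << 8) | 1), and sem2name()/sem2idx() recover
+ * the two halves.  Note that the index is truncated to 8 bits.
+ */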
+
+/* Configuration key used to identify a shader variant.. different
+ * shader variants can be used to implement features not supported
+ * in hw (eg. two-sided color), binning-pass vertex shader, etc.
+ */
+struct ir3_shader_key {
+ /* vertex shader variant parameters: */
+ unsigned binning_pass : 1;
+
+ /* fragment shader variant parameters: */
+ unsigned color_two_side : 1;
+ unsigned half_precision : 1;
+};
+
+struct ir3_shader_variant {
+ struct fd_bo *bo;
+
+ struct ir3_shader_key key;
+
+ struct ir3_info info;
+ struct ir3 *ir;
+
+ /* the instruction length is in units of instruction groups
+ * (4 instructions, ie. 8 dwords):
+ */
+ unsigned instrlen;
+
+ /* the constants length is in units of vec4's, and is the sum of
+ * the uniforms and the built-in compiler constants
+ */
+ unsigned constlen;
+
+ /* About Linkage:
+ * + Let the frag shader determine the position/compmask for the
+ * varyings, since it is the place where we know if the varying
+ * is actually used, and if so, which components are used. So
+ * what the hw calls "outloc" is taken from the "inloc" of the
+ * frag shader.
+ * + From the vert shader, we only need the output regid
+ */
+
+ /* for frag shader, pos_regid holds the frag_pos, ie. what is passed
+ * to bary.f instructions
+ */
+ uint8_t pos_regid;
+ bool frag_coord, frag_face;
+
+ /* varyings/outputs: */
+ unsigned outputs_count;
+ struct {
+ ir3_semantic semantic;
+ uint8_t regid;
+ } outputs[16 + 2]; /* +POSITION +PSIZE */
+ bool writes_pos, writes_psize;
+
+ /* vertices/inputs: */
+ unsigned inputs_count;
+ struct {
+ ir3_semantic semantic;
+ uint8_t regid;
+ uint8_t compmask;
+ uint8_t ncomp;
+ /* in theory inloc of fs should match outloc of vs: */
+ uint8_t inloc;
+ uint8_t bary;
+ } inputs[16 + 2]; /* +POSITION +FACE */
+
+ unsigned total_in; /* sum of inputs (scalar) */
+
+ /* do we have one or more texture sample instructions: */
+ bool has_samp;
+
+ /* const reg # of first immediate, ie. 1 == c1
+ * (not regid, because TGSI thinks in terms of vec4 registers,
+ * not scalar registers)
+ */
+ unsigned first_immediate;
+ unsigned immediates_count;
+ struct {
+ uint32_t val[4];
+ } immediates[64];
+
+ /* shader variants form a linked list: */
+ struct ir3_shader_variant *next;
+
+ /* replicated here to avoid passing extra ptrs everywhere: */
+ enum shader_t type;
+ struct ir3_shader *shader;
+};
+
+struct ir3_shader {
+ enum shader_t type;
+
+ struct pipe_context *pctx;
+ const struct tgsi_token *tokens;
+
+ struct ir3_shader_variant *variants;
+
+ /* so far, only used for blit_prog shader.. values for
+ * VPC_VARYING_INTERP[i].MODE and VPC_VARYING_PS_REPL[i].MODE
+ */
+ uint32_t vinterp[4], vpsrepl[4];
+};
+
+
+struct ir3_shader * ir3_shader_create(struct pipe_context *pctx,
+ const struct tgsi_token *tokens, enum shader_t type);
+void ir3_shader_destroy(struct ir3_shader *shader);
+
+struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
+ struct ir3_shader_key key);
+
+#endif /* IR3_SHADER_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_visitor.h b/src/gallium/drivers/freedreno/ir3/ir3_visitor.h
new file mode 100644
index 00000000000..1c60d1620ca
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_visitor.h
@@ -0,0 +1,154 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <[email protected]>
+ */
+
+#ifndef IR3_VISITOR_H_
+#define IR3_VISITOR_H_
+
+/**
+ * Visitor which follows dst to src relationships between instructions,
+ * first visiting the dst (writer) instruction, followed by src (reader)
+ * instruction(s).
+ *
+ * TODO maybe we want multiple different visitors to walk the
+ * graph in different ways?
+ */
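+
+/* A minimal usage sketch (hypothetical pass): fill an ir3_visitor_funcs
+ * table with a callback per def/use case, using ir3_visit_reg as the
+ * no-op default for cases of no interest, then start the walk with
+ * ir3_visit_instr() on the defining instruction.  ra_assign() in
+ * ir3_ra.c is a real example of this pattern.
+ */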
+
+struct ir3_visitor;
+
+typedef void (*ir3_visit_instr_func)(struct ir3_visitor *v,
+ struct ir3_instruction *instr);
+
+typedef void (*ir3_visit_reg_func)(struct ir3_visitor *v,
+ struct ir3_instruction *instr, struct ir3_register *reg);
+
+struct ir3_visitor_funcs {
+ ir3_visit_instr_func instr; // TODO do we need??
+
+ ir3_visit_reg_func dst_shader_input;
+ ir3_visit_reg_func dst_block_input;
+ ir3_visit_reg_func dst_fanout;
+ ir3_visit_reg_func dst_fanin;
+ ir3_visit_reg_func dst;
+
+ ir3_visit_reg_func src_block_input;
+ ir3_visit_reg_func src_fanout;
+ ir3_visit_reg_func src_fanin;
+ ir3_visit_reg_func src;
+};
+
+struct ir3_visitor {
+ const struct ir3_visitor_funcs *funcs;
+ bool error;
+};
+
+#include "util/u_debug.h"
+
+static void visit_instr_dst(struct ir3_visitor *v,
+ struct ir3_instruction *instr)
+{
+ struct ir3_register *reg = instr->regs[0];
+
+ if (is_meta(instr)) {
+ switch (instr->opc) {
+ case OPC_META_INPUT:
+ if (instr->regs_count == 1)
+ v->funcs->dst_shader_input(v, instr, reg);
+ else
+ v->funcs->dst_block_input(v, instr, reg);
+ return;
+ case OPC_META_FO:
+ v->funcs->dst_fanout(v, instr, reg);
+ return;
+ case OPC_META_FI:
+ v->funcs->dst_fanin(v, instr, reg);
+ return;
+ default:
+ break;
+ }
+ }
+
+ v->funcs->dst(v, instr, reg);
+}
+
+static void visit_instr_src(struct ir3_visitor *v,
+ struct ir3_instruction *instr, struct ir3_register *reg)
+{
+ if (is_meta(instr)) {
+ switch (instr->opc) {
+ case OPC_META_INPUT:
+ /* shader-input does not have a src, only block input: */
+ debug_assert(instr->regs_count == 2);
+ v->funcs->src_block_input(v, instr, reg);
+ return;
+ case OPC_META_FO:
+ v->funcs->src_fanout(v, instr, reg);
+ return;
+ case OPC_META_FI:
+ v->funcs->src_fanin(v, instr, reg);
+ return;
+ default:
+ break;
+ }
+ }
+
+ v->funcs->src(v, instr, reg);
+}
+
+static void ir3_visit_instr(struct ir3_visitor *v,
+ struct ir3_instruction *instr)
+{
+ struct ir3_instruction *n;
+
+ /* visit instruction that assigns value: */
+ if (instr->regs_count > 0)
+ visit_instr_dst(v, instr);
+
+ /* and of any following instructions which read that value: */
+ n = instr->next;
+ while (n && !v->error) {
+ unsigned i;
+
+ for (i = 1; i < n->regs_count; i++) {
+ struct ir3_register *reg = n->regs[i];
+ if ((reg->flags & IR3_REG_SSA) && (reg->instr == instr))
+ visit_instr_src(v, n, reg);
+ }
+
+ n = n->next;
+ }
+}
+
+static void ir3_visit_reg(struct ir3_visitor *v,
+ struct ir3_instruction *instr, struct ir3_register *reg)
+{
+ /* no-op */
+}
+
+#endif /* IR3_VISITOR_H_ */