author | Rob Clark <[email protected]> | 2014-07-25 11:15:59 -0400
---|---|---
committer | Rob Clark <[email protected]> | 2014-07-25 13:29:28 -0400
commit | db193e5ad06e7a2fbcffb3bb5df85d212eb12291 (patch) |
tree | 58d1ec24c0af7b1acb1477eeaababe3d7eda6019 /src/gallium/drivers/freedreno/ir3 |
parent | 7d7e6ae9c3544ce1889aa9b8a34545c6f42017e7 (diff) |
freedreno/ir3: split out shader compiler from a3xx
Move the bits we want to share between generations from fd3_program to
ir3_shader. So the overall structure is:

    fdN_shader_stateobj -> ir3_shader -> ir3_shader_variant -> ir3
                                      |- ...
                                      \- ir3_shader_variant -> ir3

So ir3_shader becomes the topmost generation-neutral object, which
manages the set of variants, each of which generates, compiles, and
assembles its own ir.
There is a bit of additional renaming to s/fd3_compiler/ir3_compiler/,
etc.
Keep the split between the gallium-level stateobj and the shader helper
object, because it might be a good idea to pre-compute some
generation-specific register values (ie. anything that is independent
of linking).
Signed-off-by: Rob Clark <[email protected]>
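
To make the variant ownership concrete, the sketch below illustrates the post-split hierarchy. The struct names (fd3_shader_stateobj, ir3_shader, ir3_shader_variant, ir3) are the ones this commit introduces or renames; the fields shown are assumptions for illustration, not the exact definitions from ir3_shader.h.

```c
/* Illustrative sketch of the post-split object hierarchy; the fields
 * are assumptions, not the exact ir3_shader.h definitions.
 */
struct ir3_shader_variant {
	struct ir3 *ir;                       /* this variant's own compiled/assembled ir */
	struct ir3_shader_variant *next;      /* sibling in the shader's variant set */
	struct ir3_shader *shader;            /* back-pointer to generation-neutral object */
};

struct ir3_shader {
	enum shader_t type;                   /* vertex/fragment, generation neutral */
	struct ir3_shader_variant *variants;  /* the managed set of variants */
};

/* each generation keeps its own gallium-level stateobj wrapping the
 * shared object, leaving room to pre-compute generation-specific
 * register values that are independent of linking:
 */
struct fd3_shader_stateobj {
	struct ir3_shader *shader;
};
```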
Diffstat (limited to 'src/gallium/drivers/freedreno/ir3')
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/disasm-a3xx.c | 805
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/instr-a3xx.h | 691
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3.c | 675
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3.h | 480
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_compiler.c | 2639
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_compiler.h | 42
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c | 1524
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_cp.c | 158
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_depth.c | 159
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_dump.c | 425
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_flatten.c | 155
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_ra.c | 790
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_sched.c | 401
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_shader.c | 211
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_shader.h | 163
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_visitor.h | 154
16 files changed, 9472 insertions, 0 deletions
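
Before the per-file diffs, a quick orientation on how the builder and assembler entry points added in ir3.c below compose. This is a hedged usage sketch, not driver code: the opcode, type, and flag choices are illustrative, and the register numbering follows the rN.x = (N << 2) | x encoding documented in ir3.h.

```c
/* Hedged sketch: build "mov.f32f32 r0.x, c0.x" plus "end" with the
 * constructors from ir3.c, then assemble to a malloc'd dword array.
 */
#include "ir3.h"

static void *build_trivial_shader(struct ir3_info *info)
{
	struct ir3 *ir = ir3_create();
	struct ir3_block *block = ir3_block_create(ir, 0, 0, 0);

	/* category 1 has no opc -- all cat1 instructions are mov variants,
	 * selected by src/dst type:
	 */
	struct ir3_instruction *mov = ir3_instr_create(block, 1, 0);
	mov->cat1.src_type = TYPE_F32;
	mov->cat1.dst_type = TYPE_F32;
	ir3_reg_create(mov, (0 << 2) | 0, 0);              /* dst: r0.x */
	ir3_reg_create(mov, (0 << 2) | 0, IR3_REG_CONST);  /* src: c0.x */

	/* category 0 flow instruction terminating the shader: */
	ir3_instr_create(block, 0, OPC_END);

	/* ir3_assemble() pads to a whole number of 4-instruction groups
	 * with nops and fills in max_reg/max_const/etc in *info; the
	 * caller owns (and eventually free()s) the returned buffer:
	 */
	void *bin = ir3_assemble(ir, info);
	ir3_destroy(ir);
	return bin;
}
```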
diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c new file mode 100644 index 00000000000..8c3704bf658 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c @@ -0,0 +1,805 @@ +/* + * Copyright (c) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <assert.h> + +#include <util/u_debug.h> + +#include "disasm.h" +#include "instr-a3xx.h" + +static enum debug_t debug; + +#define printf debug_printf + +static const char *levels[] = { + "", + "\t", + "\t\t", + "\t\t\t", + "\t\t\t\t", + "\t\t\t\t\t", + "\t\t\t\t\t\t", + "\t\t\t\t\t\t\t", + "\t\t\t\t\t\t\t\t", + "\t\t\t\t\t\t\t\t\t", + "x", + "x", + "x", + "x", + "x", + "x", +}; + +static const char *component = "xyzw"; + +static const char *type[] = { + [TYPE_F16] = "f16", + [TYPE_F32] = "f32", + [TYPE_U16] = "u16", + [TYPE_U32] = "u32", + [TYPE_S16] = "s16", + [TYPE_S32] = "s32", + [TYPE_U8] = "u8", + [TYPE_S8] = "s8", +}; + +static void print_reg(reg_t reg, bool full, bool r, bool c, bool im, + bool neg, bool abs, bool addr_rel) +{ + const char type = c ? 'c' : 'r'; + + // XXX I prefer - and || for neg/abs, but preserving format used + // by libllvm-a3xx for easy diffing.. + + if (abs && neg) + printf("(absneg)"); + else if (neg) + printf("(neg)"); + else if (abs) + printf("(abs)"); + + if (r) + printf("(r)"); + + if (im) { + printf("%d", reg.iim_val); + } else if (addr_rel) { + /* I would just use %+d but trying to make it diff'able with + * libllvm-a3xx... + */ + if (reg.iim_val < 0) + printf("%s%c<a0.x - %d>", full ? "" : "h", type, -reg.iim_val); + else if (reg.iim_val > 0) + printf("%s%c<a0.x + %d>", full ? "" : "h", type, reg.iim_val); + else + printf("%s%c<a0.x>", full ? "" : "h", type); + } else if ((reg.num == REG_A0) && !c) { + printf("a0.%c", component[reg.comp]); + } else if ((reg.num == REG_P0) && !c) { + printf("p0.%c", component[reg.comp]); + } else { + printf("%s%c%d.%c", full ? 
"" : "h", type, reg.num, component[reg.comp]); + } +} + + +/* current instruction repeat flag: */ +static unsigned repeat; + +static void print_reg_dst(reg_t reg, bool full, bool addr_rel) +{ + print_reg(reg, full, false, false, false, false, false, addr_rel); +} + +static void print_reg_src(reg_t reg, bool full, bool r, bool c, bool im, + bool neg, bool abs, bool addr_rel) +{ + print_reg(reg, full, r, c, im, neg, abs, addr_rel); +} + +static void print_instr_cat0(instr_t *instr) +{ + instr_cat0_t *cat0 = &instr->cat0; + + switch (cat0->opc) { + case OPC_KILL: + printf(" %sp0.%c", cat0->inv ? "!" : "", + component[cat0->comp]); + break; + case OPC_BR: + printf(" %sp0.%c, #%d", cat0->inv ? "!" : "", + component[cat0->comp], cat0->immed); + break; + case OPC_JUMP: + case OPC_CALL: + printf(" #%d", cat0->immed); + break; + } + + if ((debug & PRINT_VERBOSE) && (cat0->dummy1|cat0->dummy2|cat0->dummy3|cat0->dummy4)) + printf("\t{0: %x,%x,%x,%x}", cat0->dummy1, cat0->dummy2, cat0->dummy3, cat0->dummy4); +} + +static void print_instr_cat1(instr_t *instr) +{ + instr_cat1_t *cat1 = &instr->cat1; + + if (cat1->ul) + printf("(ul)"); + + if (cat1->src_type == cat1->dst_type) { + if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) { + /* special case (nmemonic?): */ + printf("mova"); + } else { + printf("mov.%s%s", type[cat1->src_type], type[cat1->dst_type]); + } + } else { + printf("cov.%s%s", type[cat1->src_type], type[cat1->dst_type]); + } + + printf(" "); + + if (cat1->even) + printf("(even)"); + + if (cat1->pos_inf) + printf("(pos_infinity)"); + + print_reg_dst((reg_t)(cat1->dst), type_size(cat1->dst_type) == 32, + cat1->dst_rel); + + printf(", "); + + /* ugg, have to special case this.. vs print_reg().. */ + if (cat1->src_im) { + if (type_float(cat1->src_type)) + printf("(%f)", cat1->fim_val); + else + printf("%d", cat1->iim_val); + } else if (cat1->src_rel && !cat1->src_c) { + /* I would just use %+d but trying to make it diff'able with + * libllvm-a3xx... + */ + char type = cat1->src_rel_c ? 
'c' : 'r'; + if (cat1->off < 0) + printf("%c<a0.x - %d>", type, -cat1->off); + else if (cat1->off > 0) + printf("%c<a0.x + %d>", type, cat1->off); + else + printf("c<a0.x>"); + } else { + print_reg_src((reg_t)(cat1->src), type_size(cat1->src_type) == 32, + cat1->src_r, cat1->src_c, cat1->src_im, false, false, false); + } + + if ((debug & PRINT_VERBOSE) && (cat1->must_be_0)) + printf("\t{1: %x}", cat1->must_be_0); +} + +static void print_instr_cat2(instr_t *instr) +{ + instr_cat2_t *cat2 = &instr->cat2; + static const char *cond[] = { + "lt", + "le", + "gt", + "ge", + "eq", + "ne", + "?6?", + }; + + switch (cat2->opc) { + case OPC_CMPS_F: + case OPC_CMPS_U: + case OPC_CMPS_S: + case OPC_CMPV_F: + case OPC_CMPV_U: + case OPC_CMPV_S: + printf(".%s", cond[cat2->cond]); + break; + } + + printf(" "); + if (cat2->ei) + printf("(ei)"); + print_reg_dst((reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false); + printf(", "); + + if (cat2->c1.src1_c) { + print_reg_src((reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r, + cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg, + cat2->src1_abs, false); + } else if (cat2->rel1.src1_rel) { + print_reg_src((reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r, + cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg, + cat2->src1_abs, cat2->rel1.src1_rel); + } else { + print_reg_src((reg_t)(cat2->src1), cat2->full, cat2->src1_r, + false, cat2->src1_im, cat2->src1_neg, + cat2->src1_abs, false); + } + + switch (cat2->opc) { + case OPC_ABSNEG_F: + case OPC_ABSNEG_S: + case OPC_CLZ_B: + case OPC_CLZ_S: + case OPC_SIGN_F: + case OPC_FLOOR_F: + case OPC_CEIL_F: + case OPC_RNDNE_F: + case OPC_RNDAZ_F: + case OPC_TRUNC_F: + case OPC_NOT_B: + case OPC_BFREV_B: + case OPC_SETRM: + case OPC_CBITS_B: + /* these only have one src reg */ + break; + default: + printf(", "); + if (cat2->c2.src2_c) { + print_reg_src((reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r, + cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg, + cat2->src2_abs, false); + } else if (cat2->rel2.src2_rel) { + print_reg_src((reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r, + cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg, + cat2->src2_abs, cat2->rel2.src2_rel); + } else { + print_reg_src((reg_t)(cat2->src2), cat2->full, cat2->src2_r, + false, cat2->src2_im, cat2->src2_neg, + cat2->src2_abs, false); + } + break; + } +} + +static void print_instr_cat3(instr_t *instr) +{ + instr_cat3_t *cat3 = &instr->cat3; + bool full = instr_cat3_full(cat3); + + printf(" "); + print_reg_dst((reg_t)(cat3->dst), full ^ cat3->dst_half, false); + printf(", "); + if (cat3->c1.src1_c) { + print_reg_src((reg_t)(cat3->c1.src1), full, + cat3->src1_r, cat3->c1.src1_c, false, cat3->src1_neg, + false, false); + } else if (cat3->rel1.src1_rel) { + print_reg_src((reg_t)(cat3->rel1.src1), full, + cat3->src1_r, cat3->rel1.src1_c, false, cat3->src1_neg, + false, cat3->rel1.src1_rel); + } else { + print_reg_src((reg_t)(cat3->src1), full, + cat3->src1_r, false, false, cat3->src1_neg, + false, false); + } + printf(", "); + print_reg_src((reg_t)cat3->src2, full, + cat3->src2_r, cat3->src2_c, false, cat3->src2_neg, + false, false); + printf(", "); + if (cat3->c2.src3_c) { + print_reg_src((reg_t)(cat3->c2.src3), full, + cat3->src3_r, cat3->c2.src3_c, false, cat3->src3_neg, + false, false); + } else if (cat3->rel2.src3_rel) { + print_reg_src((reg_t)(cat3->rel2.src3), full, + cat3->src3_r, cat3->rel2.src3_c, false, cat3->src3_neg, + false, cat3->rel2.src3_rel); + } else { + print_reg_src((reg_t)(cat3->src3), full, + cat3->src3_r, false, false, cat3->src3_neg, + 
false, false); + } +} + +static void print_instr_cat4(instr_t *instr) +{ + instr_cat4_t *cat4 = &instr->cat4; + + printf(" "); + print_reg_dst((reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false); + printf(", "); + + if (cat4->c.src_c) { + print_reg_src((reg_t)(cat4->c.src), cat4->full, + cat4->src_r, cat4->c.src_c, cat4->src_im, + cat4->src_neg, cat4->src_abs, false); + } else if (cat4->rel.src_rel) { + print_reg_src((reg_t)(cat4->rel.src), cat4->full, + cat4->src_r, cat4->rel.src_c, cat4->src_im, + cat4->src_neg, cat4->src_abs, cat4->rel.src_rel); + } else { + print_reg_src((reg_t)(cat4->src), cat4->full, + cat4->src_r, false, cat4->src_im, + cat4->src_neg, cat4->src_abs, false); + } + + if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2)) + printf("\t{4: %x,%x}", cat4->dummy1, cat4->dummy2); +} + +static void print_instr_cat5(instr_t *instr) +{ + static const struct { + bool src1, src2, samp, tex; + } info[0x1f] = { + [OPC_ISAM] = { true, false, true, true, }, + [OPC_ISAML] = { true, true, true, true, }, + [OPC_ISAMM] = { true, false, true, true, }, + [OPC_SAM] = { true, false, true, true, }, + [OPC_SAMB] = { true, true, true, true, }, + [OPC_SAML] = { true, true, true, true, }, + [OPC_SAMGQ] = { true, false, true, true, }, + [OPC_GETLOD] = { true, false, true, true, }, + [OPC_CONV] = { true, true, true, true, }, + [OPC_CONVM] = { true, true, true, true, }, + [OPC_GETSIZE] = { true, false, false, true, }, + [OPC_GETBUF] = { false, false, false, true, }, + [OPC_GETPOS] = { true, false, false, true, }, + [OPC_GETINFO] = { false, false, false, true, }, + [OPC_DSX] = { true, false, false, false, }, + [OPC_DSY] = { true, false, false, false, }, + [OPC_GATHER4R] = { true, false, true, true, }, + [OPC_GATHER4G] = { true, false, true, true, }, + [OPC_GATHER4B] = { true, false, true, true, }, + [OPC_GATHER4A] = { true, false, true, true, }, + [OPC_SAMGP0] = { true, false, true, true, }, + [OPC_SAMGP1] = { true, false, true, true, }, + [OPC_SAMGP2] = { true, false, true, true, }, + [OPC_SAMGP3] = { true, false, true, true, }, + [OPC_DSXPP_1] = { true, false, false, false, }, + [OPC_DSYPP_1] = { true, false, false, false, }, + [OPC_RGETPOS] = { false, false, false, false, }, + [OPC_RGETINFO] = { false, false, false, false, }, + }; + instr_cat5_t *cat5 = &instr->cat5; + int i; + + if (cat5->is_3d) printf(".3d"); + if (cat5->is_a) printf(".a"); + if (cat5->is_o) printf(".o"); + if (cat5->is_p) printf(".p"); + if (cat5->is_s) printf(".s"); + if (cat5->is_s2en) printf(".s2en"); + + printf(" "); + + switch (cat5->opc) { + case OPC_DSXPP_1: + case OPC_DSYPP_1: + break; + default: + printf("(%s)", type[cat5->type]); + break; + } + + printf("("); + for (i = 0; i < 4; i++) + if (cat5->wrmask & (1 << i)) + printf("%c", "xyzw"[i]); + printf(")"); + + print_reg_dst((reg_t)(cat5->dst), type_size(cat5->type) == 32, false); + + if (info[cat5->opc].src1) { + printf(", "); + print_reg_src((reg_t)(cat5->src1), cat5->full, false, false, false, + false, false, false); + } + + if (cat5->is_s2en) { + printf(", "); + print_reg_src((reg_t)(cat5->s2en.src2), cat5->full, false, false, false, + false, false, false); + printf(", "); + print_reg_src((reg_t)(cat5->s2en.src3), false, false, false, false, + false, false, false); + } else { + if (cat5->is_o || info[cat5->opc].src2) { + printf(", "); + print_reg_src((reg_t)(cat5->norm.src2), cat5->full, + false, false, false, false, false, false); + } + if (info[cat5->opc].samp) + printf(", s#%d", cat5->norm.samp); + if (info[cat5->opc].tex) + printf(", t#%d", 
cat5->norm.tex); + } + + if (debug & PRINT_VERBOSE) { + if (cat5->is_s2en) { + if ((debug & PRINT_VERBOSE) && (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2)) + printf("\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2); + } else { + if ((debug & PRINT_VERBOSE) && (cat5->norm.dummy1|cat5->dummy2)) + printf("\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2); + } + } +} + +static int32_t u2i(uint32_t val, int nbits) +{ + return ((val >> (nbits-1)) * ~((1 << nbits) - 1)) | val; +} + +static void print_instr_cat6(instr_t *instr) +{ + instr_cat6_t *cat6 = &instr->cat6; + + printf(".%s ", type[cat6->type]); + + switch (cat6->opc) { + case OPC_LDG: + case OPC_LDP: + case OPC_LDL: + case OPC_LDLW: + case OPC_LDLV: + /* load instructions: */ + print_reg_dst((reg_t)(cat6->a.dst), type_size(cat6->type) == 32, false); + printf(","); + switch (cat6->opc) { + case OPC_LDG: + printf("g"); + break; + case OPC_LDP: + printf("p"); + break; + case OPC_LDL: + case OPC_LDLW: + case OPC_LDLV: + printf("l"); + break; + } + printf("["); + print_reg_src((reg_t)(cat6->a.src), true, + false, false, false, false, false, false); + if (cat6->a.off) + printf("%+d", cat6->a.off); + printf("]"); + break; + case OPC_PREFETCH: + /* similar to load instructions: */ + printf("g["); + print_reg_src((reg_t)(cat6->a.src), true, + false, false, false, false, false, false); + if (cat6->a.off) + printf("%+d", cat6->a.off); + printf("]"); + break; + case OPC_STG: + case OPC_STP: + case OPC_STL: + case OPC_STLW: + /* store instructions: */ + switch (cat6->opc) { + case OPC_STG: + printf("g"); + break; + case OPC_STP: + printf("p"); + break; + case OPC_STL: + case OPC_STLW: + printf("l"); + break; + } + printf("["); + print_reg_dst((reg_t)(cat6->b.dst), true, false); + if (cat6->b.off || cat6->b.off_hi) + printf("%+d", u2i((cat6->b.off_hi << 8) | cat6->b.off, 13)); + printf("]"); + printf(","); + print_reg_src((reg_t)(cat6->b.src), type_size(cat6->type) == 32, + false, false, false, false, false, false); + + break; + case OPC_STI: + /* sti has same encoding as other store instructions, but + * slightly different syntax: + */ + print_reg_dst((reg_t)(cat6->b.dst), false /* XXX is it always half? */, false); + if (cat6->b.off || cat6->b.off_hi) + printf("%+d", u2i((cat6->b.off_hi << 8) | cat6->b.off, 13)); + printf(","); + print_reg_src((reg_t)(cat6->b.src), type_size(cat6->type) == 32, + false, false, false, false, false, false); + break; + } + + printf(", %d", cat6->iim_val); + + if (debug & PRINT_VERBOSE) { + switch (cat6->opc) { + case OPC_LDG: + case OPC_LDP: + /* load instructions: */ + if (cat6->a.dummy1|cat6->a.dummy2|cat6->a.dummy3) + printf("\t{6: %x,%x,%x}", cat6->a.dummy1, cat6->a.dummy2, cat6->a.dummy3); + if ((cat6->a.must_be_one1 != 1) || (cat6->a.must_be_one2 != 1)) + printf("{?? %d,%d ??}", cat6->a.must_be_one1, cat6->a.must_be_one2); + break; + case OPC_STG: + case OPC_STP: + case OPC_STI: + /* store instructions: */ + if (cat6->b.dummy1|cat6->b.dummy2) + printf("\t{6: %x,%x}", cat6->b.dummy1, cat6->b.dummy2); + if ((cat6->b.must_be_one1 != 1) || (cat6->b.must_be_one2 != 1) || + (cat6->b.must_be_zero1 != 0)) + printf("{?? 
%d,%d,%d ??}", cat6->b.must_be_one1, cat6->b.must_be_one2, + cat6->b.must_be_zero1); + break; + } + } +} + +/* size of largest OPC field of all the instruction categories: */ +#define NOPC_BITS 6 + +struct opc_info { + uint16_t cat; + uint16_t opc; + const char *name; + void (*print)(instr_t *instr); +} opcs[1 << (3+NOPC_BITS)] = { +#define OPC(cat, opc, name) [((cat) << NOPC_BITS) | (opc)] = { (cat), (opc), #name, print_instr_cat##cat } + /* category 0: */ + OPC(0, OPC_NOP, nop), + OPC(0, OPC_BR, br), + OPC(0, OPC_JUMP, jump), + OPC(0, OPC_CALL, call), + OPC(0, OPC_RET, ret), + OPC(0, OPC_KILL, kill), + OPC(0, OPC_END, end), + OPC(0, OPC_EMIT, emit), + OPC(0, OPC_CUT, cut), + OPC(0, OPC_CHMASK, chmask), + OPC(0, OPC_CHSH, chsh), + OPC(0, OPC_FLOW_REV, flow_rev), + + /* category 1: */ + OPC(1, 0, ), + + /* category 2: */ + OPC(2, OPC_ADD_F, add.f), + OPC(2, OPC_MIN_F, min.f), + OPC(2, OPC_MAX_F, max.f), + OPC(2, OPC_MUL_F, mul.f), + OPC(2, OPC_SIGN_F, sign.f), + OPC(2, OPC_CMPS_F, cmps.f), + OPC(2, OPC_ABSNEG_F, absneg.f), + OPC(2, OPC_CMPV_F, cmpv.f), + OPC(2, OPC_FLOOR_F, floor.f), + OPC(2, OPC_CEIL_F, ceil.f), + OPC(2, OPC_RNDNE_F, rndne.f), + OPC(2, OPC_RNDAZ_F, rndaz.f), + OPC(2, OPC_TRUNC_F, trunc.f), + OPC(2, OPC_ADD_U, add.u), + OPC(2, OPC_ADD_S, add.s), + OPC(2, OPC_SUB_U, sub.u), + OPC(2, OPC_SUB_S, sub.s), + OPC(2, OPC_CMPS_U, cmps.u), + OPC(2, OPC_CMPS_S, cmps.s), + OPC(2, OPC_MIN_U, min.u), + OPC(2, OPC_MIN_S, min.s), + OPC(2, OPC_MAX_U, max.u), + OPC(2, OPC_MAX_S, max.s), + OPC(2, OPC_ABSNEG_S, absneg.s), + OPC(2, OPC_AND_B, and.b), + OPC(2, OPC_OR_B, or.b), + OPC(2, OPC_NOT_B, not.b), + OPC(2, OPC_XOR_B, xor.b), + OPC(2, OPC_CMPV_U, cmpv.u), + OPC(2, OPC_CMPV_S, cmpv.s), + OPC(2, OPC_MUL_U, mul.u), + OPC(2, OPC_MUL_S, mul.s), + OPC(2, OPC_MULL_U, mull.u), + OPC(2, OPC_BFREV_B, bfrev.b), + OPC(2, OPC_CLZ_S, clz.s), + OPC(2, OPC_CLZ_B, clz.b), + OPC(2, OPC_SHL_B, shl.b), + OPC(2, OPC_SHR_B, shr.b), + OPC(2, OPC_ASHR_B, ashr.b), + OPC(2, OPC_BARY_F, bary.f), + OPC(2, OPC_MGEN_B, mgen.b), + OPC(2, OPC_GETBIT_B, getbit.b), + OPC(2, OPC_SETRM, setrm), + OPC(2, OPC_CBITS_B, cbits.b), + OPC(2, OPC_SHB, shb), + OPC(2, OPC_MSAD, msad), + + /* category 3: */ + OPC(3, OPC_MAD_U16, mad.u16), + OPC(3, OPC_MADSH_U16, madsh.u16), + OPC(3, OPC_MAD_S16, mad.s16), + OPC(3, OPC_MADSH_M16, madsh.m16), + OPC(3, OPC_MAD_U24, mad.u24), + OPC(3, OPC_MAD_S24, mad.s24), + OPC(3, OPC_MAD_F16, mad.f16), + OPC(3, OPC_MAD_F32, mad.f32), + OPC(3, OPC_SEL_B16, sel.b16), + OPC(3, OPC_SEL_B32, sel.b32), + OPC(3, OPC_SEL_S16, sel.s16), + OPC(3, OPC_SEL_S32, sel.s32), + OPC(3, OPC_SEL_F16, sel.f16), + OPC(3, OPC_SEL_F32, sel.f32), + OPC(3, OPC_SAD_S16, sad.s16), + OPC(3, OPC_SAD_S32, sad.s32), + + /* category 4: */ + OPC(4, OPC_RCP, rcp), + OPC(4, OPC_RSQ, rsq), + OPC(4, OPC_LOG2, log2), + OPC(4, OPC_EXP2, exp2), + OPC(4, OPC_SIN, sin), + OPC(4, OPC_COS, cos), + OPC(4, OPC_SQRT, sqrt), + + /* category 5: */ + OPC(5, OPC_ISAM, isam), + OPC(5, OPC_ISAML, isaml), + OPC(5, OPC_ISAMM, isamm), + OPC(5, OPC_SAM, sam), + OPC(5, OPC_SAMB, samb), + OPC(5, OPC_SAML, saml), + OPC(5, OPC_SAMGQ, samgq), + OPC(5, OPC_GETLOD, getlod), + OPC(5, OPC_CONV, conv), + OPC(5, OPC_CONVM, convm), + OPC(5, OPC_GETSIZE, getsize), + OPC(5, OPC_GETBUF, getbuf), + OPC(5, OPC_GETPOS, getpos), + OPC(5, OPC_GETINFO, getinfo), + OPC(5, OPC_DSX, dsx), + OPC(5, OPC_DSY, dsy), + OPC(5, OPC_GATHER4R, gather4r), + OPC(5, OPC_GATHER4G, gather4g), + OPC(5, OPC_GATHER4B, gather4b), + OPC(5, OPC_GATHER4A, gather4a), + OPC(5, OPC_SAMGP0, samgp0), + 
OPC(5, OPC_SAMGP1, samgp1), + OPC(5, OPC_SAMGP2, samgp2), + OPC(5, OPC_SAMGP3, samgp3), + OPC(5, OPC_DSXPP_1, dsxpp.1), + OPC(5, OPC_DSYPP_1, dsypp.1), + OPC(5, OPC_RGETPOS, rgetpos), + OPC(5, OPC_RGETINFO, rgetinfo), + + + /* category 6: */ + OPC(6, OPC_LDG, ldg), + OPC(6, OPC_LDL, ldl), + OPC(6, OPC_LDP, ldp), + OPC(6, OPC_STG, stg), + OPC(6, OPC_STL, stl), + OPC(6, OPC_STP, stp), + OPC(6, OPC_STI, sti), + OPC(6, OPC_G2L, g2l), + OPC(6, OPC_L2G, l2g), + OPC(6, OPC_PREFETCH, prefetch), + OPC(6, OPC_LDLW, ldlw), + OPC(6, OPC_STLW, stlw), + OPC(6, OPC_RESFMT, resfmt), + OPC(6, OPC_RESINFO, resinf), + OPC(6, OPC_ATOMIC_ADD_L, atomic.add.l), + OPC(6, OPC_ATOMIC_SUB_L, atomic.sub.l), + OPC(6, OPC_ATOMIC_XCHG_L, atomic.xchg.l), + OPC(6, OPC_ATOMIC_INC_L, atomic.inc.l), + OPC(6, OPC_ATOMIC_DEC_L, atomic.dec.l), + OPC(6, OPC_ATOMIC_CMPXCHG_L, atomic.cmpxchg.l), + OPC(6, OPC_ATOMIC_MIN_L, atomic.min.l), + OPC(6, OPC_ATOMIC_MAX_L, atomic.max.l), + OPC(6, OPC_ATOMIC_AND_L, atomic.and.l), + OPC(6, OPC_ATOMIC_OR_L, atomic.or.l), + OPC(6, OPC_ATOMIC_XOR_L, atomic.xor.l), + OPC(6, OPC_LDGB_TYPED_4D, ldgb.typed.4d), + OPC(6, OPC_STGB_4D_4, stgb.4d.4), + OPC(6, OPC_STIB, stib), + OPC(6, OPC_LDC_4, ldc.4), + OPC(6, OPC_LDLV, ldlv), + + +#undef OPC +}; + +#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)])) + +// XXX hack.. probably should move this table somewhere common: +#include "ir3.h" +const char *ir3_instr_name(struct ir3_instruction *instr) +{ + if (instr->category == -1) return "??meta??"; + return opcs[(instr->category << NOPC_BITS) | instr->opc].name; +} + +static void print_instr(uint32_t *dwords, int level, int n) +{ + instr_t *instr = (instr_t *)dwords; + uint32_t opc = instr_opc(instr); + const char *name; + + printf("%s%04d[%08xx_%08xx] ", levels[level], n, dwords[1], dwords[0]); + +#if 0 + /* print unknown bits: */ + if (debug & PRINT_RAW) + printf("[%08xx_%08xx] ", dwords[1] & 0x001ff800, dwords[0] & 0x00000000); + + if (debug & PRINT_VERBOSE) + printf("%d,%02d ", instr->opc_cat, opc); +#endif + + /* NOTE: order flags are printed is a bit fugly.. but for now I + * try to match the order in llvm-a3xx disassembler for easy + * diff'ing.. 
+ */ + + if (instr->sync) + printf("(sy)"); + if (instr->ss && (instr->opc_cat <= 4)) + printf("(ss)"); + if (instr->jmp_tgt) + printf("(jp)"); + if (instr->repeat && (instr->opc_cat <= 4)) { + printf("(rpt%d)", instr->repeat); + repeat = instr->repeat; + } else { + repeat = 0; + } + if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4))) + printf("(ul)"); + + name = GETINFO(instr)->name; + + if (name) { + printf("%s", name); + GETINFO(instr)->print(instr); + } else { + printf("unknown(%d,%d)", instr->opc_cat, opc); + } + + printf("\n"); +} + +int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, enum shader_t type) +{ + int i; + + assert((sizedwords % 2) == 0); + + for (i = 0; i < sizedwords; i += 2) + print_instr(&dwords[i], level, i/2); + + return 0; +} diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h new file mode 100644 index 00000000000..c67f1037ced --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h @@ -0,0 +1,691 @@ +/* + * Copyright (c) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef INSTR_A3XX_H_ +#define INSTR_A3XX_H_ + +#define PACKED __attribute__((__packed__)) + +#include <stdint.h> +#include <assert.h> + +typedef enum { + /* category 0: */ + OPC_NOP = 0, + OPC_BR = 1, + OPC_JUMP = 2, + OPC_CALL = 3, + OPC_RET = 4, + OPC_KILL = 5, + OPC_END = 6, + OPC_EMIT = 7, + OPC_CUT = 8, + OPC_CHMASK = 9, + OPC_CHSH = 10, + OPC_FLOW_REV = 11, + + /* category 1: */ + /* no opc.. 
all category 1 are variants of mov */ + + /* category 2: */ + OPC_ADD_F = 0, + OPC_MIN_F = 1, + OPC_MAX_F = 2, + OPC_MUL_F = 3, + OPC_SIGN_F = 4, + OPC_CMPS_F = 5, + OPC_ABSNEG_F = 6, + OPC_CMPV_F = 7, + /* 8 - invalid */ + OPC_FLOOR_F = 9, + OPC_CEIL_F = 10, + OPC_RNDNE_F = 11, + OPC_RNDAZ_F = 12, + OPC_TRUNC_F = 13, + /* 14-15 - invalid */ + OPC_ADD_U = 16, + OPC_ADD_S = 17, + OPC_SUB_U = 18, + OPC_SUB_S = 19, + OPC_CMPS_U = 20, + OPC_CMPS_S = 21, + OPC_MIN_U = 22, + OPC_MIN_S = 23, + OPC_MAX_U = 24, + OPC_MAX_S = 25, + OPC_ABSNEG_S = 26, + /* 27 - invalid */ + OPC_AND_B = 28, + OPC_OR_B = 29, + OPC_NOT_B = 30, + OPC_XOR_B = 31, + /* 32 - invalid */ + OPC_CMPV_U = 33, + OPC_CMPV_S = 34, + /* 35-47 - invalid */ + OPC_MUL_U = 48, + OPC_MUL_S = 49, + OPC_MULL_U = 50, + OPC_BFREV_B = 51, + OPC_CLZ_S = 52, + OPC_CLZ_B = 53, + OPC_SHL_B = 54, + OPC_SHR_B = 55, + OPC_ASHR_B = 56, + OPC_BARY_F = 57, + OPC_MGEN_B = 58, + OPC_GETBIT_B = 59, + OPC_SETRM = 60, + OPC_CBITS_B = 61, + OPC_SHB = 62, + OPC_MSAD = 63, + + /* category 3: */ + OPC_MAD_U16 = 0, + OPC_MADSH_U16 = 1, + OPC_MAD_S16 = 2, + OPC_MADSH_M16 = 3, /* should this be .s16? */ + OPC_MAD_U24 = 4, + OPC_MAD_S24 = 5, + OPC_MAD_F16 = 6, + OPC_MAD_F32 = 7, + OPC_SEL_B16 = 8, + OPC_SEL_B32 = 9, + OPC_SEL_S16 = 10, + OPC_SEL_S32 = 11, + OPC_SEL_F16 = 12, + OPC_SEL_F32 = 13, + OPC_SAD_S16 = 14, + OPC_SAD_S32 = 15, + + /* category 4: */ + OPC_RCP = 0, + OPC_RSQ = 1, + OPC_LOG2 = 2, + OPC_EXP2 = 3, + OPC_SIN = 4, + OPC_COS = 5, + OPC_SQRT = 6, + // 7-63 - invalid + + /* category 5: */ + OPC_ISAM = 0, + OPC_ISAML = 1, + OPC_ISAMM = 2, + OPC_SAM = 3, + OPC_SAMB = 4, + OPC_SAML = 5, + OPC_SAMGQ = 6, + OPC_GETLOD = 7, + OPC_CONV = 8, + OPC_CONVM = 9, + OPC_GETSIZE = 10, + OPC_GETBUF = 11, + OPC_GETPOS = 12, + OPC_GETINFO = 13, + OPC_DSX = 14, + OPC_DSY = 15, + OPC_GATHER4R = 16, + OPC_GATHER4G = 17, + OPC_GATHER4B = 18, + OPC_GATHER4A = 19, + OPC_SAMGP0 = 20, + OPC_SAMGP1 = 21, + OPC_SAMGP2 = 22, + OPC_SAMGP3 = 23, + OPC_DSXPP_1 = 24, + OPC_DSYPP_1 = 25, + OPC_RGETPOS = 26, + OPC_RGETINFO = 27, + + /* category 6: */ + OPC_LDG = 0, /* load-global */ + OPC_LDL = 1, + OPC_LDP = 2, + OPC_STG = 3, /* store-global */ + OPC_STL = 4, + OPC_STP = 5, + OPC_STI = 6, + OPC_G2L = 7, + OPC_L2G = 8, + OPC_PREFETCH = 9, + OPC_LDLW = 10, + OPC_STLW = 11, + OPC_RESFMT = 14, + OPC_RESINFO = 15, + OPC_ATOMIC_ADD_L = 16, + OPC_ATOMIC_SUB_L = 17, + OPC_ATOMIC_XCHG_L = 18, + OPC_ATOMIC_INC_L = 19, + OPC_ATOMIC_DEC_L = 20, + OPC_ATOMIC_CMPXCHG_L = 21, + OPC_ATOMIC_MIN_L = 22, + OPC_ATOMIC_MAX_L = 23, + OPC_ATOMIC_AND_L = 24, + OPC_ATOMIC_OR_L = 25, + OPC_ATOMIC_XOR_L = 26, + OPC_LDGB_TYPED_4D = 27, + OPC_STGB_4D_4 = 28, + OPC_STIB = 29, + OPC_LDC_4 = 30, + OPC_LDLV = 31, + + /* meta instructions (category -1): */ + /* placeholder instr to mark inputs/outputs: */ + OPC_META_INPUT = 0, + OPC_META_OUTPUT = 1, + /* The "fan-in" and "fan-out" instructions are used for keeping + * track of instructions that write to multiple dst registers + * (fan-out) like texture sample instructions, or read multiple + * consecutive scalar registers (fan-in) (bary.f, texture samp) + */ + OPC_META_FO = 2, + OPC_META_FI = 3, + /* branches/flow control */ + OPC_META_FLOW = 4, + OPC_META_PHI = 5, + /* relative addressing */ + OPC_META_DEREF = 6, + + +} opc_t; + +typedef enum { + TYPE_F16 = 0, + TYPE_F32 = 1, + TYPE_U16 = 2, + TYPE_U32 = 3, + TYPE_S16 = 4, + TYPE_S32 = 5, + TYPE_U8 = 6, + TYPE_S8 = 7, // XXX I assume? 
+} type_t; + +static inline uint32_t type_size(type_t type) +{ + switch (type) { + case TYPE_F32: + case TYPE_U32: + case TYPE_S32: + return 32; + case TYPE_F16: + case TYPE_U16: + case TYPE_S16: + return 16; + case TYPE_U8: + case TYPE_S8: + return 8; + default: + assert(0); /* invalid type */ + return 0; + } +} + +static inline int type_float(type_t type) +{ + return (type == TYPE_F32) || (type == TYPE_F16); +} + +static inline int type_uint(type_t type) +{ + return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8); +} + +static inline int type_sint(type_t type) +{ + return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8); +} + +typedef union PACKED { + /* normal gpr or const src register: */ + struct PACKED { + uint32_t comp : 2; + uint32_t num : 10; + }; + /* for immediate val: */ + int32_t iim_val : 11; + /* to make compiler happy: */ + uint32_t dummy32; + uint32_t dummy10 : 10; + uint32_t dummy11 : 11; + uint32_t dummy12 : 12; + uint32_t dummy13 : 13; + uint32_t dummy8 : 8; +} reg_t; + +/* special registers: */ +#define REG_A0 61 /* address register */ +#define REG_P0 62 /* predicate register */ + +static inline int reg_special(reg_t reg) +{ + return (reg.num == REG_A0) || (reg.num == REG_P0); +} + +typedef struct PACKED { + /* dword0: */ + int16_t immed : 16; + uint32_t dummy1 : 16; + + /* dword1: */ + uint32_t dummy2 : 8; + uint32_t repeat : 3; + uint32_t dummy3 : 1; + uint32_t ss : 1; + uint32_t dummy4 : 7; + uint32_t inv : 1; + uint32_t comp : 2; + uint32_t opc : 4; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat0_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + /* for normal src register: */ + struct PACKED { + uint32_t src : 11; + /* at least low bit of pad must be zero or it will + * look like a address relative src + */ + uint32_t pad : 21; + }; + /* for address relative: */ + struct PACKED { + int32_t off : 10; + uint32_t src_rel_c : 1; + uint32_t src_rel : 1; + uint32_t unknown : 20; + }; + /* for immediate: */ + int32_t iim_val; + float fim_val; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 3; + uint32_t src_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; + uint32_t dst_type : 3; + uint32_t dst_rel : 1; + uint32_t src_type : 3; + uint32_t src_c : 1; + uint32_t src_im : 1; + uint32_t even : 1; + uint32_t pos_inf : 1; + uint32_t must_be_0 : 2; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat1_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + uint32_t src1 : 11; + uint32_t must_be_zero1: 2; + uint32_t src1_im : 1; /* immediate */ + uint32_t src1_neg : 1; /* negate */ + uint32_t src1_abs : 1; /* absolute value */ + }; + struct PACKED { + uint32_t src1 : 10; + uint32_t src1_c : 1; /* relative-const */ + uint32_t src1_rel : 1; /* relative address */ + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel1; + struct PACKED { + uint32_t src1 : 12; + uint32_t src1_c : 1; /* const */ + uint32_t dummy : 3; + } c1; + }; + + union PACKED { + struct PACKED { + uint32_t src2 : 11; + uint32_t must_be_zero2: 2; + uint32_t src2_im : 1; /* immediate */ + uint32_t src2_neg : 1; /* negate */ + uint32_t src2_abs : 1; /* absolute value */ + }; + struct PACKED { + uint32_t src2 : 10; + uint32_t src2_c : 1; /* relative-const */ + uint32_t src2_rel : 1; /* relative address */ + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel2; + struct PACKED { + uint32_t src2 : 12; + uint32_t src2_c : 1; /* const */ + uint32_t dummy : 3; + } c2; + }; 
+ + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 3; + uint32_t src1_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; /* dunno */ + uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */ + uint32_t ei : 1; + uint32_t cond : 3; + uint32_t src2_r : 1; + uint32_t full : 1; /* not half */ + uint32_t opc : 6; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat2_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + uint32_t src1 : 11; + uint32_t must_be_zero1: 2; + uint32_t src2_c : 1; + uint32_t src1_neg : 1; + uint32_t src2_r : 1; + }; + struct PACKED { + uint32_t src1 : 10; + uint32_t src1_c : 1; + uint32_t src1_rel : 1; + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel1; + struct PACKED { + uint32_t src1 : 12; + uint32_t src1_c : 1; + uint32_t dummy : 3; + } c1; + }; + + union PACKED { + struct PACKED { + uint32_t src3 : 11; + uint32_t must_be_zero2: 2; + uint32_t src3_r : 1; + uint32_t src2_neg : 1; + uint32_t src3_neg : 1; + }; + struct PACKED { + uint32_t src3 : 10; + uint32_t src3_c : 1; + uint32_t src3_rel : 1; + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel2; + struct PACKED { + uint32_t src3 : 12; + uint32_t src3_c : 1; + uint32_t dummy : 3; + } c2; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 3; + uint32_t src1_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; + uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */ + uint32_t src2 : 8; + uint32_t opc : 4; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat3_t; + +static inline bool instr_cat3_full(instr_cat3_t *cat3) +{ + switch (cat3->opc) { + case OPC_MAD_F16: + case OPC_MAD_U16: + case OPC_MAD_S16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + case OPC_SAD_S32: // really?? + return false; + default: + return true; + } +} + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + uint32_t src : 11; + uint32_t must_be_zero1: 2; + uint32_t src_im : 1; /* immediate */ + uint32_t src_neg : 1; /* negate */ + uint32_t src_abs : 1; /* absolute value */ + }; + struct PACKED { + uint32_t src : 10; + uint32_t src_c : 1; /* relative-const */ + uint32_t src_rel : 1; /* relative address */ + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel; + struct PACKED { + uint32_t src : 12; + uint32_t src_c : 1; /* const */ + uint32_t dummy : 3; + } c; + }; + uint32_t dummy1 : 16; /* seem to be ignored */ + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 3; + uint32_t src_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; + uint32_t dst_half : 1; /* or widen/narrow.. ie. 
dst hrN <-> rN */ + uint32_t dummy2 : 5; /* seem to be ignored */ + uint32_t full : 1; /* not half */ + uint32_t opc : 6; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat4_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + /* normal case: */ + struct PACKED { + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t src2 : 8; + uint32_t dummy1 : 4; /* seem to be ignored */ + uint32_t samp : 4; + uint32_t tex : 7; + } norm; + /* s2en case: */ + struct PACKED { + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t src2 : 11; + uint32_t dummy1 : 1; + uint32_t src3 : 8; + uint32_t dummy2 : 3; + } s2en; + /* same in either case: */ + // XXX I think, confirm this + struct PACKED { + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t pad : 23; + }; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t wrmask : 4; /* write-mask */ + uint32_t type : 3; + uint32_t dummy2 : 1; /* seems to be ignored */ + uint32_t is_3d : 1; + + uint32_t is_a : 1; + uint32_t is_s : 1; + uint32_t is_s2en : 1; + uint32_t is_o : 1; + uint32_t is_p : 1; + + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat5_t; + +/* used for load instructions: */ +typedef struct PACKED { + /* dword0: */ + uint32_t must_be_one1 : 1; + int16_t off : 13; + uint32_t src : 8; + uint32_t dummy1 : 1; + uint32_t must_be_one2 : 1; + int32_t iim_val : 8; + + /* dword1: */ + uint32_t dst : 8; + uint32_t dummy2 : 9; + uint32_t type : 3; + uint32_t dummy3 : 2; + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat6a_t; + +/* used for store instructions: */ +typedef struct PACKED { + /* dword0: */ + uint32_t must_be_zero1 : 1; + uint32_t src : 8; + uint32_t off_hi : 5; /* high bits of 'off'... ugly! */ + uint32_t dummy1 : 9; + uint32_t must_be_one1 : 1; + int32_t iim_val : 8; + + /* dword1: */ + uint16_t off : 8; + uint32_t must_be_one2 : 1; + uint32_t dst : 8; + uint32_t type : 3; + uint32_t dummy2 : 2; + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat6b_t; + +typedef union PACKED { + instr_cat6a_t a; + instr_cat6b_t b; + struct PACKED { + /* dword0: */ + uint32_t pad1 : 24; + int32_t iim_val : 8; + + /* dword1: */ + uint32_t pad2 : 17; + uint32_t type : 3; + uint32_t pad3 : 2; + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; + }; +} instr_cat6_t; + +typedef union PACKED { + instr_cat0_t cat0; + instr_cat1_t cat1; + instr_cat2_t cat2; + instr_cat3_t cat3; + instr_cat4_t cat4; + instr_cat5_t cat5; + instr_cat6_t cat6; + struct PACKED { + /* dword0: */ + uint64_t pad1 : 40; + uint32_t repeat : 3; /* cat0-cat4 */ + uint32_t pad2 : 1; + uint32_t ss : 1; /* cat1-cat4 (cat0??) */ + uint32_t ul : 1; /* cat2-cat4 (and cat1 in blob.. which may be bug??) 
*/ + uint32_t pad3 : 13; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; + + }; +} instr_t; + +static inline uint32_t instr_opc(instr_t *instr) +{ + switch (instr->opc_cat) { + case 0: return instr->cat0.opc; + case 1: return 0; + case 2: return instr->cat2.opc; + case 3: return instr->cat3.opc; + case 4: return instr->cat4.opc; + case 5: return instr->cat5.opc; + case 6: return instr->cat6.opc; + default: return 0; + } +} + +static inline bool is_mad(opc_t opc) +{ + switch (opc) { + case OPC_MAD_U16: + case OPC_MADSH_U16: + case OPC_MAD_S16: + case OPC_MADSH_M16: + case OPC_MAD_U24: + case OPC_MAD_S24: + case OPC_MAD_F16: + case OPC_MAD_F32: + return true; + default: + return false; + } +} + +#endif /* INSTR_A3XX_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c new file mode 100644 index 00000000000..ea2a9251b28 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3.c @@ -0,0 +1,675 @@ +/* + * Copyright (c) 2012 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3.h" + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include <stdbool.h> +#include <errno.h> + +#include "freedreno_util.h" +#include "instr-a3xx.h" + +#define CHUNK_SZ 1020 + +struct ir3_heap_chunk { + struct ir3_heap_chunk *next; + uint32_t heap[CHUNK_SZ]; +}; + +static void grow_heap(struct ir3 *shader) +{ + struct ir3_heap_chunk *chunk = calloc(1, sizeof(*chunk)); + chunk->next = shader->chunk; + shader->chunk = chunk; + shader->heap_idx = 0; +} + +/* simple allocator to carve allocations out of an up-front allocated heap, + * so that we can free everything easily in one shot. 
+ */ +void * ir3_alloc(struct ir3 *shader, int sz) +{ + void *ptr; + + sz = align(sz, 4) / 4; + + if ((shader->heap_idx + sz) > CHUNK_SZ) + grow_heap(shader); + + ptr = &shader->chunk->heap[shader->heap_idx]; + shader->heap_idx += sz; + + return ptr; +} + +struct ir3 * ir3_create(void) +{ + struct ir3 *shader = + calloc(1, sizeof(struct ir3)); + grow_heap(shader); + return shader; +} + +void ir3_destroy(struct ir3 *shader) +{ + while (shader->chunk) { + struct ir3_heap_chunk *chunk = shader->chunk; + shader->chunk = chunk->next; + free(chunk); + } + free(shader); +} + +#define iassert(cond) do { \ + if (!(cond)) { \ + assert(cond); \ + return -1; \ + } } while (0) + +static uint32_t reg(struct ir3_register *reg, struct ir3_info *info, + uint32_t repeat, uint32_t valid_flags) +{ + reg_t val = { .dummy32 = 0 }; + + assert(!(reg->flags & ~valid_flags)); + + if (!(reg->flags & IR3_REG_R)) + repeat = 0; + + if (reg->flags & IR3_REG_IMMED) { + val.iim_val = reg->iim_val; + } else { + int8_t components = util_last_bit(reg->wrmask); + int8_t max = (reg->num + repeat + components - 1) >> 2; + + val.comp = reg->num & 0x3; + val.num = reg->num >> 2; + + if (reg->flags & IR3_REG_CONST) { + info->max_const = MAX2(info->max_const, max); + } else if ((max != REG_A0) && (max != REG_P0)) { + if (reg->flags & IR3_REG_HALF) { + info->max_half_reg = MAX2(info->max_half_reg, max); + } else { + info->max_reg = MAX2(info->max_reg, max); + } + } + } + + return val.dummy32; +} + +static int emit_cat0(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + instr_cat0_t *cat0 = ptr; + + cat0->immed = instr->cat0.immed; + cat0->repeat = instr->repeat; + cat0->ss = !!(instr->flags & IR3_INSTR_SS); + cat0->inv = instr->cat0.inv; + cat0->comp = instr->cat0.comp; + cat0->opc = instr->opc; + cat0->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat0->sync = !!(instr->flags & IR3_INSTR_SY); + cat0->opc_cat = 0; + + return 0; +} + +static uint32_t type_flags(type_t type) +{ + return (type_size(type) == 32) ? 
0 : IR3_REG_HALF; +} + +static int emit_cat1(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + instr_cat1_t *cat1 = ptr; + + iassert(instr->regs_count == 2); + iassert(!((dst->flags ^ type_flags(instr->cat1.dst_type)) & IR3_REG_HALF)); + iassert((src->flags & IR3_REG_IMMED) || + !((src->flags ^ type_flags(instr->cat1.src_type)) & IR3_REG_HALF)); + + if (src->flags & IR3_REG_IMMED) { + cat1->iim_val = src->iim_val; + cat1->src_im = 1; + } else if (src->flags & IR3_REG_RELATIV) { + cat1->off = src->offset; + cat1->src_rel = 1; + cat1->src_rel_c = !!(src->flags & IR3_REG_CONST); + } else { + cat1->src = reg(src, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_R | + IR3_REG_CONST | IR3_REG_HALF); + cat1->src_c = !!(src->flags & IR3_REG_CONST); + } + + cat1->dst = reg(dst, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_EVEN | + IR3_REG_R | IR3_REG_POS_INF | IR3_REG_HALF); + cat1->repeat = instr->repeat; + cat1->src_r = !!(src->flags & IR3_REG_R); + cat1->ss = !!(instr->flags & IR3_INSTR_SS); + cat1->ul = !!(instr->flags & IR3_INSTR_UL); + cat1->dst_type = instr->cat1.dst_type; + cat1->dst_rel = !!(dst->flags & IR3_REG_RELATIV); + cat1->src_type = instr->cat1.src_type; + cat1->even = !!(dst->flags & IR3_REG_EVEN); + cat1->pos_inf = !!(dst->flags & IR3_REG_POS_INF); + cat1->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat1->sync = !!(instr->flags & IR3_INSTR_SY); + cat1->opc_cat = 1; + + return 0; +} + +static int emit_cat2(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src1 = instr->regs[1]; + struct ir3_register *src2 = instr->regs[2]; + instr_cat2_t *cat2 = ptr; + + iassert((instr->regs_count == 2) || (instr->regs_count == 3)); + + if (src1->flags & IR3_REG_RELATIV) { + iassert(src1->num < (1 << 10)); + cat2->rel1.src1 = reg(src1, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF); + cat2->rel1.src1_c = !!(src1->flags & IR3_REG_CONST); + cat2->rel1.src1_rel = 1; + } else if (src1->flags & IR3_REG_CONST) { + iassert(src1->num < (1 << 12)); + cat2->c1.src1 = reg(src1, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R | IR3_REG_HALF); + cat2->c1.src1_c = 1; + } else { + iassert(src1->num < (1 << 11)); + cat2->src1 = reg(src1, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R | IR3_REG_HALF); + } + cat2->src1_im = !!(src1->flags & IR3_REG_IMMED); + cat2->src1_neg = !!(src1->flags & IR3_REG_NEGATE); + cat2->src1_abs = !!(src1->flags & IR3_REG_ABS); + cat2->src1_r = !!(src1->flags & IR3_REG_R); + + if (src2) { + iassert((src2->flags & IR3_REG_IMMED) || + !((src1->flags ^ src2->flags) & IR3_REG_HALF)); + + if (src2->flags & IR3_REG_RELATIV) { + iassert(src2->num < (1 << 10)); + cat2->rel2.src2 = reg(src2, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF); + cat2->rel2.src2_c = !!(src2->flags & IR3_REG_CONST); + cat2->rel2.src2_rel = 1; + } else if (src2->flags & IR3_REG_CONST) { + iassert(src2->num < (1 << 12)); + cat2->c2.src2 = reg(src2, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R | IR3_REG_HALF); + cat2->c2.src2_c = 1; + } else { + iassert(src2->num < (1 << 11)); + cat2->src2 = reg(src2, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R 
| IR3_REG_HALF); + } + + cat2->src2_im = !!(src2->flags & IR3_REG_IMMED); + cat2->src2_neg = !!(src2->flags & IR3_REG_NEGATE); + cat2->src2_abs = !!(src2->flags & IR3_REG_ABS); + cat2->src2_r = !!(src2->flags & IR3_REG_R); + } + + cat2->dst = reg(dst, info, instr->repeat, + IR3_REG_R | IR3_REG_EI | IR3_REG_HALF); + cat2->repeat = instr->repeat; + cat2->ss = !!(instr->flags & IR3_INSTR_SS); + cat2->ul = !!(instr->flags & IR3_INSTR_UL); + cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF); + cat2->ei = !!(dst->flags & IR3_REG_EI); + cat2->cond = instr->cat2.condition; + cat2->full = ! (src1->flags & IR3_REG_HALF); + cat2->opc = instr->opc; + cat2->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat2->sync = !!(instr->flags & IR3_INSTR_SY); + cat2->opc_cat = 2; + + return 0; +} + +static int emit_cat3(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src1 = instr->regs[1]; + struct ir3_register *src2 = instr->regs[2]; + struct ir3_register *src3 = instr->regs[3]; + instr_cat3_t *cat3 = ptr; + uint32_t src_flags = 0; + + switch (instr->opc) { + case OPC_MAD_F16: + case OPC_MAD_U16: + case OPC_MAD_S16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + case OPC_SAD_S32: // really?? + src_flags |= IR3_REG_HALF; + break; + default: + break; + } + + iassert(instr->regs_count == 4); + iassert(!((src1->flags ^ src_flags) & IR3_REG_HALF)); + iassert(!((src2->flags ^ src_flags) & IR3_REG_HALF)); + iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF)); + + if (src1->flags & IR3_REG_RELATIV) { + iassert(src1->num < (1 << 10)); + cat3->rel1.src1 = reg(src1, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_R | IR3_REG_HALF); + cat3->rel1.src1_c = !!(src1->flags & IR3_REG_CONST); + cat3->rel1.src1_rel = 1; + } else if (src1->flags & IR3_REG_CONST) { + iassert(src1->num < (1 << 12)); + cat3->c1.src1 = reg(src1, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R | + IR3_REG_HALF); + cat3->c1.src1_c = 1; + } else { + iassert(src1->num < (1 << 11)); + cat3->src1 = reg(src1, info, instr->repeat, + IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF); + } + + cat3->src1_neg = !!(src1->flags & IR3_REG_NEGATE); + cat3->src1_r = !!(src1->flags & IR3_REG_R); + + cat3->src2 = reg(src2, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_R | IR3_REG_HALF); + cat3->src2_c = !!(src2->flags & IR3_REG_CONST); + cat3->src2_neg = !!(src2->flags & IR3_REG_NEGATE); + cat3->src2_r = !!(src2->flags & IR3_REG_R); + + + if (src3->flags & IR3_REG_RELATIV) { + iassert(src3->num < (1 << 10)); + cat3->rel2.src3 = reg(src3, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_R | IR3_REG_HALF); + cat3->rel2.src3_c = !!(src3->flags & IR3_REG_CONST); + cat3->rel2.src3_rel = 1; + } else if (src3->flags & IR3_REG_CONST) { + iassert(src3->num < (1 << 12)); + cat3->c2.src3 = reg(src3, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R | + IR3_REG_HALF); + cat3->c2.src3_c = 1; + } else { + iassert(src3->num < (1 << 11)); + cat3->src3 = reg(src3, info, instr->repeat, + IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF); + } + + cat3->src3_neg = !!(src3->flags & IR3_REG_NEGATE); + cat3->src3_r = !!(src3->flags & IR3_REG_R); + + cat3->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat3->repeat = instr->repeat; + cat3->ss = !!(instr->flags & IR3_INSTR_SS); + cat3->ul = !!(instr->flags & IR3_INSTR_UL); + 
cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF); + cat3->opc = instr->opc; + cat3->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat3->sync = !!(instr->flags & IR3_INSTR_SY); + cat3->opc_cat = 3; + + return 0; +} + +static int emit_cat4(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + instr_cat4_t *cat4 = ptr; + + iassert(instr->regs_count == 2); + + if (src->flags & IR3_REG_RELATIV) { + iassert(src->num < (1 << 10)); + cat4->rel.src = reg(src, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF); + cat4->rel.src_c = !!(src->flags & IR3_REG_CONST); + cat4->rel.src_rel = 1; + } else if (src->flags & IR3_REG_CONST) { + iassert(src->num < (1 << 12)); + cat4->c.src = reg(src, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R | IR3_REG_HALF); + cat4->c.src_c = 1; + } else { + iassert(src->num < (1 << 11)); + cat4->src = reg(src, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R | IR3_REG_HALF); + } + + cat4->src_im = !!(src->flags & IR3_REG_IMMED); + cat4->src_neg = !!(src->flags & IR3_REG_NEGATE); + cat4->src_abs = !!(src->flags & IR3_REG_ABS); + cat4->src_r = !!(src->flags & IR3_REG_R); + + cat4->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat4->repeat = instr->repeat; + cat4->ss = !!(instr->flags & IR3_INSTR_SS); + cat4->ul = !!(instr->flags & IR3_INSTR_UL); + cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF); + cat4->full = ! (src->flags & IR3_REG_HALF); + cat4->opc = instr->opc; + cat4->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat4->sync = !!(instr->flags & IR3_INSTR_SY); + cat4->opc_cat = 4; + + return 0; +} + +static int emit_cat5(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src1 = instr->regs[1]; + struct ir3_register *src2 = instr->regs[2]; + struct ir3_register *src3 = instr->regs[3]; + instr_cat5_t *cat5 = ptr; + + iassert(!((dst->flags ^ type_flags(instr->cat5.type)) & IR3_REG_HALF)); + + if (src1) { + cat5->full = ! 
(src1->flags & IR3_REG_HALF); + cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF); + } + + + if (instr->flags & IR3_INSTR_S2EN) { + if (src2) { + iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); + cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); + } + if (src3) { + iassert(src3->flags & IR3_REG_HALF); + cat5->s2en.src3 = reg(src3, info, instr->repeat, IR3_REG_HALF); + } + iassert(!(instr->cat5.samp | instr->cat5.tex)); + } else { + iassert(!src3); + if (src2) { + iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); + cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); + } + cat5->norm.samp = instr->cat5.samp; + cat5->norm.tex = instr->cat5.tex; + } + + cat5->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat5->wrmask = dst->wrmask; + cat5->type = instr->cat5.type; + cat5->is_3d = !!(instr->flags & IR3_INSTR_3D); + cat5->is_a = !!(instr->flags & IR3_INSTR_A); + cat5->is_s = !!(instr->flags & IR3_INSTR_S); + cat5->is_s2en = !!(instr->flags & IR3_INSTR_S2EN); + cat5->is_o = !!(instr->flags & IR3_INSTR_O); + cat5->is_p = !!(instr->flags & IR3_INSTR_P); + cat5->opc = instr->opc; + cat5->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat5->sync = !!(instr->flags & IR3_INSTR_SY); + cat5->opc_cat = 5; + + return 0; +} + +static int emit_cat6(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + instr_cat6_t *cat6 = ptr; + + iassert(instr->regs_count == 2); + + switch (instr->opc) { + /* load instructions: */ + case OPC_LDG: + case OPC_LDP: + case OPC_LDL: + case OPC_LDLW: + case OPC_LDLV: + case OPC_PREFETCH: { + instr_cat6a_t *cat6a = ptr; + + iassert(!((dst->flags ^ type_flags(instr->cat6.type)) & IR3_REG_HALF)); + + cat6a->must_be_one1 = 1; + cat6a->must_be_one2 = 1; + cat6a->off = instr->cat6.offset; + cat6a->src = reg(src, info, instr->repeat, 0); + cat6a->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + break; + } + /* store instructions: */ + case OPC_STG: + case OPC_STP: + case OPC_STL: + case OPC_STLW: + case OPC_STI: { + instr_cat6b_t *cat6b = ptr; + uint32_t src_flags = type_flags(instr->cat6.type); + uint32_t dst_flags = (instr->opc == OPC_STI) ? 
IR3_REG_HALF : 0; + + iassert(!((src->flags ^ src_flags) & IR3_REG_HALF)); + + cat6b->must_be_one1 = 1; + cat6b->must_be_one2 = 1; + cat6b->src = reg(src, info, instr->repeat, src_flags); + cat6b->off_hi = instr->cat6.offset >> 8; + cat6b->off = instr->cat6.offset; + cat6b->dst = reg(dst, info, instr->repeat, IR3_REG_R | dst_flags); + + break; + } + default: + // TODO + break; + } + + cat6->iim_val = instr->cat6.iim_val; + cat6->type = instr->cat6.type; + cat6->opc = instr->opc; + cat6->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat6->sync = !!(instr->flags & IR3_INSTR_SY); + cat6->opc_cat = 6; + + return 0; +} + +static int (*emit[])(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) = { + emit_cat0, emit_cat1, emit_cat2, emit_cat3, emit_cat4, emit_cat5, emit_cat6, +}; + +void * ir3_assemble(struct ir3 *shader, struct ir3_info *info) +{ + uint32_t *ptr, *dwords; + uint32_t i; + + info->max_reg = -1; + info->max_half_reg = -1; + info->max_const = -1; + info->instrs_count = 0; + + /* need a integer number of instruction "groups" (sets of four + * instructions), so pad out w/ NOPs if needed: + * (each instruction is 64bits) + */ + info->sizedwords = 2 * align(shader->instrs_count, 4); + + ptr = dwords = calloc(1, 4 * info->sizedwords); + + for (i = 0; i < shader->instrs_count; i++) { + struct ir3_instruction *instr = shader->instrs[i]; + int ret = emit[instr->category](instr, dwords, info); + if (ret) + goto fail; + info->instrs_count += 1 + instr->repeat; + dwords += 2; + } + + return ptr; + +fail: + free(ptr); + return NULL; +} + +static struct ir3_register * reg_create(struct ir3 *shader, + int num, int flags) +{ + struct ir3_register *reg = + ir3_alloc(shader, sizeof(struct ir3_register)); + reg->wrmask = 1; + reg->flags = flags; + reg->num = num; + return reg; +} + +static void insert_instr(struct ir3 *shader, + struct ir3_instruction *instr) +{ +#ifdef DEBUG + static uint32_t serialno = 0; + instr->serialno = ++serialno; +#endif + if (shader->instrs_count == shader->instrs_sz) { + shader->instrs_sz = MAX2(2 * shader->instrs_sz, 16); + shader->instrs = realloc(shader->instrs, + shader->instrs_sz * sizeof(shader->instrs[0])); + } + shader->instrs[shader->instrs_count++] = instr; +} + +struct ir3_block * ir3_block_create(struct ir3 *shader, + unsigned ntmp, unsigned nin, unsigned nout) +{ + struct ir3_block *block; + unsigned size; + char *ptr; + + size = sizeof(*block); + size += sizeof(block->temporaries[0]) * ntmp; + size += sizeof(block->inputs[0]) * nin; + size += sizeof(block->outputs[0]) * nout; + + ptr = ir3_alloc(shader, size); + + block = (void *)ptr; + ptr += sizeof(*block); + + block->temporaries = (void *)ptr; + block->ntemporaries = ntmp; + ptr += sizeof(block->temporaries[0]) * ntmp; + + block->inputs = (void *)ptr; + block->ninputs = nin; + ptr += sizeof(block->inputs[0]) * nin; + + block->outputs = (void *)ptr; + block->noutputs = nout; + ptr += sizeof(block->outputs[0]) * nout; + + block->shader = shader; + + return block; +} + +struct ir3_instruction * ir3_instr_create(struct ir3_block *block, + int category, opc_t opc) +{ + struct ir3_instruction *instr = + ir3_alloc(block->shader, sizeof(struct ir3_instruction)); + instr->block = block; + instr->category = category; + instr->opc = opc; + insert_instr(block->shader, instr); + return instr; +} + +struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr) +{ + struct ir3_instruction *new_instr = + ir3_alloc(instr->block->shader, sizeof(struct ir3_instruction)); + unsigned i; + + *new_instr 
= *instr; + insert_instr(instr->block->shader, new_instr); + + /* clone registers: */ + new_instr->regs_count = 0; + for (i = 0; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + struct ir3_register *new_reg = + ir3_reg_create(new_instr, reg->num, reg->flags); + *new_reg = *reg; + } + + return new_instr; +} + +struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, + int num, int flags) +{ + struct ir3_register *reg = reg_create(instr->block->shader, num, flags); + assert(instr->regs_count < ARRAY_SIZE(instr->regs)); + instr->regs[instr->regs_count++] = reg; + return reg; +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h new file mode 100644 index 00000000000..9ed914ba2e4 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IR3_H_ +#define IR3_H_ + +#include <stdint.h> +#include <stdbool.h> + +#include "instr-a3xx.h" +#include "disasm.h" /* TODO move 'enum shader_t' somewhere else.. */ + +/* low level intermediate representation of an adreno shader program */ + +struct ir3; +struct ir3_instruction; +struct ir3_block; + +struct ir3 * fd_asm_parse(const char *src); + +struct ir3_info { + uint16_t sizedwords; + uint16_t instrs_count; /* expanded to account for rpt's */ + /* NOTE: max_reg, etc, does not include registers not touched + * by the shader (ie. vertex fetched via VFD_DECODE but not + * touched by shader) + */ + int8_t max_reg; /* highest GPR # used by shader */ + int8_t max_half_reg; + int8_t max_const; +}; + +struct ir3_register { + enum { + IR3_REG_CONST = 0x001, + IR3_REG_IMMED = 0x002, + IR3_REG_HALF = 0x004, + IR3_REG_RELATIV= 0x008, + IR3_REG_R = 0x010, + IR3_REG_NEGATE = 0x020, + IR3_REG_ABS = 0x040, + IR3_REG_EVEN = 0x080, + IR3_REG_POS_INF= 0x100, + /* (ei) flag, end-input? Set on last bary, presumably to signal + * that the shader needs no more input: + */ + IR3_REG_EI = 0x200, + /* meta-flags, for intermediate stages of IR, ie. 
+ * before register assignment is done: + */ + IR3_REG_SSA = 0x1000, /* 'instr' is ptr to assigning instr */ + IR3_REG_IA = 0x2000, /* meta-input dst is "assigned" */ + IR3_REG_ADDR = 0x4000, /* register is a0.x */ + } flags; + union { + /* normal registers: + * the component is in the low two bits of the reg #, so + * rN.x becomes: (N << 2) | x + */ + int num; + /* immediate: */ + int iim_val; + float fim_val; + /* relative: */ + int offset; + /* for IR3_REG_SSA, src registers contain ptr back to + * assigning instruction. + */ + struct ir3_instruction *instr; + }; + + /* used for cat5 instructions, but also for internal/IR level + * tracking of what registers are read/written by an instruction. + * wrmask may be a bad name since it is used to represent both + * src and dst that touch multiple adjacent registers. + */ + int wrmask; +}; + +struct ir3_instruction { + struct ir3_block *block; + int category; + opc_t opc; + enum { + /* (sy) flag is set on first instruction, and after sample + * instructions (probably just on RAW hazard). + */ + IR3_INSTR_SY = 0x001, + /* (ss) flag is set on first instruction, and first instruction + * to depend on the result of "long" instructions (RAW hazard): + * + * rcp, rsq, log2, exp2, sin, cos, sqrt + * + * It seems to synchronize until all in-flight instructions are + * completed, for example: + * + * rsq hr1.w, hr1.w + * add.f hr2.z, (neg)hr2.z, hc0.y + * mul.f hr2.w, (neg)hr2.y, (neg)hr2.y + * rsq hr2.x, hr2.x + * (rpt1)nop + * mad.f16 hr2.w, hr2.z, hr2.z, hr2.w + * nop + * mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w + * (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w + * (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x + * + * The last mul.f does not have (ss) set, presumably because the + * (ss) on the previous instruction does the job. + * + * The blob driver also seems to set it on WAR hazards, although + * not really clear if this is needed or just blob compiler being + * sloppy. So far I haven't found a case where removing the (ss) + * causes problems for WAR hazard, but I could just be getting + * lucky: + * + * rcp r1.y, r3.y + * (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z + * + */ + IR3_INSTR_SS = 0x002, + /* (jp) flag is set on jump targets: + */ + IR3_INSTR_JP = 0x004, + IR3_INSTR_UL = 0x008, + IR3_INSTR_3D = 0x010, + IR3_INSTR_A = 0x020, + IR3_INSTR_O = 0x040, + IR3_INSTR_P = 0x080, + IR3_INSTR_S = 0x100, + IR3_INSTR_S2EN = 0x200, + /* meta-flags, for intermediate stages of IR, ie. + * before register assignment is done: + */ + IR3_INSTR_MARK = 0x1000, + } flags; + int repeat; + unsigned regs_count; + struct ir3_register *regs[5]; + union { + struct { + char inv; + char comp; + int immed; + } cat0; + struct { + type_t src_type, dst_type; + } cat1; + struct { + enum { + IR3_COND_LT = 0, + IR3_COND_LE = 1, + IR3_COND_GT = 2, + IR3_COND_GE = 3, + IR3_COND_EQ = 4, + IR3_COND_NE = 5, + } condition; + } cat2; + struct { + unsigned samp, tex; + type_t type; + } cat5; + struct { + type_t type; + int offset; + int iim_val; + } cat6; + /* for meta-instructions, just used to hold extra data + * before instruction scheduling, etc + */ + struct { + int off; /* component/offset */ + } fo; + struct { + struct ir3_block *if_block, *else_block; + } flow; + struct { + struct ir3_block *block; + } inout; + }; + + /* transient values used during various algorithms: */ + union { + /* The instruction depth is the max dependency distance to output. + * + * You can also think of it as the "cost", if we did any sort of + * optimization for register footprint. Ie. 
a value that is just + * result of moving a const to a reg would have a low cost, so to + * it could make sense to duplicate the instruction at various + * points where the result is needed to reduce register footprint. + */ + unsigned depth; + }; + struct ir3_instruction *next; +#ifdef DEBUG + uint32_t serialno; +#endif +}; + +struct ir3_heap_chunk; + +struct ir3 { + unsigned instrs_count, instrs_sz; + struct ir3_instruction **instrs; + unsigned heap_idx; + struct ir3_heap_chunk *chunk; +}; + +struct ir3_block { + struct ir3 *shader; + unsigned ntemporaries, ninputs, noutputs; + /* maps TGSI_FILE_TEMPORARY index back to the assigning instruction: */ + struct ir3_instruction **temporaries; + struct ir3_instruction **inputs; + struct ir3_instruction **outputs; + /* only a single address register: */ + struct ir3_instruction *address; + struct ir3_block *parent; + struct ir3_instruction *head; +}; + +struct ir3 * ir3_create(void); +void ir3_destroy(struct ir3 *shader); +void * ir3_assemble(struct ir3 *shader, + struct ir3_info *info); +void * ir3_alloc(struct ir3 *shader, int sz); + +struct ir3_block * ir3_block_create(struct ir3 *shader, + unsigned ntmp, unsigned nin, unsigned nout); + +struct ir3_instruction * ir3_instr_create(struct ir3_block *block, + int category, opc_t opc); +struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr); +const char *ir3_instr_name(struct ir3_instruction *instr); + +struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, + int num, int flags); + + +static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) +{ + if (instr->flags & IR3_INSTR_MARK) + return true; /* already visited */ + instr->flags ^= IR3_INSTR_MARK; + return false; +} + +static inline void ir3_clear_mark(struct ir3 *shader) +{ + /* TODO would be nice to drop the instruction array.. for + * new compiler, _clear_mark() is all we use it for, and + * we could probably manage a linked list instead.. 
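
The MARK flag is what keeps walks like this cheap: a pass sets the bit the first time it visits an instruction, then wipes the whole shader when it finishes. A minimal sketch of such a walk, using a hypothetical visit_depth() helper rather than the actual ir3_depth.c pass, might look like:

	/* sketch only: compute instruction depth by recursing over SSA
	 * srcs (regs[0] is the dst, so srcs start at index 1), using the
	 * MARK bit to visit each instruction at most once:
	 */
	static unsigned
	visit_depth(struct ir3_instruction *instr)
	{
		unsigned i, d = 0;
		if (ir3_instr_check_mark(instr))
			return instr->depth;   /* already computed */
		for (i = 1; i < instr->regs_count; i++) {
			struct ir3_register *src = instr->regs[i];
			if (src->flags & IR3_REG_SSA)
				d = MAX2(d, visit_depth(src->instr) + 1);
		}
		instr->depth = d;
		return d;
	}

The caller would run ir3_clear_mark() afterwards, so the next pass starts from a clean slate.
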
+ */ + unsigned i; + for (i = 0; i < shader->instrs_count; i++) { + struct ir3_instruction *instr = shader->instrs[i]; + instr->flags &= ~IR3_INSTR_MARK; + } +} + +static inline int ir3_instr_regno(struct ir3_instruction *instr, + struct ir3_register *reg) +{ + unsigned i; + for (i = 0; i < instr->regs_count; i++) + if (reg == instr->regs[i]) + return i; + return -1; +} + + +/* comp: + * 0 - x + * 1 - y + * 2 - z + * 3 - w + */ +static inline uint32_t regid(int num, int comp) +{ + return (num << 2) | (comp & 0x3); +} + +static inline uint32_t reg_num(struct ir3_register *reg) +{ + return reg->num >> 2; +} + +static inline uint32_t reg_comp(struct ir3_register *reg) +{ + return reg->num & 0x3; +} + +static inline bool is_flow(struct ir3_instruction *instr) +{ + return (instr->category == 0); +} + +static inline bool is_kill(struct ir3_instruction *instr) +{ + return is_flow(instr) && (instr->opc == OPC_KILL); +} + +static inline bool is_nop(struct ir3_instruction *instr) +{ + return is_flow(instr) && (instr->opc == OPC_NOP); +} + +static inline bool is_alu(struct ir3_instruction *instr) +{ + return (1 <= instr->category) && (instr->category <= 3); +} + +static inline bool is_sfu(struct ir3_instruction *instr) +{ + return (instr->category == 4); +} + +static inline bool is_tex(struct ir3_instruction *instr) +{ + return (instr->category == 5); +} + +static inline bool is_input(struct ir3_instruction *instr) +{ + return (instr->category == 2) && (instr->opc == OPC_BARY_F); +} + +static inline bool is_meta(struct ir3_instruction *instr) +{ + /* TODO how should we count PHI (and maybe fan-in/out) which + * might actually contribute some instructions to the final + * result? + */ + return (instr->category == -1); +} + +static inline bool is_addr(struct ir3_instruction *instr) +{ + return is_meta(instr) && (instr->opc == OPC_META_DEREF); +} + +static inline bool writes_addr(struct ir3_instruction *instr) +{ + if (instr->regs_count > 0) { + struct ir3_register *dst = instr->regs[0]; + return !!(dst->flags & IR3_REG_ADDR); + } + return false; +} + +static inline bool writes_pred(struct ir3_instruction *instr) +{ + if (instr->regs_count > 0) { + struct ir3_register *dst = instr->regs[0]; + return reg_num(dst) == REG_P0; + } + return false; +} + +static inline bool reg_gpr(struct ir3_register *r) +{ + if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_SSA | IR3_REG_ADDR)) + return false; + if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0)) + return false; + return true; +} + +/* dump: */ +#include <stdio.h> +void ir3_dump(struct ir3 *shader, const char *name, + struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? 
*/, + FILE *f); +void ir3_dump_instr_single(struct ir3_instruction *instr); +void ir3_dump_instr_list(struct ir3_instruction *instr); + +/* flatten if/else: */ +int ir3_block_flatten(struct ir3_block *block); + +/* depth calculation: */ +int ir3_delayslots(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned n); +void ir3_block_depth(struct ir3_block *block); + +/* copy-propagate: */ +void ir3_block_cp(struct ir3_block *block); + +/* scheduling: */ +void ir3_block_sched(struct ir3_block *block); + +/* register assignment: */ +int ir3_block_ra(struct ir3_block *block, enum shader_t type, + bool half_precision, bool frag_coord, bool frag_face, + bool *has_samp); + +#ifndef ARRAY_SIZE +# define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#endif + +/* ************************************************************************* */ +/* split this out or find some helper to use.. like main/bitset.h.. */ + +#include <string.h> + +#define MAX_REG 256 + +typedef uint8_t regmask_t[2 * MAX_REG / 8]; + +static inline unsigned regmask_idx(struct ir3_register *reg) +{ + unsigned num = reg->num; + assert(num < MAX_REG); + if (reg->flags & IR3_REG_HALF) + num += MAX_REG; + return num; +} + +static inline void regmask_init(regmask_t *regmask) +{ + memset(regmask, 0, sizeof(*regmask)); +} + +static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg) +{ + unsigned idx = regmask_idx(reg); + unsigned i; + for (i = 0; i < 4; i++, idx++) + if (reg->wrmask & (1 << i)) + (*regmask)[idx / 8] |= 1 << (idx % 8); +} + +/* set bits in a if not set in b, conceptually: + * a |= (reg & ~b) + */ +static inline void regmask_set_if_not(regmask_t *a, + struct ir3_register *reg, regmask_t *b) +{ + unsigned idx = regmask_idx(reg); + unsigned i; + for (i = 0; i < 4; i++, idx++) + if (reg->wrmask & (1 << i)) + if (!((*b)[idx / 8] & (1 << (idx % 8)))) + (*a)[idx / 8] |= 1 << (idx % 8); +} + +static inline unsigned regmask_get(regmask_t *regmask, + struct ir3_register *reg) +{ + unsigned idx = regmask_idx(reg); + unsigned i; + for (i = 0; i < 4; i++, idx++) + if (reg->wrmask & (1 << i)) + if ((*regmask)[idx / 8] & (1 << (idx % 8))) + return true; + return false; +} + +/* ************************************************************************* */ + +#endif /* IR3_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c new file mode 100644 index 00000000000..1fa2fd4e389 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c @@ -0,0 +1,2639 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
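
The regmask helpers at the end of ir3.h above are a flat bitset over both register namespaces, with half registers offset by MAX_REG. As a rough illustration of the intended usage (a sketch, not a pass from this patch; it assumes instrs[] is in emit order and that regs[0] is the dst with srcs following, as elsewhere in the IR):

	/* collect GPRs that are read before any instruction writes them: */
	static void
	scan_reads_before_writes(struct ir3 *shader, regmask_t *needed)
	{
		regmask_t written;
		unsigned i, n;

		regmask_init(&written);
		regmask_init(needed);

		for (i = 0; i < shader->instrs_count; i++) {
			struct ir3_instruction *instr = shader->instrs[i];
			for (n = 1; n < instr->regs_count; n++)
				if (reg_gpr(instr->regs[n]))
					regmask_set_if_not(needed, instr->regs[n], &written);
			if ((instr->regs_count > 0) && reg_gpr(instr->regs[0]))
				regmask_set(&written, instr->regs[0]);
		}
	}

Because regmask_set()/regmask_get() iterate over wrmask, multi-component reads and writes (e.g. tex fetches) are tracked per scalar component, which is what register assignment needs.
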
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include <stdarg.h> + +#include "pipe/p_state.h" +#include "util/u_string.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_ureg.h" +#include "tgsi/tgsi_info.h" +#include "tgsi/tgsi_strings.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_scan.h" + +#include "freedreno_lowering.h" +#include "freedreno_util.h" + +#include "ir3_compiler.h" +#include "ir3_shader.h" + +#include "instr-a3xx.h" +#include "ir3.h" + +struct ir3_compile_context { + const struct tgsi_token *tokens; + bool free_tokens; + struct ir3 *ir; + struct ir3_shader_variant *so; + + struct ir3_block *block; + struct ir3_instruction *current_instr; + + /* we need to defer updates to block->outputs[] until the end + * of an instruction (so we don't see new value until *after* + * the src registers are processed) + */ + struct { + struct ir3_instruction *instr, **instrp; + } output_updates[16]; + unsigned num_output_updates; + + /* are we in a sequence of "atomic" instructions? + */ + bool atomic; + + /* For fragment shaders, from the hw perspective the only + * actual input is r0.xy position register passed to bary.f. + * But TGSI doesn't know that, it still declares things as + * IN[] registers. So we do all the input tracking normally + * and fix things up after compile_instructions() + * + * NOTE that frag_pos is the hardware position (possibly it + * is actually an index or tag or some such.. it is *not* + * values that can be directly used for gl_FragCoord..) 
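
To make the deferred-update scheme concrete: the translaters below call add_dst_reg() before add_src_reg(), so if a TGSI op reads the register it is also writing, resolving the src immediately would point it at the very instruction being built. Queuing the write in output_updates[] and only applying it in instr_finish() keeps the previous producer visible. A sketch of the ordering (register indices illustrative):

	/* hypothetical translation of "ADD TEMP[0].x, TEMP[0].x, CONST[0].x": */
	instr = instr_create(ctx, 2, OPC_ADD_F);
	add_dst_reg(ctx, instr, dst, 0);  /* queues TEMP[0].x := instr        */
	add_src_reg(ctx, instr, src, 0);  /* still resolves to the old writer */
	/* the next instr_create() runs instr_finish(), which finally points
	 * block->temporaries[] at instr
	 */
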
+ */ + struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4]; + + struct tgsi_parse_context parser; + unsigned type; + + struct tgsi_shader_info info; + + /* for calculating input/output positions/linkages: */ + unsigned next_inloc; + + unsigned num_internal_temps; + struct tgsi_src_register internal_temps[6]; + + /* idx/slot for last compiler generated immediate */ + unsigned immediate_idx; + + /* stack of branch instructions that mark (potentially nested) + * branch if/else/loop/etc + */ + struct { + struct ir3_instruction *instr, *cond; + bool inv; /* true iff in else leg of branch */ + } branch[16]; + unsigned int branch_count; + + /* list of kill instructions: */ + struct ir3_instruction *kill[16]; + unsigned int kill_count; + + /* used when dst is same as one of the src, to avoid overwriting a + * src element before the remaining scalar instructions that make + * up the vector operation + */ + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; +}; + + +static void vectorize(struct ir3_compile_context *ctx, + struct ir3_instruction *instr, struct tgsi_dst_register *dst, + int nsrcs, ...); +static void create_mov(struct ir3_compile_context *ctx, + struct tgsi_dst_register *dst, struct tgsi_src_register *src); +static type_t get_ftype(struct ir3_compile_context *ctx); + +static unsigned +compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so, + const struct tgsi_token *tokens) +{ + unsigned ret; + struct tgsi_shader_info *info = &ctx->info; + const struct fd_lowering_config lconfig = { + .color_two_side = so->key.color_two_side, + .lower_DST = true, + .lower_XPD = true, + .lower_SCS = true, + .lower_LRP = true, + .lower_FRC = true, + .lower_POW = true, + .lower_LIT = true, + .lower_EXP = true, + .lower_LOG = true, + .lower_DP4 = true, + .lower_DP3 = true, + .lower_DPH = true, + .lower_DP2 = true, + .lower_DP2A = true, + }; + + ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info); + ctx->free_tokens = !!ctx->tokens; + if (!ctx->tokens) { + /* no lowering */ + ctx->tokens = tokens; + } + ctx->ir = so->ir; + ctx->so = so; + ctx->next_inloc = 8; + ctx->num_internal_temps = 0; + ctx->branch_count = 0; + ctx->kill_count = 0; + ctx->block = NULL; + ctx->current_instr = NULL; + ctx->num_output_updates = 0; + ctx->atomic = false; + ctx->frag_pos = NULL; + ctx->frag_face = NULL; + + memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord)); + +#define FM(x) (1 << TGSI_FILE_##x) + /* optimize can't deal with relative addressing: */ + if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT))) + return TGSI_PARSE_ERROR; + + /* Immediates go after constants: */ + so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1; + ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1); + + ret = tgsi_parse_init(&ctx->parser, ctx->tokens); + if (ret != TGSI_PARSE_OK) + return ret; + + ctx->type = ctx->parser.FullHeader.Processor.Processor; + + return ret; +} + +static void +compile_error(struct ir3_compile_context *ctx, const char *format, ...) 
+{ + va_list ap; + va_start(ap, format); + _debug_vprintf(format, ap); + va_end(ap); + tgsi_dump(ctx->tokens, 0); + debug_assert(0); +} + +#define compile_assert(ctx, cond) do { \ + if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \ + } while (0) + +static void +compile_free(struct ir3_compile_context *ctx) +{ + if (ctx->free_tokens) + free((void *)ctx->tokens); + tgsi_parse_free(&ctx->parser); +} + +struct instr_translater { + void (*fxn)(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst); + unsigned tgsi_opc; + opc_t opc; + opc_t hopc; /* opc to use for half_precision mode, if different */ + unsigned arg; +}; + +static void +instr_finish(struct ir3_compile_context *ctx) +{ + unsigned i; + + if (ctx->atomic) + return; + + for (i = 0; i < ctx->num_output_updates; i++) + *(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr; + + ctx->num_output_updates = 0; +} + +/* For "atomic" groups of instructions, for example the four scalar + * instructions to perform a vec4 operation. Basically this just + * blocks out handling of output_updates so the next scalar instruction + * still sees the result from before the start of the atomic group. + * + * NOTE: when used properly, this could probably replace get/put_dst() + * stuff. + */ +static void +instr_atomic_start(struct ir3_compile_context *ctx) +{ + ctx->atomic = true; +} + +static void +instr_atomic_end(struct ir3_compile_context *ctx) +{ + ctx->atomic = false; + instr_finish(ctx); +} + +static struct ir3_instruction * +instr_create(struct ir3_compile_context *ctx, int category, opc_t opc) +{ + instr_finish(ctx); + return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc)); +} + +static struct ir3_instruction * +instr_clone(struct ir3_compile_context *ctx, struct ir3_instruction *instr) +{ + instr_finish(ctx); + return (ctx->current_instr = ir3_instr_clone(instr)); +} + +static struct ir3_block * +push_block(struct ir3_compile_context *ctx) +{ + struct ir3_block *block; + unsigned ntmp, nin, nout; + +#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1)) + + /* hmm, give ourselves room to create 4 extra temporaries (vec4): + */ + ntmp = SCALAR_REGS(TEMPORARY); + ntmp += 4 * 4; + + nout = SCALAR_REGS(OUTPUT); + nin = SCALAR_REGS(INPUT); + + /* for outermost block, 'inputs' are the actual shader INPUT + * register file. Reads from INPUT registers always go back to + * top block. For nested blocks, 'inputs' is used to track any + * TEMPORARY file register from one of the enclosing blocks that + * is ready in this block. 
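
The atomic flag composes with those deferred updates: between instr_atomic_start() and instr_atomic_end(), instr_finish() is a no-op, so every scalar instruction in the group resolves its srcs against pre-group values, and all the queued dst updates land at once when the group closes. Roughly:

	/* sketch: scalarizing one vector op as an atomic group */
	instr_atomic_start(ctx);
	/* ... build the .x instance with instr_create(), then one
	 * instr_clone() per remaining WriteMask channel, calling
	 * ssa_dst() for each channel's dst ... */
	instr_atomic_end(ctx);   /* flushes all queued output_updates */

This is exactly the bracket that vectorize() (further down) puts around its clones.
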
+ */ + if (!ctx->block) { + /* NOTE: fragment shaders actually have two inputs (r0.xy, the + * position) + */ + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + int n = 2; + if (ctx->info.reads_position) + n += 4; + if (ctx->info.uses_frontface) + n += 4; + nin = MAX2(n, nin); + nout += ARRAY_SIZE(ctx->kill); + } + } else { + nin = ntmp; + } + + block = ir3_block_create(ctx->ir, ntmp, nin, nout); + + if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block) + block->noutputs -= ARRAY_SIZE(ctx->kill); + + block->parent = ctx->block; + ctx->block = block; + + return block; +} + +static void +pop_block(struct ir3_compile_context *ctx) +{ + ctx->block = ctx->block->parent; + compile_assert(ctx, ctx->block); +} + +static struct ir3_instruction * +create_output(struct ir3_block *block, struct ir3_instruction *instr, + unsigned n) +{ + struct ir3_instruction *out; + + out = ir3_instr_create(block, -1, OPC_META_OUTPUT); + out->inout.block = block; + ir3_reg_create(out, n, 0); + if (instr) + ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr; + + return out; +} + +static struct ir3_instruction * +create_input(struct ir3_block *block, struct ir3_instruction *instr, + unsigned n) +{ + struct ir3_instruction *in; + + in = ir3_instr_create(block, -1, OPC_META_INPUT); + in->inout.block = block; + ir3_reg_create(in, n, 0); + if (instr) + ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr; + + return in; +} + +static struct ir3_instruction * +block_input(struct ir3_block *block, unsigned n) +{ + /* references to INPUT register file always go back up to + * top level: + */ + if (block->parent) + return block_input(block->parent, n); + return block->inputs[n]; +} + +/* return temporary in scope, creating if needed meta-input node + * to track block inputs + */ +static struct ir3_instruction * +block_temporary(struct ir3_block *block, unsigned n) +{ + /* references to TEMPORARY register file, find the nearest + * enclosing block which has already assigned this temporary, + * creating meta-input instructions along the way to keep + * track of block inputs + */ + if (block->parent && !block->temporaries[n]) { + /* if already have input for this block, reuse: */ + if (!block->inputs[n]) + block->inputs[n] = block_temporary(block->parent, n); + + /* and create new input to return: */ + return create_input(block, block->inputs[n], n); + } + return block->temporaries[n]; +} + +static struct ir3_instruction * +create_immed(struct ir3_compile_context *ctx, float val) +{ + /* NOTE: *don't* use instr_create() here! + */ + struct ir3_instruction *instr; + instr = ir3_instr_create(ctx->block, 1, 0); + instr->cat1.src_type = get_ftype(ctx); + instr->cat1.dst_type = get_ftype(ctx); + ir3_reg_create(instr, 0, 0); + ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val; + return instr; +} + +static void +ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + const struct tgsi_dst_register *dst, unsigned chan) +{ + unsigned n = regid(dst->Index, chan); + unsigned idx = ctx->num_output_updates; + + compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates)); + + /* NOTE: defer update of temporaries[idx] or output[idx] + * until instr_finish(), so that if the current instruction + * reads the same TEMP/OUT[] it gets the old value: + * + * bleh.. this might be a bit easier to just figure out + * in instr_finish(). But at that point we've already + * lost information about OUTPUT vs TEMPORARY register + * file.. 
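
block_temporary() above is easiest to see with a concrete nesting: if TEMP[1].x is written in the top-level block and read two if-levels down, the lookup recurses through both parents, creating one OPC_META_INPUT per intervening block and recording it in that block's inputs[], which is what the PHI construction at ENDIF later consults. Sketch (block names hypothetical):

	/* resolve TEMP[1].x, i.e. n = regid(1, 0), from a nested block: */
	struct ir3_instruction *val = block_temporary(inner_block, regid(1, 0));
	/* outer:  temporaries[n] = <producer>
	 * middle: inputs[n] -> meta-input chained to <producer>
	 * inner:  returns a fresh meta-input chained to middle's
	 */
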
+ */ + + switch (dst->File) { + case TGSI_FILE_OUTPUT: + compile_assert(ctx, n < ctx->block->noutputs); + ctx->output_updates[idx].instrp = &ctx->block->outputs[n]; + ctx->output_updates[idx].instr = instr; + ctx->num_output_updates++; + break; + case TGSI_FILE_TEMPORARY: + compile_assert(ctx, n < ctx->block->ntemporaries); + ctx->output_updates[idx].instrp = &ctx->block->temporaries[n]; + ctx->output_updates[idx].instr = instr; + ctx->num_output_updates++; + break; + case TGSI_FILE_ADDRESS: + compile_assert(ctx, n < 1); + ctx->output_updates[idx].instrp = &ctx->block->address; + ctx->output_updates[idx].instr = instr; + ctx->num_output_updates++; + break; + } +} + +static void +ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg, + const struct tgsi_src_register *src, unsigned chan) +{ + struct ir3_block *block = ctx->block; + unsigned n = regid(src->Index, chan); + + switch (src->File) { + case TGSI_FILE_INPUT: + reg->flags |= IR3_REG_SSA; + reg->instr = block_input(ctx->block, n); + break; + case TGSI_FILE_OUTPUT: + /* really this should just happen in case of 'MOV_SAT OUT[n], ..', + * for the following clamp instructions: + */ + reg->flags |= IR3_REG_SSA; + reg->instr = block->outputs[n]; + /* we don't have to worry about read from an OUTPUT that was + * assigned outside of the current block, because the _SAT + * clamp instructions will always be in the same block as + * the original instruction which wrote the OUTPUT + */ + compile_assert(ctx, reg->instr); + break; + case TGSI_FILE_TEMPORARY: + reg->flags |= IR3_REG_SSA; + reg->instr = block_temporary(ctx->block, n); + break; + } + + if ((reg->flags & IR3_REG_SSA) && !reg->instr) { + /* this can happen when registers (or components of a TGSI + * register) are used as src before they have been assigned + * (undefined contents). To avoid confusing the rest of the + * compiler, and to generally keep things peachy, substitute + * an instruction that sets the src to 0.0. Or to keep + * things undefined, I could plug in a random number? :-P + * + * NOTE: *don't* use instr_create() here! + */ + reg->instr = create_immed(ctx, 0.0); + } +} + +static struct ir3_register * +add_dst_reg_wrmask(struct ir3_compile_context *ctx, + struct ir3_instruction *instr, const struct tgsi_dst_register *dst, + unsigned chan, unsigned wrmask) +{ + unsigned flags = 0, num = 0; + struct ir3_register *reg; + + switch (dst->File) { + case TGSI_FILE_OUTPUT: + case TGSI_FILE_TEMPORARY: + /* uses SSA */ + break; + case TGSI_FILE_ADDRESS: + flags |= IR3_REG_ADDR; + /* uses SSA */ + break; + default: + compile_error(ctx, "unsupported dst register file: %s\n", + tgsi_file_name(dst->File)); + break; + } + + if (dst->Indirect) + flags |= IR3_REG_RELATIV; + + reg = ir3_reg_create(instr, regid(num, chan), flags); + + /* NOTE: do not call ssa_dst() if atomic.. vectorize() + * itself will call ssa_dst(). This is to filter out + * the (initially bogus) .x component dst which is + * created (but not necessarily used, ie. 
if the net
+	 * result of the vector operation does not write to
+	 * the .x component)
+	 */
+
+	reg->wrmask = wrmask;
+	if (wrmask == 0x1) {
+		/* normal case */
+		if (!ctx->atomic)
+			ssa_dst(ctx, instr, dst, chan);
+	} else if ((dst->File == TGSI_FILE_TEMPORARY) ||
+			(dst->File == TGSI_FILE_OUTPUT) ||
+			(dst->File == TGSI_FILE_ADDRESS)) {
+		unsigned i;
+
+		/* if instruction writes multiple, we need to create
+		 * some place-holder to collect the registers:
+		 */
+		for (i = 0; i < 4; i++) {
+			if (wrmask & (1 << i)) {
+				struct ir3_instruction *collect =
+						ir3_instr_create(ctx->block, -1, OPC_META_FO);
+				collect->fo.off = i;
+				/* unused dst reg: */
+				ir3_reg_create(collect, 0, 0);
+				/* and src reg used to hold original instr */
+				ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr;
+				if (!ctx->atomic)
+					ssa_dst(ctx, collect, dst, chan+i);
+			}
+		}
+	}
+
+	return reg;
+}
+
+static struct ir3_register *
+add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+		const struct tgsi_dst_register *dst, unsigned chan)
+{
+	return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1);
+}
+
+static struct ir3_register *
+add_src_reg_wrmask(struct ir3_compile_context *ctx,
+		struct ir3_instruction *instr, const struct tgsi_src_register *src,
+		unsigned chan, unsigned wrmask)
+{
+	unsigned flags = 0, num = 0;
+	struct ir3_register *reg;
+	struct ir3_instruction *orig = NULL;
+
+	/* TODO we need to use a mov to temp for const >= 64.. or maybe
+	 * we could use relative addressing..
+	 */
+	compile_assert(ctx, src->Index < 64);
+
+	switch (src->File) {
+	case TGSI_FILE_IMMEDIATE:
+		/* TODO if possible, use actual immediate instead of const.. but
+		 * TGSI has vec4 immediates, we can only embed scalar (of limited
+		 * size, depending on instruction..)
+		 */
+		flags |= IR3_REG_CONST;
+		num = src->Index + ctx->so->first_immediate;
+		break;
+	case TGSI_FILE_CONSTANT:
+		flags |= IR3_REG_CONST;
+		num = src->Index;
+		break;
+	case TGSI_FILE_OUTPUT:
+		/* NOTE: we should only end up w/ OUTPUT file for things like
+		 * clamp()'ing saturated dst instructions
+		 */
+	case TGSI_FILE_INPUT:
+	case TGSI_FILE_TEMPORARY:
+		/* uses SSA */
+		break;
+	default:
+		compile_error(ctx, "unsupported src register file: %s\n",
+			tgsi_file_name(src->File));
+		break;
+	}
+
+	if (src->Absolute)
+		flags |= IR3_REG_ABS;
+	if (src->Negate)
+		flags |= IR3_REG_NEGATE;
+
+	if (src->Indirect) {
+		flags |= IR3_REG_RELATIV;
+
+		/* shouldn't happen, and we can't cope with it below: */
+		compile_assert(ctx, wrmask == 0x1);
+
+		/* wrap in a meta-deref to track both the src and address: */
+		orig = instr;
+
+		instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF);
+		ir3_reg_create(instr, 0, 0);
+		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address;
+	}
+
+	reg = ir3_reg_create(instr, regid(num, chan), flags);
+
+	reg->wrmask = wrmask;
+	if (wrmask == 0x1) {
+		/* normal case */
+		ssa_src(ctx, reg, src, chan);
+	} else if ((src->File == TGSI_FILE_TEMPORARY) ||
+			(src->File == TGSI_FILE_OUTPUT) ||
+			(src->File == TGSI_FILE_INPUT)) {
+		struct ir3_instruction *collect;
+		unsigned i;
+
+		compile_assert(ctx, !src->Indirect);
+
+		/* if instruction reads multiple, we need to create
+		 * some place-holder to collect the registers:
+		 */
+		collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
+		ir3_reg_create(collect, 0, 0); /* unused dst reg */
+
+		for (i = 0; i < 4; i++) {
+			if (wrmask & (1 << i)) {
+				/* and src reg used to point to the original instr */
+				ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
+						src, chan + i);
+			} else if (wrmask & ~((1 << i) - 1)) {
+				/* if any remaining components, then dummy
+				 * placeholder src reg to fill in the blanks:
+				 */
+				ir3_reg_create(collect, 0, 0);
+			}
+		}
+
+		reg->flags |= IR3_REG_SSA;
+		reg->instr = collect;
+	}
+
+	if (src->Indirect) {
+		reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA);
+		reg->instr = instr;
+	}
+	return reg;
+}
+
+static struct ir3_register *
+add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+		const struct tgsi_src_register *src, unsigned chan)
+{
+	return add_src_reg_wrmask(ctx, instr, src, chan, 0x1);
+}
+
+static void
+src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
+{
+	src->File = dst->File;
+	src->Indirect = dst->Indirect;
+	src->Dimension = dst->Dimension;
+	src->Index = dst->Index;
+	src->Absolute = 0;
+	src->Negate = 0;
+	src->SwizzleX = TGSI_SWIZZLE_X;
+	src->SwizzleY = TGSI_SWIZZLE_Y;
+	src->SwizzleZ = TGSI_SWIZZLE_Z;
+	src->SwizzleW = TGSI_SWIZZLE_W;
+}
+
+/* Get internal-temp src/dst to use for a sequence of instructions
+ * generated by a single TGSI op.
+ */
+static struct tgsi_src_register *
+get_internal_temp(struct ir3_compile_context *ctx,
+		struct tgsi_dst_register *tmp_dst)
+{
+	struct tgsi_src_register *tmp_src;
+	int n;
+
+	tmp_dst->File = TGSI_FILE_TEMPORARY;
+	tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
+	tmp_dst->Indirect = 0;
+	tmp_dst->Dimension = 0;
+
+	/* assign next temporary: */
+	n = ctx->num_internal_temps++;
+	compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
+	tmp_src = &ctx->internal_temps[n];
+
+	tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;
+
+	src_from_dst(tmp_src, tmp_dst);
+
+	return tmp_src;
+}
+
+static inline bool
+is_const(struct tgsi_src_register *src)
+{
+	return (src->File == TGSI_FILE_CONSTANT) ||
+			(src->File == TGSI_FILE_IMMEDIATE);
+}
+
+static inline bool
+is_relative(struct tgsi_src_register *src)
+{
+	return src->Indirect;
+}
+
+static inline bool
+is_rel_or_const(struct tgsi_src_register *src)
+{
+	return is_relative(src) || is_const(src);
+}
+
+static type_t
+get_ftype(struct ir3_compile_context *ctx)
+{
+	return TYPE_F32;
+}
+
+static type_t
+get_utype(struct ir3_compile_context *ctx)
+{
+	return TYPE_U32;
+}
+
+static unsigned
+src_swiz(struct tgsi_src_register *src, int chan)
+{
+	switch (chan) {
+	case 0: return src->SwizzleX;
+	case 1: return src->SwizzleY;
+	case 2: return src->SwizzleZ;
+	case 3: return src->SwizzleW;
+	}
+	assert(0);
+	return 0;
+}
+
+/* for instructions that cannot take a const register as src, if needed
+ * generate a move to temporary gpr:
+ */
+static struct tgsi_src_register *
+get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
+{
+	struct tgsi_dst_register tmp_dst;
+	struct tgsi_src_register *tmp_src;
+
+	compile_assert(ctx, is_rel_or_const(src));
+
+	tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+	create_mov(ctx, &tmp_dst, src);
+
+	return tmp_src;
+}
+
+static void
+get_immediate(struct ir3_compile_context *ctx,
+		struct tgsi_src_register *reg, uint32_t val)
+{
+	unsigned neg, swiz, idx, i;
+	/* actually maps 1:1 currently..
not sure if that is safe to rely on: */ + static const unsigned swiz2tgsi[] = { + TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W, + }; + + for (i = 0; i < ctx->immediate_idx; i++) { + swiz = i % 4; + idx = i / 4; + + if (ctx->so->immediates[idx].val[swiz] == val) { + neg = 0; + break; + } + + if (ctx->so->immediates[idx].val[swiz] == -val) { + neg = 1; + break; + } + } + + if (i == ctx->immediate_idx) { + /* need to generate a new immediate: */ + swiz = i % 4; + idx = i / 4; + neg = 0; + ctx->so->immediates[idx].val[swiz] = val; + ctx->so->immediates_count = idx + 1; + ctx->immediate_idx++; + } + + reg->File = TGSI_FILE_IMMEDIATE; + reg->Indirect = 0; + reg->Dimension = 0; + reg->Index = idx; + reg->Absolute = 0; + reg->Negate = neg; + reg->SwizzleX = swiz2tgsi[swiz]; + reg->SwizzleY = swiz2tgsi[swiz]; + reg->SwizzleZ = swiz2tgsi[swiz]; + reg->SwizzleW = swiz2tgsi[swiz]; +} + +static void +create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst, + struct tgsi_src_register *src) +{ + type_t type_mov = get_ftype(ctx); + unsigned i; + + for (i = 0; i < 4; i++) { + /* move to destination: */ + if (dst->WriteMask & (1 << i)) { + struct ir3_instruction *instr; + + if (src->Absolute || src->Negate) { + /* can't have abs or neg on a mov instr, so use + * absneg.f instead to handle these cases: + */ + instr = instr_create(ctx, 2, OPC_ABSNEG_F); + } else { + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = type_mov; + instr->cat1.dst_type = type_mov; + } + + add_dst_reg(ctx, instr, dst, i); + add_src_reg(ctx, instr, src, src_swiz(src, i)); + } + } +} + +static void +create_clamp(struct ir3_compile_context *ctx, + struct tgsi_dst_register *dst, struct tgsi_src_register *val, + struct tgsi_src_register *minval, struct tgsi_src_register *maxval) +{ + struct ir3_instruction *instr; + + instr = instr_create(ctx, 2, OPC_MAX_F); + vectorize(ctx, instr, dst, 2, val, 0, minval, 0); + + instr = instr_create(ctx, 2, OPC_MIN_F); + vectorize(ctx, instr, dst, 2, val, 0, maxval, 0); +} + +static void +create_clamp_imm(struct ir3_compile_context *ctx, + struct tgsi_dst_register *dst, + uint32_t minval, uint32_t maxval) +{ + struct tgsi_src_register minconst, maxconst; + struct tgsi_src_register src; + + src_from_dst(&src, dst); + + get_immediate(ctx, &minconst, minval); + get_immediate(ctx, &maxconst, maxval); + + create_clamp(ctx, dst, &src, &minconst, &maxconst); +} + +static struct tgsi_dst_register * +get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = &inst->Dst[0].Register; + unsigned i; + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + struct tgsi_src_register *src = &inst->Src[i].Register; + if ((src->File == dst->File) && (src->Index == dst->Index)) { + if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) && + (src->SwizzleX == TGSI_SWIZZLE_X) && + (src->SwizzleY == TGSI_SWIZZLE_Y) && + (src->SwizzleZ == TGSI_SWIZZLE_Z) && + (src->SwizzleW == TGSI_SWIZZLE_W)) + continue; + ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst); + ctx->tmp_dst.WriteMask = dst->WriteMask; + dst = &ctx->tmp_dst; + break; + } + } + return dst; +} + +static void +put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst, + struct tgsi_dst_register *dst) +{ + /* if necessary, add mov back into original dst: */ + if (dst != &inst->Dst[0].Register) { + create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src); + } +} + +/* helper to generate the necessary repeat and/or additional instructions + * to turn a scalar 
instruction into a vector operation: + */ +static void +vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + struct tgsi_dst_register *dst, int nsrcs, ...) +{ + va_list ap; + int i, j, n = 0; + + instr_atomic_start(ctx); + + add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X); + + va_start(ap, nsrcs); + for (j = 0; j < nsrcs; j++) { + struct tgsi_src_register *src = + va_arg(ap, struct tgsi_src_register *); + unsigned flags = va_arg(ap, unsigned); + struct ir3_register *reg; + if (flags & IR3_REG_IMMED) { + reg = ir3_reg_create(instr, 0, IR3_REG_IMMED); + /* this is an ugly cast.. should have put flags first! */ + reg->iim_val = *(int *)&src; + } else { + reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X); + } + reg->flags |= flags & ~IR3_REG_NEGATE; + if (flags & IR3_REG_NEGATE) + reg->flags ^= IR3_REG_NEGATE; + } + va_end(ap); + + for (i = 0; i < 4; i++) { + if (dst->WriteMask & (1 << i)) { + struct ir3_instruction *cur; + + if (n++ == 0) { + cur = instr; + } else { + cur = instr_clone(ctx, instr); + } + + ssa_dst(ctx, cur, dst, i); + + /* fix-up dst register component: */ + cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i); + + /* fix-up src register component: */ + va_start(ap, nsrcs); + for (j = 0; j < nsrcs; j++) { + struct ir3_register *reg = cur->regs[j+1]; + struct tgsi_src_register *src = + va_arg(ap, struct tgsi_src_register *); + unsigned flags = va_arg(ap, unsigned); + if (reg->flags & IR3_REG_SSA) { + ssa_src(ctx, reg, src, src_swiz(src, i)); + } else if (!(flags & IR3_REG_IMMED)) { + reg->num = regid(reg->num >> 2, src_swiz(src, i)); + } + } + va_end(ap); + } + } + + instr_atomic_end(ctx); +} + +/* + * Handlers for TGSI instructions which do not have a 1:1 mapping to + * native instructions: + */ + +static void +trans_clamp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct tgsi_src_register *src2 = &inst->Src[2].Register; + + create_clamp(ctx, dst, src0, src1, src2); + + put_dst(ctx, inst, dst); +} + +/* ARL(x) = x, but mova from hrN.x to a0.. */ +static void +trans_arl(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + struct tgsi_dst_register *dst = &inst->Dst[0].Register; + struct tgsi_src_register *src = &inst->Src[0].Register; + unsigned chan = src->SwizzleX; + + compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS); + + /* NOTE: we allocate a temporary from a flat register + * namespace (ignoring half vs full). It turns out + * not to really matter since registers get reassigned + * later in ir3_ra which (hopefully!) can deal a bit + * better with mixed half and full precision. 
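
vectorize() above is the workhorse for the ALU translaters: it builds the .x instance, then emits one instr_clone() per additional enabled WriteMask channel, patching the dst component and re-resolving each src swizzle. For instance, the first create_clamp() call, with a dst WriteMask of xz, conceptually expands as follows (a sketch, not literal disassembly):

	instr = instr_create(ctx, 2, OPC_MAX_F);
	vectorize(ctx, instr, dst, 2, val, 0, minval, 0);
	/* emits, for WriteMask == X|Z:
	 *   max.f dst.x, val.<swiz x>, minval.<swiz x>
	 *   max.f dst.z, val.<swiz z>, minval.<swiz z>
	 * with the second instruction produced by instr_clone()
	 */
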
+ */ + tmp_src = get_internal_temp(ctx, &tmp_dst); + + /* cov.f{32,16}s16 Rtmp, Rsrc */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = get_ftype(ctx); + instr->cat1.dst_type = TYPE_S16; + add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, src, chan); + + /* shl.b Rtmp, Rtmp, 2 */ + instr = instr_create(ctx, 2, OPC_SHL_B); + add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2; + + /* mova a0, Rtmp */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = TYPE_S16; + instr->cat1.dst_type = TYPE_S16; + add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; +} + +/* + * texture fetch/sample instructions: + */ + +struct tex_info { + int8_t order[4]; + unsigned src_wrmask, flags; +}; + +static const struct tex_info * +get_tex_info(struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + static const struct tex_info tex1d = { + .order = { 0, -1, -1, -1 }, /* coord.x */ + .src_wrmask = TGSI_WRITEMASK_XY, + .flags = 0, + }; + static const struct tex_info tex1ds = { + .order = { 0, -1, 2, -1 }, /* coord.xz */ + .src_wrmask = TGSI_WRITEMASK_XYZ, + .flags = IR3_INSTR_S, + }; + static const struct tex_info tex2d = { + .order = { 0, 1, -1, -1 }, /* coord.xy */ + .src_wrmask = TGSI_WRITEMASK_XY, + .flags = 0, + }; + static const struct tex_info tex2ds = { + .order = { 0, 1, 2, -1 }, /* coord.xyz */ + .src_wrmask = TGSI_WRITEMASK_XYZ, + .flags = IR3_INSTR_S, + }; + static const struct tex_info tex3d = { + .order = { 0, 1, 2, -1 }, /* coord.xyz */ + .src_wrmask = TGSI_WRITEMASK_XYZ, + .flags = IR3_INSTR_3D, + }; + static const struct tex_info tex3ds = { + .order = { 0, 1, 2, 3 }, /* coord.xyzw */ + .src_wrmask = TGSI_WRITEMASK_XYZW, + .flags = IR3_INSTR_S | IR3_INSTR_3D, + }; + static const struct tex_info txp1d = { + .order = { 0, -1, 3, -1 }, /* coord.xw */ + .src_wrmask = TGSI_WRITEMASK_XYZ, + .flags = IR3_INSTR_P, + }; + static const struct tex_info txp1ds = { + .order = { 0, -1, 2, 3 }, /* coord.xzw */ + .src_wrmask = TGSI_WRITEMASK_XYZW, + .flags = IR3_INSTR_P | IR3_INSTR_S, + }; + static const struct tex_info txp2d = { + .order = { 0, 1, 3, -1 }, /* coord.xyw */ + .src_wrmask = TGSI_WRITEMASK_XYZ, + .flags = IR3_INSTR_P, + }; + static const struct tex_info txp2ds = { + .order = { 0, 1, 2, 3 }, /* coord.xyzw */ + .src_wrmask = TGSI_WRITEMASK_XYZW, + .flags = IR3_INSTR_P | IR3_INSTR_S, + }; + static const struct tex_info txp3d = { + .order = { 0, 1, 2, 3 }, /* coord.xyzw */ + .src_wrmask = TGSI_WRITEMASK_XYZW, + .flags = IR3_INSTR_P | IR3_INSTR_3D, + }; + + unsigned tex = inst->Texture.Texture; + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_TEX: + switch (tex) { + case TGSI_TEXTURE_1D: + return &tex1d; + case TGSI_TEXTURE_SHADOW1D: + return &tex1ds; + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + return &tex2d; + case TGSI_TEXTURE_SHADOW2D: + case TGSI_TEXTURE_SHADOWRECT: + return &tex2ds; + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + return &tex3d; + case TGSI_TEXTURE_SHADOWCUBE: + return &tex3ds; + default: + compile_error(ctx, "unknown texture type: %s\n", + tgsi_texture_names[tex]); + return NULL; + } + break; + case TGSI_OPCODE_TXP: + switch (tex) { + case TGSI_TEXTURE_1D: + return &txp1d; + case TGSI_TEXTURE_SHADOW1D: + return &txp1ds; + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + return &txp2d; + 
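
The order[] tables read as "which incoming coordinate channel feeds sam src component i", with -1 marking unused slots. So txp2d ({ 0, 1, 3, -1 }) routes coord.w, the projector, into the third component, and get_tex_coord() below emits movs whenever the incoming swizzle is not already laid out that way. An illustrative case:

	coord = get_tex_coord(ctx, inst, tinf);
	/* for TXP on 2D with coord = IN[0].xyzw and tinf = &txp2d:
	 *   mov tmp.x, coord.x
	 *   mov tmp.y, coord.y
	 *   mov tmp.z, coord.w      (projector moved into slot 2)
	 * then the sam is emitted with src_wrmask = TGSI_WRITEMASK_XYZ
	 * and IR3_INSTR_P set
	 */
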
case TGSI_TEXTURE_SHADOW2D: + case TGSI_TEXTURE_SHADOWRECT: + return &txp2ds; + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + return &txp3d; + default: + compile_error(ctx, "unknown texture type: %s\n", + tgsi_texture_names[tex]); + break; + } + break; + } + compile_assert(ctx, 0); + return NULL; +} + +static struct tgsi_src_register * +get_tex_coord(struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst, + const struct tex_info *tinf) +{ + struct tgsi_src_register *coord = &inst->Src[0].Register; + struct ir3_instruction *instr; + unsigned tex = inst->Texture.Texture; + bool needs_mov = false; + unsigned i; + + /* cat5 instruction cannot seem to handle const or relative: */ + if (is_rel_or_const(coord)) + needs_mov = true; + + /* 1D textures we fix up w/ 0.0 as 2nd coord: */ + if ((tex == TGSI_TEXTURE_1D) || (tex == TGSI_TEXTURE_SHADOW1D)) + needs_mov = true; + + /* The texture sample instructions need to coord in successive + * registers/components (ie. src.xy but not src.yx). And TXP + * needs the .w component in .z for 2D.. so in some cases we + * might need to emit some mov instructions to shuffle things + * around: + */ + for (i = 1; (i < 4) && (tinf->order[i] >= 0) && !needs_mov; i++) + if (src_swiz(coord, i) != (src_swiz(coord, 0) + tinf->order[i])) + needs_mov = true; + + if (needs_mov) { + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + unsigned j; + + type_t type_mov = get_ftype(ctx); + + /* need to move things around: */ + tmp_src = get_internal_temp(ctx, &tmp_dst); + + for (j = 0; j < 4; j++) { + if (tinf->order[j] < 0) + continue; + instr = instr_create(ctx, 1, 0); /* mov */ + instr->cat1.src_type = type_mov; + instr->cat1.dst_type = type_mov; + add_dst_reg(ctx, instr, &tmp_dst, j); + add_src_reg(ctx, instr, coord, + src_swiz(coord, tinf->order[j])); + } + + /* fix up .y coord: */ + if ((tex == TGSI_TEXTURE_1D) || + (tex == TGSI_TEXTURE_SHADOW1D)) { + instr = instr_create(ctx, 1, 0); /* mov */ + instr->cat1.src_type = type_mov; + instr->cat1.dst_type = type_mov; + add_dst_reg(ctx, instr, &tmp_dst, 1); /* .y */ + ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = 0.5; + } + + coord = tmp_src; + } + + return coord; +} + +static void +trans_samp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register *dst = &inst->Dst[0].Register; + struct tgsi_src_register *coord; + struct tgsi_src_register *samp = &inst->Src[1].Register; + const struct tex_info *tinf; + + tinf = get_tex_info(ctx, inst); + coord = get_tex_coord(ctx, inst, tinf); + + instr = instr_create(ctx, 5, t->opc); + instr->cat5.type = get_ftype(ctx); + instr->cat5.samp = samp->Index; + instr->cat5.tex = samp->Index; + instr->flags |= tinf->flags; + + add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask); + add_src_reg_wrmask(ctx, instr, coord, coord->SwizzleX, tinf->src_wrmask); +} + +/* + * SEQ(a,b) = (a == b) ? 1.0 : 0.0 + * cmps.f.eq tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SNE(a,b) = (a != b) ? 1.0 : 0.0 + * cmps.f.ne tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SGE(a,b) = (a >= b) ? 1.0 : 0.0 + * cmps.f.ge tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SLE(a,b) = (a <= b) ? 1.0 : 0.0 + * cmps.f.le tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SGT(a,b) = (a > b) ? 1.0 : 0.0 + * cmps.f.gt tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SLT(a,b) = (a < b) ? 1.0 : 0.0 + * cmps.f.lt tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * CMP(a,b,c) = (a < 0.0) ? 
b : c
+ * cmps.f.lt tmp0, a, {0.0}
+ * sel.b16 dst, b, tmp0, c
+ */
+static void
+trans_cmp(const struct instr_translater *t,
+		struct ir3_compile_context *ctx,
+		struct tgsi_full_instruction *inst)
+{
+	struct ir3_instruction *instr;
+	struct tgsi_dst_register tmp_dst;
+	struct tgsi_src_register *tmp_src;
+	struct tgsi_src_register constval0;
+	/* final instruction for CMP() uses orig src1 and src2: */
+	struct tgsi_dst_register *dst = get_dst(ctx, inst);
+	struct tgsi_src_register *a0, *a1, *a2;
+	unsigned condition;
+
+	tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+	a0 = &inst->Src[0].Register; /* a */
+	a1 = &inst->Src[1].Register; /* b */
+
+	switch (t->tgsi_opc) {
+	case TGSI_OPCODE_SEQ:
+	case TGSI_OPCODE_FSEQ:
+		condition = IR3_COND_EQ;
+		break;
+	case TGSI_OPCODE_SNE:
+	case TGSI_OPCODE_FSNE:
+		condition = IR3_COND_NE;
+		break;
+	case TGSI_OPCODE_SGE:
+	case TGSI_OPCODE_FSGE:
+		condition = IR3_COND_GE;
+		break;
+	case TGSI_OPCODE_SLT:
+	case TGSI_OPCODE_FSLT:
+		condition = IR3_COND_LT;
+		break;
+	case TGSI_OPCODE_SLE:
+		condition = IR3_COND_LE;
+		break;
+	case TGSI_OPCODE_SGT:
+		condition = IR3_COND_GT;
+		break;
+	case TGSI_OPCODE_CMP:
+		get_immediate(ctx, &constval0, fui(0.0));
+		a0 = &inst->Src[0].Register; /* a */
+		a1 = &constval0;             /* {0.0} */
+		condition = IR3_COND_LT;
+		break;
+	default:
+		compile_assert(ctx, 0);
+		return;
+	}
+
+	if (is_const(a0) && is_const(a1))
+		a0 = get_unconst(ctx, a0);
+
+	/* cmps.f.<cond> tmp, a0, a1 */
+	instr = instr_create(ctx, 2, OPC_CMPS_F);
+	instr->cat2.condition = condition;
+	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
+
+	switch (t->tgsi_opc) {
+	case TGSI_OPCODE_SEQ:
+	case TGSI_OPCODE_FSEQ:
+	case TGSI_OPCODE_SGE:
+	case TGSI_OPCODE_FSGE:
+	case TGSI_OPCODE_SLE:
+	case TGSI_OPCODE_SNE:
+	case TGSI_OPCODE_FSNE:
+	case TGSI_OPCODE_SGT:
+	case TGSI_OPCODE_SLT:
+	case TGSI_OPCODE_FSLT:
+		/* cov.u16f16 dst, tmp0 */
+		instr = instr_create(ctx, 1, 0);
+		instr->cat1.src_type = get_utype(ctx);
+		instr->cat1.dst_type = get_ftype(ctx);
+		vectorize(ctx, instr, dst, 1, tmp_src, 0);
+		break;
+	case TGSI_OPCODE_CMP:
+		a1 = &inst->Src[1].Register;
+		a2 = &inst->Src[2].Register;
+		/* sel.{b32,b16} dst, src2, tmp, src1 */
+		instr = instr_create(ctx, 3, OPC_SEL_B32);
+		vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
+
+		break;
+	}
+
+	put_dst(ctx, inst, dst);
+}
+
+/*
+ * USNE(a,b) = (a != b) ? 1 : 0
+ * cmps.u32.ne dst, a, b
+ *
+ * USEQ(a,b) = (a == b) ? 1 : 0
+ * cmps.u32.eq dst, a, b
+ *
+ * ISGE(a,b) = (a >= b) ? 1 : 0
+ * cmps.s32.ge dst, a, b
+ *
+ * USGE(a,b) = (a >= b) ? 1 : 0
+ * cmps.u32.ge dst, a, b
+ *
+ * ISLT(a,b) = (a < b) ? 1 : 0
+ * cmps.s32.lt dst, a, b
+ *
+ * USLT(a,b) = (a < b) ? 1 : 0
+ * cmps.u32.lt dst, a, b
+ *
+ * UCMP(a,b,c) = (a < 0) ?
b : c + * cmps.u32.lt tmp0, a, {0} + * sel.b16 dst, b, tmp0, c + */ +static void +trans_icmp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register constval0; + struct tgsi_src_register *a0, *a1, *a2; + unsigned condition; + + a0 = &inst->Src[0].Register; /* a */ + a1 = &inst->Src[1].Register; /* b */ + + switch (t->tgsi_opc) { + case TGSI_OPCODE_USNE: + condition = IR3_COND_NE; + break; + case TGSI_OPCODE_USEQ: + condition = IR3_COND_EQ; + break; + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_USGE: + condition = IR3_COND_GE; + break; + case TGSI_OPCODE_ISLT: + case TGSI_OPCODE_USLT: + condition = IR3_COND_LT; + break; + case TGSI_OPCODE_UCMP: + get_immediate(ctx, &constval0, 0); + a0 = &inst->Src[0].Register; /* a */ + a1 = &constval0; /* {0} */ + condition = IR3_COND_LT; + break; + + default: + compile_assert(ctx, 0); + return; + } + + if (is_const(a0) && is_const(a1)) + a0 = get_unconst(ctx, a0); + + if (t->tgsi_opc == TGSI_OPCODE_UCMP) { + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + tmp_src = get_internal_temp(ctx, &tmp_dst); + /* cmps.u32.lt tmp, a0, a1 */ + instr = instr_create(ctx, 2, t->opc); + instr->cat2.condition = condition; + vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0); + + a1 = &inst->Src[1].Register; + a2 = &inst->Src[2].Register; + /* sel.{b32,b16} dst, src2, tmp, src1 */ + instr = instr_create(ctx, 3, OPC_SEL_B32); + vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0); + } else { + /* cmps.{u32,s32}.<cond> dst, a0, a1 */ + instr = instr_create(ctx, 2, t->opc); + instr->cat2.condition = condition; + vectorize(ctx, instr, dst, 2, a0, 0, a1, 0); + } + put_dst(ctx, inst, dst); +} + +/* + * Conditional / Flow control + */ + +static void +push_branch(struct ir3_compile_context *ctx, bool inv, + struct ir3_instruction *instr, struct ir3_instruction *cond) +{ + unsigned int idx = ctx->branch_count++; + compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch)); + ctx->branch[idx].instr = instr; + ctx->branch[idx].inv = inv; + /* else side of branch has same condition: */ + if (!inv) + ctx->branch[idx].cond = cond; +} + +static struct ir3_instruction * +pop_branch(struct ir3_compile_context *ctx) +{ + unsigned int idx = --ctx->branch_count; + return ctx->branch[idx].instr; +} + +static void +trans_if(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr, *cond; + struct tgsi_src_register *src = &inst->Src[0].Register; + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + struct tgsi_src_register constval; + + get_immediate(ctx, &constval, fui(0.0)); + tmp_src = get_internal_temp(ctx, &tmp_dst); + + if (is_const(src)) + src = get_unconst(ctx, src); + + /* cmps.f.ne tmp0, b, {0.0} */ + instr = instr_create(ctx, 2, OPC_CMPS_F); + add_dst_reg(ctx, instr, &tmp_dst, 0); + add_src_reg(ctx, instr, src, src->SwizzleX); + add_src_reg(ctx, instr, &constval, constval.SwizzleX); + instr->cat2.condition = IR3_COND_NE; + + compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */ + cond = instr->regs[1]->instr; + + /* meta:flow tmp0 */ + instr = instr_create(ctx, -1, OPC_META_FLOW); + ir3_reg_create(instr, 0, 0); /* dummy dst */ + add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X); + + push_branch(ctx, false, instr, cond); + instr->flow.if_block = push_block(ctx); +} + 
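
Tying trans_if() to the PHI machinery below: a TGSI if/else becomes a cmps.f.ne feeding a meta:flow instruction, a pair of nested blocks, and, at ENDIF, one meta:phi per temporary or output written on either side. As an illustrative trace, not verbatim compiler output:

	/* TGSI:                      conceptually becomes:
	 *   IF TEMP[0].xxxx            cmps.f.ne tmp, TEMP[0].x, {0.0}
	 *     MOV TEMP[1], IMM[0]      meta:flow tmp   (pushed on ctx->branch[])
	 *   ELSE                       if_block:   producer A for TEMP[1]
	 *     MOV TEMP[1], IMM[1]      else_block: producer B for TEMP[1]
	 *   ENDIF                      meta:phi(flow, A, B) -> temporaries[TEMP[1]]
	 */

Since flow and phi are category -1 meta instructions they carry no cost themselves; flattening (ir3_block_flatten()) and scheduling decide what, if anything, they ultimately lower to.
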
+static void +trans_else(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + + pop_block(ctx); + + instr = pop_branch(ctx); + + compile_assert(ctx, (instr->category == -1) && + (instr->opc == OPC_META_FLOW)); + + push_branch(ctx, true, instr, NULL); + instr->flow.else_block = push_block(ctx); +} + +static struct ir3_instruction * +find_temporary(struct ir3_block *block, unsigned n) +{ + if (block->parent && !block->temporaries[n]) + return find_temporary(block->parent, n); + return block->temporaries[n]; +} + +static struct ir3_instruction * +find_output(struct ir3_block *block, unsigned n) +{ + if (block->parent && !block->outputs[n]) + return find_output(block->parent, n); + return block->outputs[n]; +} + +static struct ir3_instruction * +create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond, + struct ir3_instruction *a, struct ir3_instruction *b) +{ + struct ir3_instruction *phi; + + compile_assert(ctx, cond); + + /* Either side of the condition could be null.. which + * indicates a variable written on only one side of the + * branch. Normally this should only be variables not + * used outside of that side of the branch. So we could + * just 'return a ? a : b;' in that case. But for better + * defined undefined behavior we just stick in imm{0.0}. + * In the common case of a value only used within the + * one side of the branch, the PHI instruction will not + * get scheduled + */ + if (!a) + a = create_immed(ctx, 0.0); + if (!b) + b = create_immed(ctx, 0.0); + + phi = instr_create(ctx, -1, OPC_META_PHI); + ir3_reg_create(phi, 0, 0); /* dummy dst */ + ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond; + ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a; + ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b; + + return phi; +} + +static void +trans_endif(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct ir3_block *ifb, *elseb; + struct ir3_instruction **ifout, **elseout; + unsigned i, ifnout = 0, elsenout = 0; + + pop_block(ctx); + + instr = pop_branch(ctx); + + compile_assert(ctx, (instr->category == -1) && + (instr->opc == OPC_META_FLOW)); + + ifb = instr->flow.if_block; + elseb = instr->flow.else_block; + /* if there is no else block, the parent block is used for the + * branch-not-taken src of the PHI instructions: + */ + if (!elseb) + elseb = ifb->parent; + + /* worst case sizes: */ + ifnout = ifb->ntemporaries + ifb->noutputs; + elsenout = elseb->ntemporaries + elseb->noutputs; + + ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout); + if (elseb != ifb->parent) + elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout); + + ifnout = 0; + elsenout = 0; + + /* generate PHI instructions for any temporaries written: */ + for (i = 0; i < ifb->ntemporaries; i++) { + struct ir3_instruction *a = ifb->temporaries[i]; + struct ir3_instruction *b = elseb->temporaries[i]; + + /* if temporary written in if-block, or if else block + * is present and temporary written in else-block: + */ + if (a || ((elseb != ifb->parent) && b)) { + struct ir3_instruction *phi; + + /* if only written on one side, find the closest + * enclosing update on other side: + */ + if (!a) + a = find_temporary(ifb, i); + if (!b) + b = find_temporary(elseb, i); + + ifout[ifnout] = a; + a = create_output(ifb, a, ifnout++); + + if (elseb != ifb->parent) { + elseout[elsenout] = b; + b = create_output(elseb, b, 
elsenout++); + } + + phi = create_phi(ctx, instr, a, b); + ctx->block->temporaries[i] = phi; + } + } + + compile_assert(ctx, ifb->noutputs == elseb->noutputs); + + /* .. and any outputs written: */ + for (i = 0; i < ifb->noutputs; i++) { + struct ir3_instruction *a = ifb->outputs[i]; + struct ir3_instruction *b = elseb->outputs[i]; + + /* if output written in if-block, or if else block + * is present and output written in else-block: + */ + if (a || ((elseb != ifb->parent) && b)) { + struct ir3_instruction *phi; + + /* if only written on one side, find the closest + * enclosing update on other side: + */ + if (!a) + a = find_output(ifb, i); + if (!b) + b = find_output(elseb, i); + + ifout[ifnout] = a; + a = create_output(ifb, a, ifnout++); + + if (elseb != ifb->parent) { + elseout[elsenout] = b; + b = create_output(elseb, b, elsenout++); + } + + phi = create_phi(ctx, instr, a, b); + ctx->block->outputs[i] = phi; + } + } + + ifb->noutputs = ifnout; + ifb->outputs = ifout; + + if (elseb != ifb->parent) { + elseb->noutputs = elsenout; + elseb->outputs = elseout; + } + + // TODO maybe we want to compact block->inputs? +} + +/* + * Kill + */ + +static void +trans_kill(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr, *immed, *cond = NULL; + bool inv = false; + + switch (t->tgsi_opc) { + case TGSI_OPCODE_KILL: + /* unconditional kill, use enclosing if condition: */ + if (ctx->branch_count > 0) { + unsigned int idx = ctx->branch_count - 1; + cond = ctx->branch[idx].cond; + inv = ctx->branch[idx].inv; + } else { + cond = create_immed(ctx, 1.0); + } + + break; + } + + compile_assert(ctx, cond); + + immed = create_immed(ctx, 0.0); + + /* cmps.f.ne p0.x, cond, {0.0} */ + instr = instr_create(ctx, 2, OPC_CMPS_F); + instr->cat2.condition = IR3_COND_NE; + ir3_reg_create(instr, regid(REG_P0, 0), 0); + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond; + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed; + cond = instr; + + /* kill p0.x */ + instr = instr_create(ctx, 0, OPC_KILL); + instr->cat0.inv = inv; + ir3_reg_create(instr, 0, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond; + + ctx->kill[ctx->kill_count++] = instr; +} + +/* + * Kill-If + */ + +static void +trans_killif(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_src_register *src = &inst->Src[0].Register; + struct ir3_instruction *instr, *immed, *cond = NULL; + bool inv = false; + + immed = create_immed(ctx, 0.0); + + /* cmps.f.ne p0.x, cond, {0.0} */ + instr = instr_create(ctx, 2, OPC_CMPS_F); + instr->cat2.condition = IR3_COND_NE; + ir3_reg_create(instr, regid(REG_P0, 0), 0); + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed; + add_src_reg(ctx, instr, src, src->SwizzleX); + + cond = instr; + + /* kill p0.x */ + instr = instr_create(ctx, 0, OPC_KILL); + instr->cat0.inv = inv; + ir3_reg_create(instr, 0, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond; + + ctx->kill[ctx->kill_count++] = instr; + +} +/* + * I2F / U2F / F2I / F2U + */ + +static void +trans_cov(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src = &inst->Src[0].Register; + + // cov.f32s32 dst, tmp0 / + instr = instr_create(ctx, 1, 0); + switch (t->tgsi_opc) { + case TGSI_OPCODE_U2F: 
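+		/* a sketch of the conversions this switch selects, in the
+		 * mnemonic style the disassembler uses (registers hypothetical):
+		 *
+		 *    U2F:  cov.u32f32 r0.x, r1.x
+		 *    I2F:  cov.s32f32 r0.x, r1.x
+		 *    F2U:  cov.f32u32 r0.x, r1.x
+		 *    F2I:  cov.f32s32 r0.x, r1.x
+		 */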
+ instr->cat1.src_type = TYPE_U32; + instr->cat1.dst_type = TYPE_F32; + break; + case TGSI_OPCODE_I2F: + instr->cat1.src_type = TYPE_S32; + instr->cat1.dst_type = TYPE_F32; + break; + case TGSI_OPCODE_F2U: + instr->cat1.src_type = TYPE_F32; + instr->cat1.dst_type = TYPE_U32; + break; + case TGSI_OPCODE_F2I: + instr->cat1.src_type = TYPE_F32; + instr->cat1.dst_type = TYPE_S32; + break; + + } + vectorize(ctx, instr, dst, 1, src, 0); +} + +/* + * Handlers for TGSI instructions which do have 1:1 mapping to native + * instructions: + */ + +static void +instr_cat0(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + instr_create(ctx, 0, t->opc); +} + +static void +instr_cat1(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src = &inst->Src[0].Register; + create_mov(ctx, dst, src); + put_dst(ctx, inst, dst); +} + +static void +instr_cat2(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct ir3_instruction *instr; + unsigned src0_flags = 0, src1_flags = 0; + + switch (t->tgsi_opc) { + case TGSI_OPCODE_ABS: + case TGSI_OPCODE_IABS: + src0_flags = IR3_REG_ABS; + break; + case TGSI_OPCODE_SUB: + case TGSI_OPCODE_INEG: + src1_flags = IR3_REG_NEGATE; + break; + } + + switch (t->opc) { + case OPC_ABSNEG_F: + case OPC_ABSNEG_S: + case OPC_CLZ_B: + case OPC_CLZ_S: + case OPC_SIGN_F: + case OPC_FLOOR_F: + case OPC_CEIL_F: + case OPC_RNDNE_F: + case OPC_RNDAZ_F: + case OPC_TRUNC_F: + case OPC_NOT_B: + case OPC_BFREV_B: + case OPC_SETRM: + case OPC_CBITS_B: + /* these only have one src reg */ + instr = instr_create(ctx, 2, t->opc); + vectorize(ctx, instr, dst, 1, src0, src0_flags); + break; + default: + if (is_const(src0) && is_const(src1)) + src0 = get_unconst(ctx, src0); + + instr = instr_create(ctx, 2, t->opc); + vectorize(ctx, instr, dst, 2, src0, src0_flags, + src1, src1_flags); + break; + } + + put_dst(ctx, inst, dst); +} + +static void +instr_cat3(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct ir3_instruction *instr; + + /* in particular, can't handle const for src1 for cat3.. + * for mad, we can swap first two src's if needed: + */ + if (is_rel_or_const(src1)) { + if (is_mad(t->opc) && !is_rel_or_const(src0)) { + struct tgsi_src_register *tmp; + tmp = src0; + src0 = src1; + src1 = tmp; + } else { + src1 = get_unconst(ctx, src1); + } + } + + instr = instr_create(ctx, 3, t->opc); + vectorize(ctx, instr, dst, 3, src0, 0, src1, 0, + &inst->Src[2].Register, 0); + put_dst(ctx, inst, dst); +} + +static void +instr_cat4(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src = &inst->Src[0].Register; + struct ir3_instruction *instr; + unsigned i; + + /* seems like blob compiler avoids const as src.. 
*/ + if (is_const(src)) + src = get_unconst(ctx, src); + + /* we need to replicate into each component: */ + for (i = 0; i < 4; i++) { + if (dst->WriteMask & (1 << i)) { + instr = instr_create(ctx, 4, t->opc); + add_dst_reg(ctx, instr, dst, i); + add_src_reg(ctx, instr, src, src->SwizzleX); + } + } + + put_dst(ctx, inst, dst); +} + +static const struct instr_translater translaters[TGSI_OPCODE_LAST] = { +#define INSTR(n, f, ...) \ + [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ } + + INSTR(MOV, instr_cat1), + INSTR(RCP, instr_cat4, .opc = OPC_RCP), + INSTR(RSQ, instr_cat4, .opc = OPC_RSQ), + INSTR(SQRT, instr_cat4, .opc = OPC_SQRT), + INSTR(MUL, instr_cat2, .opc = OPC_MUL_F), + INSTR(ADD, instr_cat2, .opc = OPC_ADD_F), + INSTR(SUB, instr_cat2, .opc = OPC_ADD_F), + INSTR(MIN, instr_cat2, .opc = OPC_MIN_F), + INSTR(MAX, instr_cat2, .opc = OPC_MAX_F), + INSTR(UADD, instr_cat2, .opc = OPC_ADD_U), + INSTR(IMIN, instr_cat2, .opc = OPC_MIN_S), + INSTR(UMIN, instr_cat2, .opc = OPC_MIN_U), + INSTR(IMAX, instr_cat2, .opc = OPC_MAX_S), + INSTR(UMAX, instr_cat2, .opc = OPC_MAX_U), + INSTR(AND, instr_cat2, .opc = OPC_AND_B), + INSTR(OR, instr_cat2, .opc = OPC_OR_B), + INSTR(NOT, instr_cat2, .opc = OPC_NOT_B), + INSTR(XOR, instr_cat2, .opc = OPC_XOR_B), + INSTR(UMUL, instr_cat2, .opc = OPC_MUL_U), + INSTR(SHL, instr_cat2, .opc = OPC_SHL_B), + INSTR(USHR, instr_cat2, .opc = OPC_SHR_B), + INSTR(ISHR, instr_cat2, .opc = OPC_ASHR_B), + INSTR(IABS, instr_cat2, .opc = OPC_ABSNEG_S), + INSTR(INEG, instr_cat2, .opc = OPC_ABSNEG_S), + INSTR(AND, instr_cat2, .opc = OPC_AND_B), + INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16), + INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F), + INSTR(CLAMP, trans_clamp), + INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F), + INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F), + INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F), + INSTR(CEIL, instr_cat2, .opc = OPC_CEIL_F), + INSTR(ARL, trans_arl), + INSTR(EX2, instr_cat4, .opc = OPC_EXP2), + INSTR(LG2, instr_cat4, .opc = OPC_LOG2), + INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F), + INSTR(COS, instr_cat4, .opc = OPC_COS), + INSTR(SIN, instr_cat4, .opc = OPC_SIN), + INSTR(TEX, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX), + INSTR(TXP, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP), + INSTR(SGT, trans_cmp), + INSTR(SLT, trans_cmp), + INSTR(FSLT, trans_cmp), + INSTR(SGE, trans_cmp), + INSTR(FSGE, trans_cmp), + INSTR(SLE, trans_cmp), + INSTR(SNE, trans_cmp), + INSTR(FSNE, trans_cmp), + INSTR(SEQ, trans_cmp), + INSTR(FSEQ, trans_cmp), + INSTR(CMP, trans_cmp), + INSTR(USNE, trans_icmp, .opc = OPC_CMPS_U), + INSTR(USEQ, trans_icmp, .opc = OPC_CMPS_U), + INSTR(ISGE, trans_icmp, .opc = OPC_CMPS_S), + INSTR(USGE, trans_icmp, .opc = OPC_CMPS_U), + INSTR(ISLT, trans_icmp, .opc = OPC_CMPS_S), + INSTR(USLT, trans_icmp, .opc = OPC_CMPS_U), + INSTR(UCMP, trans_icmp, .opc = OPC_CMPS_U), + INSTR(IF, trans_if), + INSTR(UIF, trans_if), + INSTR(ELSE, trans_else), + INSTR(ENDIF, trans_endif), + INSTR(END, instr_cat0, .opc = OPC_END), + INSTR(KILL, trans_kill, .opc = OPC_KILL), + INSTR(KILL_IF, trans_killif, .opc = OPC_KILL), + INSTR(I2F, trans_cov), + INSTR(U2F, trans_cov), + INSTR(F2I, trans_cov), + INSTR(F2U, trans_cov), +}; + +static ir3_semantic +decl_semantic(const struct tgsi_declaration_semantic *sem) +{ + return ir3_semantic_name(sem->Name, sem->Index); +} + +static struct ir3_instruction * +decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid, + unsigned j, unsigned inloc) +{ + struct 
ir3_instruction *instr;
+	struct ir3_register *src;
+
+	/* bary.f dst, #inloc, r0.x */
+	instr = instr_create(ctx, 2, OPC_BARY_F);
+	ir3_reg_create(instr, regid, 0);   /* dummy dst */
+	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
+	src = ir3_reg_create(instr, 0, IR3_REG_SSA);
+	src->wrmask = 0x3;
+	src->instr = ctx->frag_pos;
+
+	return instr;
+}
+
+/* TGSI_SEMANTIC_POSITION
+ * """"""""""""""""""""""
+ *
+ * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
+ * fragment shader input contains the fragment's window position.  The X
+ * component starts at zero and always increases from left to right.
+ * The Y component starts at zero and always increases but Y=0 may either
+ * indicate the top of the window or the bottom depending on the fragment
+ * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
+ * The Z coordinate ranges from 0 to 1 to represent depth from the front
+ * to the back of the Z buffer.  The W component contains the reciprocal
+ * of the interpolated vertex position W component.
+ */
+static struct ir3_instruction *
+decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid,
+		unsigned j)
+{
+	struct ir3_instruction *instr, *src;
+
+	compile_assert(ctx, !ctx->frag_coord[j]);
+
+	ctx->frag_coord[j] = create_input(ctx->block, NULL, 0);
+
+	switch (j) {
+	case 0: /* .x */
+	case 1: /* .y */
+		/* for frag_coord, we get unsigned values.. we need
+		 * to subtract (integer) 8 and divide by 16 (right-
+		 * shift by 4) then convert to float:
+		 */
+
+		/* add.s tmp, src, -8 */
+		instr = instr_create(ctx, 2, OPC_ADD_S);
+		ir3_reg_create(instr, regid, 0);    /* dummy dst */
+		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j];
+		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8;
+		src = instr;
+
+		/* shr.b tmp, tmp, 4 */
+		instr = instr_create(ctx, 2, OPC_SHR_B);
+		ir3_reg_create(instr, regid, 0);    /* dummy dst */
+		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
+		src = instr;
+
+		/* mov.u32f32 dst, tmp */
+		instr = instr_create(ctx, 1, 0);
+		instr->cat1.src_type = TYPE_U32;
+		instr->cat1.dst_type = TYPE_F32;
+		ir3_reg_create(instr, regid, 0);    /* dummy dst */
+		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+
+		break;
+	case 2: /* .z */
+	case 3: /* .w */
+		/* seems that we can use these as-is: */
+		instr = ctx->frag_coord[j];
+		break;
+	default:
+		compile_error(ctx, "invalid channel\n");
+		instr = create_immed(ctx, 0.0);
+		break;
+	}
+
+	return instr;
+}
+
+/* TGSI_SEMANTIC_FACE
+ * """"""""""""""""""
+ *
+ * This label applies to fragment shader inputs only and indicates that
+ * the register contains front/back-face information of the form (F, 0,
+ * 0, 1).  The first component will be positive when the fragment belongs
+ * to a front-facing polygon, and negative when the fragment belongs to a
+ * back-facing polygon.
+ */
+static struct ir3_instruction *
+decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid,
+		unsigned j)
+{
+	struct ir3_instruction *instr, *src;
+
+	switch (j) {
+	case 0: /* .x */
+		compile_assert(ctx, !ctx->frag_face);
+
+		ctx->frag_face = create_input(ctx->block, NULL, 0);
+
+		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
+		 * positive vs negative float..
and piglit further seems to + * expect -1.0 or 1.0: + * + * mul.s tmp, hr0.x, 2 + * add.s tmp, tmp, 1 + * mov.s16f32, dst, tmp + * + */ + + instr = instr_create(ctx, 2, OPC_MUL_S); + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2; + src = instr; + + instr = instr_create(ctx, 2, OPC_ADD_S); + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1; + src = instr; + + instr = instr_create(ctx, 1, 0); /* mov */ + instr->cat1.src_type = TYPE_S32; + instr->cat1.dst_type = TYPE_F32; + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + + break; + case 1: /* .y */ + case 2: /* .z */ + instr = create_immed(ctx, 0.0); + break; + case 3: /* .w */ + instr = create_immed(ctx, 1.0); + break; + default: + compile_error(ctx, "invalid channel\n"); + instr = create_immed(ctx, 0.0); + break; + } + + return instr; +} + +static void +decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned name = decl->Semantic.Name; + unsigned i; + + /* I don't think we should get frag shader input without + * semantic info? Otherwise how do inputs get linked to + * vert outputs? + */ + compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) || + decl->Declaration.Semantic); + + for (i = decl->Range.First; i <= decl->Range.Last; i++) { + unsigned n = so->inputs_count++; + unsigned r = regid(i, 0); + unsigned ncomp, j; + + /* we'll figure out the actual components used after scheduling */ + ncomp = 4; + + DBG("decl in -> r%d", i); + + compile_assert(ctx, n < ARRAY_SIZE(so->inputs)); + + so->inputs[n].semantic = decl_semantic(&decl->Semantic); + so->inputs[n].compmask = (1 << ncomp) - 1; + so->inputs[n].regid = r; + so->inputs[n].inloc = ctx->next_inloc; + + for (j = 0; j < ncomp; j++) { + struct ir3_instruction *instr = NULL; + + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + /* for fragment shaders, POSITION and FACE are handled + * specially, not using normal varying / bary.f + */ + if (name == TGSI_SEMANTIC_POSITION) { + so->inputs[n].bary = false; + so->frag_coord = true; + instr = decl_in_frag_coord(ctx, r + j, j); + } else if (name == TGSI_SEMANTIC_FACE) { + so->inputs[n].bary = false; + so->frag_face = true; + instr = decl_in_frag_face(ctx, r + j, j); + } else { + so->inputs[n].bary = true; + instr = decl_in_frag_bary(ctx, r + j, j, + so->inputs[n].inloc + j - 8); + } + } else { + instr = create_input(ctx->block, NULL, (i * 4) + j); + } + + ctx->block->inputs[(i * 4) + j] = instr; + } + + if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) { + ctx->next_inloc += ncomp; + so->total_in += ncomp; + } + } +} + +static void +decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned comp = 0; + unsigned name = decl->Semantic.Name; + unsigned i; + + compile_assert(ctx, decl->Declaration.Semantic); + + DBG("decl out[%d] -> r%d", name, decl->Range.First); + + if (ctx->type == TGSI_PROCESSOR_VERTEX) { + switch (name) { + case TGSI_SEMANTIC_POSITION: + so->writes_pos = true; + break; + case TGSI_SEMANTIC_PSIZE: + so->writes_psize = true; + break; + case TGSI_SEMANTIC_COLOR: + case TGSI_SEMANTIC_BCOLOR: + case TGSI_SEMANTIC_GENERIC: + case TGSI_SEMANTIC_FOG: + case TGSI_SEMANTIC_TEXCOORD: + 
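+			/* these all fall through to break: they need no special
+			 * flags and simply become linked varyings.  E.g. a VS
+			 * writing POSITION plus one GENERIC output (hypothetical)
+			 * ends up with:
+			 *
+			 *    so->writes_pos = true;
+			 *    so->outputs[0].semantic = ir3_semantic_name(POSITION, 0)
+			 *    so->outputs[1].semantic = ir3_semantic_name(GENERIC, 0)
+			 *
+			 * and the GENERIC slot is matched to the FS input with the
+			 * same semantic at link time.
+			 */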
break; + default: + compile_error(ctx, "unknown VS semantic name: %s\n", + tgsi_semantic_names[name]); + } + } else { + switch (name) { + case TGSI_SEMANTIC_POSITION: + comp = 2; /* tgsi will write to .z component */ + so->writes_pos = true; + break; + case TGSI_SEMANTIC_COLOR: + break; + default: + compile_error(ctx, "unknown FS semantic name: %s\n", + tgsi_semantic_names[name]); + } + } + + for (i = decl->Range.First; i <= decl->Range.Last; i++) { + unsigned n = so->outputs_count++; + unsigned ncomp, j; + + ncomp = 4; + + compile_assert(ctx, n < ARRAY_SIZE(so->outputs)); + + so->outputs[n].semantic = decl_semantic(&decl->Semantic); + so->outputs[n].regid = regid(i, comp); + + /* avoid undefined outputs, stick a dummy mov from imm{0.0}, + * which if the output is actually assigned will be over- + * written + */ + for (j = 0; j < ncomp; j++) + ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0); + } +} + +/* from TGSI perspective, we actually have inputs. But most of the "inputs" + * for a fragment shader are just bary.f instructions. The *actual* inputs + * from the hw perspective are the frag_pos and optionally frag_coord and + * frag_face. + */ +static void +fixup_frag_inputs(struct ir3_compile_context *ctx) +{ + struct ir3_shader_variant *so = ctx->so; + struct ir3_block *block = ctx->block; + struct ir3_instruction **inputs; + struct ir3_instruction *instr; + int n, regid = 0; + + block->ninputs = 0; + + n = 4; /* always have frag_pos */ + n += COND(so->frag_face, 4); + n += COND(so->frag_coord, 4); + + inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *))); + + if (so->frag_face) { + /* this ultimately gets assigned to hr0.x so doesn't conflict + * with frag_coord/frag_pos.. + */ + inputs[block->ninputs++] = ctx->frag_face; + ctx->frag_face->regs[0]->num = 0; + + /* remaining channels not used, but let's avoid confusing + * other parts that expect inputs to come in groups of vec4 + */ + inputs[block->ninputs++] = NULL; + inputs[block->ninputs++] = NULL; + inputs[block->ninputs++] = NULL; + } + + /* since we don't know where to set the regid for frag_coord, + * we have to use r0.x for it. 
But we don't want to *always* + * use r1.x for frag_pos as that could increase the register + * footprint on simple shaders: + */ + if (so->frag_coord) { + ctx->frag_coord[0]->regs[0]->num = regid++; + ctx->frag_coord[1]->regs[0]->num = regid++; + ctx->frag_coord[2]->regs[0]->num = regid++; + ctx->frag_coord[3]->regs[0]->num = regid++; + + inputs[block->ninputs++] = ctx->frag_coord[0]; + inputs[block->ninputs++] = ctx->frag_coord[1]; + inputs[block->ninputs++] = ctx->frag_coord[2]; + inputs[block->ninputs++] = ctx->frag_coord[3]; + } + + /* we always have frag_pos: */ + so->pos_regid = regid; + + /* r0.x */ + instr = create_input(block, NULL, block->ninputs); + instr->regs[0]->num = regid++; + inputs[block->ninputs++] = instr; + ctx->frag_pos->regs[1]->instr = instr; + + /* r0.y */ + instr = create_input(block, NULL, block->ninputs); + instr->regs[0]->num = regid++; + inputs[block->ninputs++] = instr; + ctx->frag_pos->regs[2]->instr = instr; + + block->inputs = inputs; +} + +static void +compile_instructions(struct ir3_compile_context *ctx) +{ + push_block(ctx); + + /* for fragment shader, we have a single input register (usually + * r0.xy) which is used as the base for bary.f varying fetch instrs: + */ + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + struct ir3_instruction *instr; + instr = ir3_instr_create(ctx->block, -1, OPC_META_FI); + ir3_reg_create(instr, 0, 0); + ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.x */ + ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.y */ + ctx->frag_pos = instr; + } + + while (!tgsi_parse_end_of_tokens(&ctx->parser)) { + tgsi_parse_token(&ctx->parser); + + switch (ctx->parser.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: { + struct tgsi_full_declaration *decl = + &ctx->parser.FullToken.FullDeclaration; + if (decl->Declaration.File == TGSI_FILE_OUTPUT) { + decl_out(ctx, decl); + } else if (decl->Declaration.File == TGSI_FILE_INPUT) { + decl_in(ctx, decl); + } + break; + } + case TGSI_TOKEN_TYPE_IMMEDIATE: { + /* TODO: if we know the immediate is small enough, and only + * used with instructions that can embed an immediate, we + * can skip this: + */ + struct tgsi_full_immediate *imm = + &ctx->parser.FullToken.FullImmediate; + unsigned n = ctx->so->immediates_count++; + compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates)); + memcpy(ctx->so->immediates[n].val, imm->u, 16); + break; + } + case TGSI_TOKEN_TYPE_INSTRUCTION: { + struct tgsi_full_instruction *inst = + &ctx->parser.FullToken.FullInstruction; + unsigned opc = inst->Instruction.Opcode; + const struct instr_translater *t = &translaters[opc]; + + if (t->fxn) { + t->fxn(t, ctx, inst); + ctx->num_internal_temps = 0; + } else { + compile_error(ctx, "unknown TGSI opc: %s\n", + tgsi_get_opcode_name(opc)); + } + + switch (inst->Instruction.Saturate) { + case TGSI_SAT_ZERO_ONE: + create_clamp_imm(ctx, &inst->Dst[0].Register, + fui(0.0), fui(1.0)); + break; + case TGSI_SAT_MINUS_PLUS_ONE: + create_clamp_imm(ctx, &inst->Dst[0].Register, + fui(-1.0), fui(1.0)); + break; + } + + instr_finish(ctx); + + break; + } + default: + break; + } + } +} + +static void +compile_dump(struct ir3_compile_context *ctx) +{ + const char *name = (ctx->so->type == SHADER_VERTEX) ? 
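+	/* e.g. for a fragment shader, successive calls during one compile
+	 * dump frag-0000.dot, frag-0001.dot, ... (names hypothetical, the
+	 * counter is shared), which graphviz can render:
+	 *
+	 *    dot -Tpng frag-0000.dot -o frag-0000.png
+	 */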
"vert" : "frag"; + static unsigned n = 0; + char fname[16]; + FILE *f; + snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++); + f = fopen(fname, "w"); + if (!f) + return; + ir3_block_depth(ctx->block); + ir3_dump(ctx->ir, name, ctx->block, f); + fclose(f); +} + +int +ir3_compile_shader(struct ir3_shader_variant *so, + const struct tgsi_token *tokens, struct ir3_shader_key key) +{ + struct ir3_compile_context ctx; + struct ir3_block *block; + struct ir3_instruction **inputs; + unsigned i, j, actual_in; + int ret = 0; + + assert(!so->ir); + + so->ir = ir3_create(); + + assert(so->ir); + + if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) { + ret = -1; + goto out; + } + + compile_instructions(&ctx); + + block = ctx.block; + + /* keep track of the inputs from TGSI perspective.. */ + inputs = block->inputs; + + /* but fixup actual inputs for frag shader: */ + if (ctx.type == TGSI_PROCESSOR_FRAGMENT) + fixup_frag_inputs(&ctx); + + /* at this point, for binning pass, throw away unneeded outputs: */ + if (key.binning_pass) { + for (i = 0, j = 0; i < so->outputs_count; i++) { + unsigned name = sem2name(so->outputs[i].semantic); + unsigned idx = sem2name(so->outputs[i].semantic); + + /* throw away everything but first position/psize */ + if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) || + (name == TGSI_SEMANTIC_PSIZE))) { + if (i != j) { + so->outputs[j] = so->outputs[i]; + block->outputs[(j*4)+0] = block->outputs[(i*4)+0]; + block->outputs[(j*4)+1] = block->outputs[(i*4)+1]; + block->outputs[(j*4)+2] = block->outputs[(i*4)+2]; + block->outputs[(j*4)+3] = block->outputs[(i*4)+3]; + } + j++; + } + } + so->outputs_count = j; + block->noutputs = j * 4; + } + + /* at this point, we want the kill's in the outputs array too, + * so that they get scheduled (since they have no dst).. we've + * already ensured that the array is big enough in push_block(): + */ + if (ctx.type == TGSI_PROCESSOR_FRAGMENT) { + for (i = 0; i < ctx.kill_count; i++) + block->outputs[block->noutputs++] = ctx.kill[i]; + } + + if (fd_mesa_debug & FD_DBG_OPTDUMP) + compile_dump(&ctx); + + ret = ir3_block_flatten(block); + if (ret < 0) + goto out; + if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP)) + compile_dump(&ctx); + + ir3_block_cp(block); + + if (fd_mesa_debug & FD_DBG_OPTDUMP) + compile_dump(&ctx); + + ir3_block_depth(block); + + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("AFTER DEPTH:\n"); + ir3_dump_instr_list(block->head); + } + + ir3_block_sched(block); + + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("AFTER SCHED:\n"); + ir3_dump_instr_list(block->head); + } + + ret = ir3_block_ra(block, so->type, key.half_precision, + so->frag_coord, so->frag_face, &so->has_samp); + if (ret) + goto out; + + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("AFTER RA:\n"); + ir3_dump_instr_list(block->head); + } + + /* fixup input/outputs: */ + for (i = 0; i < so->outputs_count; i++) { + so->outputs[i].regid = block->outputs[i*4]->regs[0]->num; + /* preserve hack for depth output.. 
+		 * but what we give the hw is the scalar register:
+		 */
+		if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
+			(sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
+			so->outputs[i].regid += 2;
+	}
+	/* Note that some or all channels of an input may be unused: */
+	actual_in = 0;
+	for (i = 0; i < so->inputs_count; i++) {
+		unsigned j, regid = ~0, compmask = 0;
+		so->inputs[i].ncomp = 0;
+		for (j = 0; j < 4; j++) {
+			struct ir3_instruction *in = inputs[(i*4) + j];
+			if (in) {
+				compmask |= (1 << j);
+				regid = in->regs[0]->num - j;
+				actual_in++;
+				so->inputs[i].ncomp++;
+			}
+		}
+		so->inputs[i].regid = regid;
+		so->inputs[i].compmask = compmask;
+	}
+
+	/* fragment shader always gets full vec4's even if it doesn't
+	 * fetch all components, but for the vertex shader we need to
+	 * update with the actual number of components fetched, otherwise
+	 * things will hang due to a mismatch between VFD_DECODE's and
+	 * TOTALATTRTOVS
+	 */
+	if (so->type == SHADER_VERTEX)
+		so->total_in = actual_in;
+
+out:
+	if (ret) {
+		ir3_destroy(so->ir);
+		so->ir = NULL;
+	}
+	compile_free(&ctx);
+
+	return ret;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
new file mode 100644
index 00000000000..9b11b3d8abf
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
@@ -0,0 +1,42 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + * Authors: + * Rob Clark <[email protected]> + */ + +#ifndef FD3_COMPILER_H_ +#define FD3_COMPILER_H_ + +#include "ir3_shader.h" + + +int ir3_compile_shader(struct ir3_shader_variant *so, + const struct tgsi_token *tokens, + struct ir3_shader_key key); +int ir3_compile_shader_old(struct ir3_shader_variant *so, + const struct tgsi_token *tokens, + struct ir3_shader_key key); + +#endif /* FD3_COMPILER_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c new file mode 100644 index 00000000000..1e1ca7ad813 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c @@ -0,0 +1,1524 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include <stdarg.h> + +#include "pipe/p_state.h" +#include "util/u_string.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_ureg.h" +#include "tgsi/tgsi_info.h" +#include "tgsi/tgsi_strings.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_scan.h" + +#include "freedreno_lowering.h" +#include "freedreno_util.h" + +#include "ir3_compiler.h" +#include "ir3_shader.h" + +#include "instr-a3xx.h" +#include "ir3.h" + + +struct ir3_compile_context { + const struct tgsi_token *tokens; + bool free_tokens; + struct ir3 *ir; + struct ir3_block *block; + struct ir3_shader_variant *so; + + struct tgsi_parse_context parser; + unsigned type; + + struct tgsi_shader_info info; + + /* last input dst (for setting (ei) flag): */ + struct ir3_register *last_input; + + /* last instruction with relative addressing: */ + struct ir3_instruction *last_rel; + + /* for calculating input/output positions/linkages: */ + unsigned next_inloc; + + unsigned num_internal_temps; + struct tgsi_src_register internal_temps[6]; + + /* track registers which need to synchronize w/ "complex alu" cat3 + * instruction pipeline: + */ + regmask_t needs_ss; + + /* track registers which need to synchronize with texture fetch + * pipeline: + */ + regmask_t needs_sy; + + /* inputs start at r0, temporaries start after last input, and + * outputs start after last temporary. 
+	 *
+	 * We could be more clever, because this is not a hw restriction,
+	 * but probably best just to implement an optimizing pass to
+	 * reduce the # of registers used and get rid of redundant mov's
+	 * (to output register).
+	 */
+	unsigned base_reg[TGSI_FILE_COUNT];
+
+	/* idx/slot for last compiler generated immediate */
+	unsigned immediate_idx;
+
+	/* stack of branch instructions that start (potentially nested)
+	 * branches, so that we can fix up the branch target on the
+	 * corresponding END instruction
+	 */
+	struct ir3_instruction *branch[16];
+	unsigned int branch_count;
+
+	/* used when dst is same as one of the src, to avoid overwriting a
+	 * src element before the remaining scalar instructions that make
+	 * up the vector operation
+	 */
+	struct tgsi_dst_register tmp_dst;
+	struct tgsi_src_register *tmp_src;
+};
+
+
+static void vectorize(struct ir3_compile_context *ctx,
+		struct ir3_instruction *instr, struct tgsi_dst_register *dst,
+		int nsrcs, ...);
+static void create_mov(struct ir3_compile_context *ctx,
+		struct tgsi_dst_register *dst, struct tgsi_src_register *src);
+
+static unsigned
+compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
+		const struct tgsi_token *tokens)
+{
+	unsigned ret, base = 0;
+	struct tgsi_shader_info *info = &ctx->info;
+	const struct fd_lowering_config lconfig = {
+			.color_two_side = so->key.color_two_side,
+			.lower_DST = true,
+			.lower_XPD = true,
+			.lower_SCS = true,
+			.lower_LRP = true,
+			.lower_FRC = true,
+			.lower_POW = true,
+			.lower_LIT = true,
+			.lower_EXP = true,
+			.lower_LOG = true,
+			.lower_DP4 = true,
+			.lower_DP3 = true,
+			.lower_DPH = true,
+			.lower_DP2 = true,
+			.lower_DP2A = true,
+	};
+
+	ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info);
+	ctx->free_tokens = !!ctx->tokens;
+	if (!ctx->tokens) {
+		/* no lowering */
+		ctx->tokens = tokens;
+	}
+	ctx->ir = so->ir;
+	ctx->block = ir3_block_create(ctx->ir, 0, 0, 0);
+	ctx->so = so;
+	ctx->last_input = NULL;
+	ctx->last_rel = NULL;
+	ctx->next_inloc = 8;
+	ctx->num_internal_temps = 0;
+	ctx->branch_count = 0;
+
+	regmask_init(&ctx->needs_ss);
+	regmask_init(&ctx->needs_sy);
+	memset(ctx->base_reg, 0, sizeof(ctx->base_reg));
+
+	/* Immediates go after constants: */
+	ctx->base_reg[TGSI_FILE_CONSTANT] = 0;
+	ctx->base_reg[TGSI_FILE_IMMEDIATE] =
+			info->file_max[TGSI_FILE_CONSTANT] + 1;
+
+	/* if full precision and fragment shader, don't clobber
+	 * r0.x w/ bary fetch:
+	 */
+	if ((so->type == SHADER_FRAGMENT) && !so->key.half_precision)
+		base = 1;
+
+	/* Temporaries after outputs after inputs: */
+	ctx->base_reg[TGSI_FILE_INPUT]     = base;
+	ctx->base_reg[TGSI_FILE_OUTPUT]    = base +
+			info->file_max[TGSI_FILE_INPUT] + 1;
+	ctx->base_reg[TGSI_FILE_TEMPORARY] = base +
+			info->file_max[TGSI_FILE_INPUT] + 1 +
+			info->file_max[TGSI_FILE_OUTPUT] + 1;
+
+	so->first_immediate = ctx->base_reg[TGSI_FILE_IMMEDIATE];
+	ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);
+
+	ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
+	if (ret != TGSI_PARSE_OK)
+		return ret;
+
+	ctx->type = ctx->parser.FullHeader.Processor.Processor;
+
+	return ret;
+}
+
+static void
+compile_error(struct ir3_compile_context *ctx, const char *format, ...)
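+/* (a worked example of the base_reg[] layout compile_init sets up,
+ * with hypothetical counts: a full-precision fragment shader with 2
+ * inputs, 1 output, 3 temporaries and 4 constants gets
+ *
+ *    base_reg[TGSI_FILE_CONSTANT]  = 0  ->  consts  c0..c3
+ *    base_reg[TGSI_FILE_IMMEDIATE] = 4  ->  compiler immediates from c4
+ *    base_reg[TGSI_FILE_INPUT]     = 1  ->  inputs  r1..r2  (r0.x is
+ *                                           reserved for the bary src)
+ *    base_reg[TGSI_FILE_OUTPUT]    = 3  ->  outputs r3
+ *    base_reg[TGSI_FILE_TEMPORARY] = 4  ->  temps   r4..r6
+ *
+ * so first_immediate lands at c4.)
+ */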
+{ + va_list ap; + va_start(ap, format); + _debug_vprintf(format, ap); + va_end(ap); + tgsi_dump(ctx->tokens, 0); + debug_assert(0); +} + +#define compile_assert(ctx, cond) do { \ + if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \ + } while (0) + +static void +compile_free(struct ir3_compile_context *ctx) +{ + if (ctx->free_tokens) + free((void *)ctx->tokens); + tgsi_parse_free(&ctx->parser); +} + +struct instr_translater { + void (*fxn)(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst); + unsigned tgsi_opc; + opc_t opc; + opc_t hopc; /* opc to use for half_precision mode, if different */ + unsigned arg; +}; + +static void +handle_last_rel(struct ir3_compile_context *ctx) +{ + if (ctx->last_rel) { + ctx->last_rel->flags |= IR3_INSTR_UL; + ctx->last_rel = NULL; + } +} + +static struct ir3_instruction * +instr_create(struct ir3_compile_context *ctx, int category, opc_t opc) +{ + return ir3_instr_create(ctx->block, category, opc); +} + +static void +add_nop(struct ir3_compile_context *ctx, unsigned count) +{ + while (count-- > 0) + instr_create(ctx, 0, OPC_NOP); +} + +static unsigned +src_flags(struct ir3_compile_context *ctx, struct ir3_register *reg) +{ + unsigned flags = 0; + + if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) + return flags; + + if (regmask_get(&ctx->needs_ss, reg)) { + flags |= IR3_INSTR_SS; + regmask_init(&ctx->needs_ss); + } + + if (regmask_get(&ctx->needs_sy, reg)) { + flags |= IR3_INSTR_SY; + regmask_init(&ctx->needs_sy); + } + + return flags; +} + +static struct ir3_register * +add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + const struct tgsi_dst_register *dst, unsigned chan) +{ + unsigned flags = 0, num = 0; + struct ir3_register *reg; + + switch (dst->File) { + case TGSI_FILE_OUTPUT: + case TGSI_FILE_TEMPORARY: + num = dst->Index + ctx->base_reg[dst->File]; + break; + case TGSI_FILE_ADDRESS: + num = REG_A0; + break; + default: + compile_error(ctx, "unsupported dst register file: %s\n", + tgsi_file_name(dst->File)); + break; + } + + if (dst->Indirect) + flags |= IR3_REG_RELATIV; + if (ctx->so->key.half_precision) + flags |= IR3_REG_HALF; + + reg = ir3_reg_create(instr, regid(num, chan), flags); + + if (dst->Indirect) + ctx->last_rel = instr; + + return reg; +} + +static struct ir3_register * +add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + const struct tgsi_src_register *src, unsigned chan) +{ + unsigned flags = 0, num = 0; + struct ir3_register *reg; + + /* TODO we need to use a mov to temp for const >= 64.. or maybe + * we could use relative addressing.. + */ + compile_assert(ctx, src->Index < 64); + + switch (src->File) { + case TGSI_FILE_IMMEDIATE: + /* TODO if possible, use actual immediate instead of const.. but + * TGSI has vec4 immediates, we can only embed scalar (of limited + * size, depending on instruction..) 
+ */ + case TGSI_FILE_CONSTANT: + flags |= IR3_REG_CONST; + num = src->Index + ctx->base_reg[src->File]; + break; + case TGSI_FILE_OUTPUT: + /* NOTE: we should only end up w/ OUTPUT file for things like + * clamp()'ing saturated dst instructions + */ + case TGSI_FILE_INPUT: + case TGSI_FILE_TEMPORARY: + num = src->Index + ctx->base_reg[src->File]; + break; + default: + compile_error(ctx, "unsupported src register file: %s\n", + tgsi_file_name(src->File)); + break; + } + + if (src->Absolute) + flags |= IR3_REG_ABS; + if (src->Negate) + flags |= IR3_REG_NEGATE; + if (src->Indirect) + flags |= IR3_REG_RELATIV; + if (ctx->so->key.half_precision) + flags |= IR3_REG_HALF; + + reg = ir3_reg_create(instr, regid(num, chan), flags); + + if (src->Indirect) + ctx->last_rel = instr; + + instr->flags |= src_flags(ctx, reg); + + return reg; +} + +static void +src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst) +{ + src->File = dst->File; + src->Indirect = dst->Indirect; + src->Dimension = dst->Dimension; + src->Index = dst->Index; + src->Absolute = 0; + src->Negate = 0; + src->SwizzleX = TGSI_SWIZZLE_X; + src->SwizzleY = TGSI_SWIZZLE_Y; + src->SwizzleZ = TGSI_SWIZZLE_Z; + src->SwizzleW = TGSI_SWIZZLE_W; +} + +/* Get internal-temp src/dst to use for a sequence of instructions + * generated by a single TGSI op. + */ +static struct tgsi_src_register * +get_internal_temp(struct ir3_compile_context *ctx, + struct tgsi_dst_register *tmp_dst) +{ + struct tgsi_src_register *tmp_src; + int n; + + tmp_dst->File = TGSI_FILE_TEMPORARY; + tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW; + tmp_dst->Indirect = 0; + tmp_dst->Dimension = 0; + + /* assign next temporary: */ + n = ctx->num_internal_temps++; + compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps)); + tmp_src = &ctx->internal_temps[n]; + + tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1; + + src_from_dst(tmp_src, tmp_dst); + + return tmp_src; +} + +/* Get internal half-precision temp src/dst to use for a sequence of + * instructions generated by a single TGSI op. + */ +static struct tgsi_src_register * +get_internal_temp_hr(struct ir3_compile_context *ctx, + struct tgsi_dst_register *tmp_dst) +{ + struct tgsi_src_register *tmp_src; + int n; + + if (ctx->so->key.half_precision) + return get_internal_temp(ctx, tmp_dst); + + tmp_dst->File = TGSI_FILE_TEMPORARY; + tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW; + tmp_dst->Indirect = 0; + tmp_dst->Dimension = 0; + + /* assign next temporary: */ + n = ctx->num_internal_temps++; + compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps)); + tmp_src = &ctx->internal_temps[n]; + + /* just use hr0 because no one else should be using half- + * precision regs: + */ + tmp_dst->Index = 0; + + src_from_dst(tmp_src, tmp_dst); + + return tmp_src; +} + +static inline bool +is_const(struct tgsi_src_register *src) +{ + return (src->File == TGSI_FILE_CONSTANT) || + (src->File == TGSI_FILE_IMMEDIATE); +} + +static inline bool +is_relative(struct tgsi_src_register *src) +{ + return src->Indirect; +} + +static inline bool +is_rel_or_const(struct tgsi_src_register *src) +{ + return is_relative(src) || is_const(src); +} + +static type_t +get_ftype(struct ir3_compile_context *ctx) +{ + return ctx->so->key.half_precision ? TYPE_F16 : TYPE_F32; +} + +static type_t +get_utype(struct ir3_compile_context *ctx) +{ + return ctx->so->key.half_precision ? 
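+			/* with key.half_precision the whole shader runs in half
+			 * regs, so e.g. (hypothetically) mul.f hr0.x, hr1.x, hr2.x
+			 * instead of mul.f r0.x, r1.x, r2.x, with 16-bit mov/cov
+			 * types throughout: */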
TYPE_U16 : TYPE_U32; +} + +static unsigned +src_swiz(struct tgsi_src_register *src, int chan) +{ + switch (chan) { + case 0: return src->SwizzleX; + case 1: return src->SwizzleY; + case 2: return src->SwizzleZ; + case 3: return src->SwizzleW; + } + assert(0); + return 0; +} + +/* for instructions that cannot take a const register as src, if needed + * generate a move to temporary gpr: + */ +static struct tgsi_src_register * +get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src) +{ + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + + compile_assert(ctx, is_rel_or_const(src)); + + tmp_src = get_internal_temp(ctx, &tmp_dst); + + create_mov(ctx, &tmp_dst, src); + + return tmp_src; +} + +static void +get_immediate(struct ir3_compile_context *ctx, + struct tgsi_src_register *reg, uint32_t val) +{ + unsigned neg, swiz, idx, i; + /* actually maps 1:1 currently.. not sure if that is safe to rely on: */ + static const unsigned swiz2tgsi[] = { + TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W, + }; + + for (i = 0; i < ctx->immediate_idx; i++) { + swiz = i % 4; + idx = i / 4; + + if (ctx->so->immediates[idx].val[swiz] == val) { + neg = 0; + break; + } + + if (ctx->so->immediates[idx].val[swiz] == -val) { + neg = 1; + break; + } + } + + if (i == ctx->immediate_idx) { + /* need to generate a new immediate: */ + swiz = i % 4; + idx = i / 4; + neg = 0; + ctx->so->immediates[idx].val[swiz] = val; + ctx->so->immediates_count = idx + 1; + ctx->immediate_idx++; + } + + reg->File = TGSI_FILE_IMMEDIATE; + reg->Indirect = 0; + reg->Dimension = 0; + reg->Index = idx; + reg->Absolute = 0; + reg->Negate = neg; + reg->SwizzleX = swiz2tgsi[swiz]; + reg->SwizzleY = swiz2tgsi[swiz]; + reg->SwizzleZ = swiz2tgsi[swiz]; + reg->SwizzleW = swiz2tgsi[swiz]; +} + +static void +create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst, + struct tgsi_src_register *src) +{ + type_t type_mov = get_ftype(ctx); + unsigned i; + + for (i = 0; i < 4; i++) { + /* move to destination: */ + if (dst->WriteMask & (1 << i)) { + struct ir3_instruction *instr; + + if (src->Absolute || src->Negate) { + /* can't have abs or neg on a mov instr, so use + * absneg.f instead to handle these cases: + */ + instr = instr_create(ctx, 2, OPC_ABSNEG_F); + } else { + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = type_mov; + instr->cat1.dst_type = type_mov; + } + + add_dst_reg(ctx, instr, dst, i); + add_src_reg(ctx, instr, src, src_swiz(src, i)); + } else { + add_nop(ctx, 1); + } + } +} + +static void +create_clamp(struct ir3_compile_context *ctx, + struct tgsi_dst_register *dst, struct tgsi_src_register *val, + struct tgsi_src_register *minval, struct tgsi_src_register *maxval) +{ + struct ir3_instruction *instr; + + instr = instr_create(ctx, 2, OPC_MAX_F); + vectorize(ctx, instr, dst, 2, val, 0, minval, 0); + + instr = instr_create(ctx, 2, OPC_MIN_F); + vectorize(ctx, instr, dst, 2, val, 0, maxval, 0); +} + +static void +create_clamp_imm(struct ir3_compile_context *ctx, + struct tgsi_dst_register *dst, + uint32_t minval, uint32_t maxval) +{ + struct tgsi_src_register minconst, maxconst; + struct tgsi_src_register src; + + src_from_dst(&src, dst); + + get_immediate(ctx, &minconst, minval); + get_immediate(ctx, &maxconst, maxval); + + create_clamp(ctx, dst, &src, &minconst, &maxconst); +} + +static struct tgsi_dst_register * +get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = &inst->Dst[0].Register; + 
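+	/* a sketch of the hazard handled here (registers hypothetical):
+	 * for something like
+	 *
+	 *    MOV TEMP[0].xy, TEMP[0].yx
+	 *
+	 * the per-component expansion would clobber TEMP[0].x before the
+	 * second scalar mov reads it, so writes are redirected to an
+	 * internal temp and copied back afterwards in put_dst():
+	 */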
unsigned i; + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + struct tgsi_src_register *src = &inst->Src[i].Register; + if ((src->File == dst->File) && (src->Index == dst->Index)) { + if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) && + (src->SwizzleX == TGSI_SWIZZLE_X) && + (src->SwizzleY == TGSI_SWIZZLE_Y) && + (src->SwizzleZ == TGSI_SWIZZLE_Z) && + (src->SwizzleW == TGSI_SWIZZLE_W)) + continue; + ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst); + ctx->tmp_dst.WriteMask = dst->WriteMask; + dst = &ctx->tmp_dst; + break; + } + } + return dst; +} + +static void +put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst, + struct tgsi_dst_register *dst) +{ + /* if necessary, add mov back into original dst: */ + if (dst != &inst->Dst[0].Register) { + create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src); + } +} + +/* helper to generate the necessary repeat and/or additional instructions + * to turn a scalar instruction into a vector operation: + */ +static void +vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + struct tgsi_dst_register *dst, int nsrcs, ...) +{ + va_list ap; + int i, j, n = 0; + bool indirect = dst->Indirect; + + add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X); + + va_start(ap, nsrcs); + for (j = 0; j < nsrcs; j++) { + struct tgsi_src_register *src = + va_arg(ap, struct tgsi_src_register *); + unsigned flags = va_arg(ap, unsigned); + struct ir3_register *reg; + if (flags & IR3_REG_IMMED) { + reg = ir3_reg_create(instr, 0, IR3_REG_IMMED); + /* this is an ugly cast.. should have put flags first! */ + reg->iim_val = *(int *)&src; + } else { + reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X); + indirect |= src->Indirect; + } + reg->flags |= flags & ~IR3_REG_NEGATE; + if (flags & IR3_REG_NEGATE) + reg->flags ^= IR3_REG_NEGATE; + } + va_end(ap); + + for (i = 0; i < 4; i++) { + if (dst->WriteMask & (1 << i)) { + struct ir3_instruction *cur; + + if (n++ == 0) { + cur = instr; + } else { + cur = ir3_instr_clone(instr); + cur->flags &= ~(IR3_INSTR_SY | IR3_INSTR_SS | IR3_INSTR_JP); + } + + /* fix-up dst register component: */ + cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i); + + /* fix-up src register component: */ + va_start(ap, nsrcs); + for (j = 0; j < nsrcs; j++) { + struct tgsi_src_register *src = + va_arg(ap, struct tgsi_src_register *); + unsigned flags = va_arg(ap, unsigned); + if (!(flags & IR3_REG_IMMED)) { + cur->regs[j+1]->num = + regid(cur->regs[j+1]->num >> 2, + src_swiz(src, i)); + cur->flags |= src_flags(ctx, cur->regs[j+1]); + } + } + va_end(ap); + + if (indirect) + ctx->last_rel = cur; + } + } + + /* pad w/ nop's.. at least until we are clever enough to + * figure out if we really need to.. + */ + add_nop(ctx, 4 - n); +} + +/* + * Handlers for TGSI instructions which do not have a 1:1 mapping to + * native instructions: + */ + +static void +trans_clamp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct tgsi_src_register *src2 = &inst->Src[2].Register; + + create_clamp(ctx, dst, src0, src1, src2); + + put_dst(ctx, inst, dst); +} + +/* ARL(x) = x, but mova from hrN.x to a0.. 
*/ +static void +trans_arl(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + struct tgsi_dst_register *dst = &inst->Dst[0].Register; + struct tgsi_src_register *src = &inst->Src[0].Register; + unsigned chan = src->SwizzleX; + compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS); + + handle_last_rel(ctx); + + tmp_src = get_internal_temp_hr(ctx, &tmp_dst); + + /* cov.{f32,f16}s16 Rtmp, Rsrc */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = get_ftype(ctx); + instr->cat1.dst_type = TYPE_S16; + add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, src, chan); + + add_nop(ctx, 3); + + /* shl.b Rtmp, Rtmp, 2 */ + instr = instr_create(ctx, 2, OPC_SHL_B); + add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2; + + add_nop(ctx, 3); + + /* mova a0, Rtmp */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = TYPE_S16; + instr->cat1.dst_type = TYPE_S16; + add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; + + /* need to ensure 5 instr slots before a0 is used: */ + add_nop(ctx, 6); +} + +/* texture fetch/sample instructions: */ +static void +trans_samp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_register *r; + struct ir3_instruction *instr; + struct tgsi_src_register *coord = &inst->Src[0].Register; + struct tgsi_src_register *samp = &inst->Src[1].Register; + unsigned tex = inst->Texture.Texture; + int8_t *order; + unsigned i, flags = 0, src_wrmask; + bool needs_mov = false; + + switch (t->arg) { + case TGSI_OPCODE_TEX: + if (tex == TGSI_TEXTURE_2D) { + order = (int8_t[4]){ 0, 1, -1, -1 }; + src_wrmask = TGSI_WRITEMASK_XY; + } else { + order = (int8_t[4]){ 0, 1, 2, -1 }; + src_wrmask = TGSI_WRITEMASK_XYZ; + } + break; + case TGSI_OPCODE_TXP: + if (tex == TGSI_TEXTURE_2D) { + order = (int8_t[4]){ 0, 1, 3, -1 }; + src_wrmask = TGSI_WRITEMASK_XYZ; + } else { + order = (int8_t[4]){ 0, 1, 2, 3 }; + src_wrmask = TGSI_WRITEMASK_XYZW; + } + flags |= IR3_INSTR_P; + break; + default: + compile_assert(ctx, 0); + break; + } + + if ((tex == TGSI_TEXTURE_3D) || (tex == TGSI_TEXTURE_CUBE)) { + add_nop(ctx, 3); + flags |= IR3_INSTR_3D; + } + + /* cat5 instruction cannot seem to handle const or relative: */ + if (is_rel_or_const(coord)) + needs_mov = true; + + /* The texture sample instructions need to coord in successive + * registers/components (ie. src.xy but not src.yx). And TXP + * needs the .w component in .z for 2D.. 
so in some cases we + * might need to emit some mov instructions to shuffle things + * around: + */ + for (i = 1; (i < 4) && (order[i] >= 0) && !needs_mov; i++) + if (src_swiz(coord, i) != (src_swiz(coord, 0) + order[i])) + needs_mov = true; + + if (needs_mov) { + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + unsigned j; + + type_t type_mov = get_ftype(ctx); + + /* need to move things around: */ + tmp_src = get_internal_temp(ctx, &tmp_dst); + + for (j = 0; (j < 4) && (order[j] >= 0); j++) { + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = type_mov; + instr->cat1.dst_type = type_mov; + add_dst_reg(ctx, instr, &tmp_dst, j); + add_src_reg(ctx, instr, coord, + src_swiz(coord, order[j])); + } + + coord = tmp_src; + + add_nop(ctx, 4 - j); + } + + instr = instr_create(ctx, 5, t->opc); + instr->cat5.type = get_ftype(ctx); + instr->cat5.samp = samp->Index; + instr->cat5.tex = samp->Index; + instr->flags |= flags; + + r = add_dst_reg(ctx, instr, &inst->Dst[0].Register, 0); + r->wrmask = inst->Dst[0].Register.WriteMask; + + add_src_reg(ctx, instr, coord, coord->SwizzleX)->wrmask = src_wrmask; + + /* after add_src_reg() so we don't set (sy) on sam instr itself! */ + regmask_set(&ctx->needs_sy, r); +} + +/* + * SEQ(a,b) = (a == b) ? 1.0 : 0.0 + * cmps.f.eq tmp0, b, a + * cov.u16f16 dst, tmp0 + * + * SNE(a,b) = (a != b) ? 1.0 : 0.0 + * cmps.f.eq tmp0, b, a + * add.s tmp0, tmp0, -1 + * sel.f16 dst, {0.0}, tmp0, {1.0} + * + * SGE(a,b) = (a >= b) ? 1.0 : 0.0 + * cmps.f.ge tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SLE(a,b) = (a <= b) ? 1.0 : 0.0 + * cmps.f.ge tmp0, b, a + * cov.u16f16 dst, tmp0 + * + * SGT(a,b) = (a > b) ? 1.0 : 0.0 + * cmps.f.ge tmp0, b, a + * add.s tmp0, tmp0, -1 + * sel.f16 dst, {0.0}, tmp0, {1.0} + * + * SLT(a,b) = (a < b) ? 1.0 : 0.0 + * cmps.f.ge tmp0, a, b + * add.s tmp0, tmp0, -1 + * sel.f16 dst, {0.0}, tmp0, {1.0} + * + * CMP(a,b,c) = (a < 0.0) ? 
b : c + * cmps.f.ge tmp0, a, {0.0} + * add.s tmp0, tmp0, -1 + * sel.f16 dst, c, tmp0, b + */ +static void +trans_cmp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + struct tgsi_src_register constval0, constval1; + /* final instruction for CMP() uses orig src1 and src2: */ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *a0, *a1; + unsigned condition; + + tmp_src = get_internal_temp(ctx, &tmp_dst); + + switch (t->tgsi_opc) { + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SNE: + a0 = &inst->Src[1].Register; /* b */ + a1 = &inst->Src[0].Register; /* a */ + condition = IR3_COND_EQ; + break; + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SLT: + a0 = &inst->Src[0].Register; /* a */ + a1 = &inst->Src[1].Register; /* b */ + condition = IR3_COND_GE; + break; + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SGT: + a0 = &inst->Src[1].Register; /* b */ + a1 = &inst->Src[0].Register; /* a */ + condition = IR3_COND_GE; + break; + case TGSI_OPCODE_CMP: + get_immediate(ctx, &constval0, fui(0.0)); + a0 = &inst->Src[0].Register; /* a */ + a1 = &constval0; /* {0.0} */ + condition = IR3_COND_GE; + break; + default: + compile_assert(ctx, 0); + return; + } + + if (is_const(a0) && is_const(a1)) + a0 = get_unconst(ctx, a0); + + /* cmps.f.ge tmp, a0, a1 */ + instr = instr_create(ctx, 2, OPC_CMPS_F); + instr->cat2.condition = condition; + vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0); + + switch (t->tgsi_opc) { + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SLE: + /* cov.u16f16 dst, tmp0 */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = get_utype(ctx); + instr->cat1.dst_type = get_ftype(ctx); + vectorize(ctx, instr, dst, 1, tmp_src, 0); + break; + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_CMP: + /* add.s tmp, tmp, -1 */ + instr = instr_create(ctx, 2, OPC_ADD_S); + vectorize(ctx, instr, &tmp_dst, 2, tmp_src, 0, -1, IR3_REG_IMMED); + + if (t->tgsi_opc == TGSI_OPCODE_CMP) { + /* sel.{f32,f16} dst, src2, tmp, src1 */ + instr = instr_create(ctx, 3, + ctx->so->key.half_precision ? OPC_SEL_F16 : OPC_SEL_F32); + vectorize(ctx, instr, dst, 3, + &inst->Src[2].Register, 0, + tmp_src, 0, + &inst->Src[1].Register, 0); + } else { + get_immediate(ctx, &constval0, fui(0.0)); + get_immediate(ctx, &constval1, fui(1.0)); + /* sel.{f32,f16} dst, {0.0}, tmp0, {1.0} */ + instr = instr_create(ctx, 3, + ctx->so->key.half_precision ? OPC_SEL_F16 : OPC_SEL_F32); + vectorize(ctx, instr, dst, 3, + &constval0, 0, tmp_src, 0, &constval1, 0); + } + + break; + } + + put_dst(ctx, inst, dst); +} + +/* + * Conditional / Flow control + */ + +static unsigned +find_instruction(struct ir3_compile_context *ctx, struct ir3_instruction *instr) +{ + unsigned i; + for (i = 0; i < ctx->ir->instrs_count; i++) + if (ctx->ir->instrs[i] == instr) + return i; + return ~0; +} + +static void +push_branch(struct ir3_compile_context *ctx, struct ir3_instruction *instr) +{ + ctx->branch[ctx->branch_count++] = instr; +} + +static void +pop_branch(struct ir3_compile_context *ctx) +{ + struct ir3_instruction *instr; + + /* if we were clever enough, we'd patch this up after the fact, + * and set (jp) flag on whatever the next instruction was, rather + * than inserting an extra nop.. 
+ */ + instr = instr_create(ctx, 0, OPC_NOP); + instr->flags |= IR3_INSTR_JP; + + /* pop the branch instruction from the stack and fix up branch target: */ + instr = ctx->branch[--ctx->branch_count]; + instr->cat0.immed = ctx->ir->instrs_count - find_instruction(ctx, instr) - 1; +} + +/* We probably don't really want to translate if/else/endif into branches.. + * the blob driver evaluates both legs of the if and then uses the sel + * instruction to pick which sides of the branch to "keep".. but figuring + * that out will take somewhat more compiler smarts. So hopefully branches + * don't kill performance too badly. + */ +static void +trans_if(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_src_register *src = &inst->Src[0].Register; + struct tgsi_src_register constval; + + get_immediate(ctx, &constval, fui(0.0)); + + if (is_const(src)) + src = get_unconst(ctx, src); + + instr = instr_create(ctx, 2, OPC_CMPS_F); + ir3_reg_create(instr, regid(REG_P0, 0), 0); + add_src_reg(ctx, instr, src, src->SwizzleX); + add_src_reg(ctx, instr, &constval, constval.SwizzleX); + instr->cat2.condition = IR3_COND_EQ; + + instr = instr_create(ctx, 0, OPC_BR); + push_branch(ctx, instr); +} + +static void +trans_else(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + + /* for first half of if/else/endif, generate a jump past the else: */ + instr = instr_create(ctx, 0, OPC_JUMP); + + pop_branch(ctx); + push_branch(ctx, instr); +} + +static void +trans_endif(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + pop_branch(ctx); +} + +/* + * Handlers for TGSI instructions which do have 1:1 mapping to native + * instructions: + */ + +static void +instr_cat0(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + instr_create(ctx, 0, t->opc); +} + +static void +instr_cat1(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src = &inst->Src[0].Register; + + /* mov instructions can't handle a negate on src: */ + if (src->Negate) { + struct tgsi_src_register constval; + struct ir3_instruction *instr; + + /* since right now, we are using uniformly either TYPE_F16 or + * TYPE_F32, and we don't utilize the conversion possibilities + * of mov instructions, we can get away with substituting an + * add.f which can handle negate. Might need to revisit this + * in the future if we start supporting widening/narrowing or + * conversion to/from integer.. 
+ */ + instr = instr_create(ctx, 2, OPC_ADD_F); + get_immediate(ctx, &constval, fui(0.0)); + vectorize(ctx, instr, dst, 2, src, 0, &constval, 0); + } else { + create_mov(ctx, dst, src); + /* create_mov() generates vector sequence, so no vectorize() */ + } + put_dst(ctx, inst, dst); +} + +static void +instr_cat2(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct ir3_instruction *instr; + unsigned src0_flags = 0, src1_flags = 0; + + switch (t->tgsi_opc) { + case TGSI_OPCODE_ABS: + src0_flags = IR3_REG_ABS; + break; + case TGSI_OPCODE_SUB: + src1_flags = IR3_REG_NEGATE; + break; + } + + switch (t->opc) { + case OPC_ABSNEG_F: + case OPC_ABSNEG_S: + case OPC_CLZ_B: + case OPC_CLZ_S: + case OPC_SIGN_F: + case OPC_FLOOR_F: + case OPC_CEIL_F: + case OPC_RNDNE_F: + case OPC_RNDAZ_F: + case OPC_TRUNC_F: + case OPC_NOT_B: + case OPC_BFREV_B: + case OPC_SETRM: + case OPC_CBITS_B: + /* these only have one src reg */ + instr = instr_create(ctx, 2, t->opc); + vectorize(ctx, instr, dst, 1, src0, src0_flags); + break; + default: + if (is_const(src0) && is_const(src1)) + src0 = get_unconst(ctx, src0); + + instr = instr_create(ctx, 2, t->opc); + vectorize(ctx, instr, dst, 2, src0, src0_flags, + src1, src1_flags); + break; + } + + put_dst(ctx, inst, dst); +} + +static void +instr_cat3(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct ir3_instruction *instr; + + /* in particular, can't handle const for src1 for cat3.. + * for mad, we can swap first two src's if needed: + */ + if (is_rel_or_const(src1)) { + if (is_mad(t->opc) && !is_rel_or_const(src0)) { + struct tgsi_src_register *tmp; + tmp = src0; + src0 = src1; + src1 = tmp; + } else { + src1 = get_unconst(ctx, src1); + } + } + + instr = instr_create(ctx, 3, + ctx->so->key.half_precision ? t->hopc : t->opc); + vectorize(ctx, instr, dst, 3, src0, 0, src1, 0, + &inst->Src[2].Register, 0); + put_dst(ctx, inst, dst); +} + +static void +instr_cat4(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src = &inst->Src[0].Register; + struct ir3_instruction *instr; + unsigned i, n; + + /* seems like blob compiler avoids const as src.. */ + if (is_const(src)) + src = get_unconst(ctx, src); + + /* worst case: */ + add_nop(ctx, 6); + + /* we need to replicate into each component: */ + for (i = 0, n = 0; i < 4; i++) { + if (dst->WriteMask & (1 << i)) { + if (n++) + add_nop(ctx, 1); + instr = instr_create(ctx, 4, t->opc); + add_dst_reg(ctx, instr, dst, i); + add_src_reg(ctx, instr, src, src->SwizzleX); + } + } + + regmask_set(&ctx->needs_ss, instr->regs[0]); + put_dst(ctx, inst, dst); +} + +static const struct instr_translater translaters[TGSI_OPCODE_LAST] = { +#define INSTR(n, f, ...) 
\ + [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ } + + INSTR(MOV, instr_cat1), + INSTR(RCP, instr_cat4, .opc = OPC_RCP), + INSTR(RSQ, instr_cat4, .opc = OPC_RSQ), + INSTR(SQRT, instr_cat4, .opc = OPC_SQRT), + INSTR(MUL, instr_cat2, .opc = OPC_MUL_F), + INSTR(ADD, instr_cat2, .opc = OPC_ADD_F), + INSTR(SUB, instr_cat2, .opc = OPC_ADD_F), + INSTR(MIN, instr_cat2, .opc = OPC_MIN_F), + INSTR(MAX, instr_cat2, .opc = OPC_MAX_F), + INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16), + INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F), + INSTR(CLAMP, trans_clamp), + INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F), + INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F), + INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F), + INSTR(ARL, trans_arl), + INSTR(EX2, instr_cat4, .opc = OPC_EXP2), + INSTR(LG2, instr_cat4, .opc = OPC_LOG2), + INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F), + INSTR(COS, instr_cat4, .opc = OPC_COS), + INSTR(SIN, instr_cat4, .opc = OPC_SIN), + INSTR(TEX, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX), + INSTR(TXP, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP), + INSTR(SGT, trans_cmp), + INSTR(SLT, trans_cmp), + INSTR(SGE, trans_cmp), + INSTR(SLE, trans_cmp), + INSTR(SNE, trans_cmp), + INSTR(SEQ, trans_cmp), + INSTR(CMP, trans_cmp), + INSTR(IF, trans_if), + INSTR(ELSE, trans_else), + INSTR(ENDIF, trans_endif), + INSTR(END, instr_cat0, .opc = OPC_END), + INSTR(KILL, instr_cat0, .opc = OPC_KILL), +}; + +static ir3_semantic +decl_semantic(const struct tgsi_declaration_semantic *sem) +{ + return ir3_semantic_name(sem->Name, sem->Index); +} + +static int +decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned base = ctx->base_reg[TGSI_FILE_INPUT]; + unsigned i, flags = 0; + int nop = 0; + + /* I don't think we should get frag shader input without + * semantic info? Otherwise how do inputs get linked to + * vert outputs? 
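+	 *
+	 * (presumably it is the semantic name+index pair from
+	 * decl_semantic() that gets matched up against the vert
+	 * shader outputs at link time..)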
+ */ + compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) || + decl->Declaration.Semantic); + + if (ctx->so->key.half_precision) + flags |= IR3_REG_HALF; + + for (i = decl->Range.First; i <= decl->Range.Last; i++) { + unsigned n = so->inputs_count++; + unsigned r = regid(i + base, 0); + unsigned ncomp; + + /* TODO use ctx->info.input_usage_mask[decl->Range.n] to figure out ncomp: */ + ncomp = 4; + + DBG("decl in -> r%d", i + base); // XXX + + compile_assert(ctx, n < ARRAY_SIZE(so->inputs)); + + so->inputs[n].semantic = decl_semantic(&decl->Semantic); + so->inputs[n].compmask = (1 << ncomp) - 1; + so->inputs[n].ncomp = ncomp; + so->inputs[n].regid = r; + so->inputs[n].inloc = ctx->next_inloc; + so->inputs[n].bary = true; /* all that is supported */ + ctx->next_inloc += ncomp; + + so->total_in += ncomp; + + /* for frag shaders, we need to generate the corresponding bary instr: */ + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + unsigned j; + + for (j = 0; j < ncomp; j++) { + struct ir3_instruction *instr; + struct ir3_register *dst; + + instr = instr_create(ctx, 2, OPC_BARY_F); + + /* dst register: */ + dst = ir3_reg_create(instr, r + j, flags); + ctx->last_input = dst; + + /* input position: */ + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = + so->inputs[n].inloc + j - 8; + + /* input base (always r0.xy): */ + ir3_reg_create(instr, regid(0,0), 0)->wrmask = 0x3; + } + + nop = 6; + } + } + + return nop; +} + +static void +decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned base = ctx->base_reg[TGSI_FILE_OUTPUT]; + unsigned comp = 0; + unsigned name = decl->Semantic.Name; + unsigned i; + + compile_assert(ctx, decl->Declaration.Semantic); // TODO is this ever not true? 
+ + DBG("decl out[%d] -> r%d", name, decl->Range.First + base); // XXX + + if (ctx->type == TGSI_PROCESSOR_VERTEX) { + switch (name) { + case TGSI_SEMANTIC_POSITION: + so->writes_pos = true; + break; + case TGSI_SEMANTIC_PSIZE: + so->writes_psize = true; + break; + case TGSI_SEMANTIC_COLOR: + case TGSI_SEMANTIC_BCOLOR: + case TGSI_SEMANTIC_GENERIC: + case TGSI_SEMANTIC_FOG: + case TGSI_SEMANTIC_TEXCOORD: + break; + default: + compile_error(ctx, "unknown VS semantic name: %s\n", + tgsi_semantic_names[name]); + } + } else { + switch (name) { + case TGSI_SEMANTIC_POSITION: + comp = 2; /* tgsi will write to .z component */ + so->writes_pos = true; + break; + case TGSI_SEMANTIC_COLOR: + break; + default: + compile_error(ctx, "unknown FS semantic name: %s\n", + tgsi_semantic_names[name]); + } + } + + for (i = decl->Range.First; i <= decl->Range.Last; i++) { + unsigned n = so->outputs_count++; + compile_assert(ctx, n < ARRAY_SIZE(so->outputs)); + so->outputs[n].semantic = decl_semantic(&decl->Semantic); + so->outputs[n].regid = regid(i + base, comp); + } +} + +static void +decl_samp(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl) +{ + ctx->so->has_samp = true; +} + +static void +compile_instructions(struct ir3_compile_context *ctx) +{ + struct ir3 *ir = ctx->ir; + int nop = 0; + + while (!tgsi_parse_end_of_tokens(&ctx->parser)) { + tgsi_parse_token(&ctx->parser); + + switch (ctx->parser.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: { + struct tgsi_full_declaration *decl = + &ctx->parser.FullToken.FullDeclaration; + if (decl->Declaration.File == TGSI_FILE_OUTPUT) { + decl_out(ctx, decl); + } else if (decl->Declaration.File == TGSI_FILE_INPUT) { + nop = decl_in(ctx, decl); + } else if (decl->Declaration.File == TGSI_FILE_SAMPLER) { + decl_samp(ctx, decl); + } + break; + } + case TGSI_TOKEN_TYPE_IMMEDIATE: { + /* TODO: if we know the immediate is small enough, and only + * used with instructions that can embed an immediate, we + * can skip this: + */ + struct tgsi_full_immediate *imm = + &ctx->parser.FullToken.FullImmediate; + unsigned n = ctx->so->immediates_count++; + memcpy(ctx->so->immediates[n].val, imm->u, 16); + break; + } + case TGSI_TOKEN_TYPE_INSTRUCTION: { + struct tgsi_full_instruction *inst = + &ctx->parser.FullToken.FullInstruction; + unsigned opc = inst->Instruction.Opcode; + const struct instr_translater *t = &translaters[opc]; + + add_nop(ctx, nop); + nop = 0; + + if (t->fxn) { + t->fxn(t, ctx, inst); + ctx->num_internal_temps = 0; + } else { + compile_error(ctx, "unknown TGSI opc: %s\n", + tgsi_get_opcode_name(opc)); + } + + switch (inst->Instruction.Saturate) { + case TGSI_SAT_ZERO_ONE: + create_clamp_imm(ctx, &inst->Dst[0].Register, + fui(0.0), fui(1.0)); + break; + case TGSI_SAT_MINUS_PLUS_ONE: + create_clamp_imm(ctx, &inst->Dst[0].Register, + fui(-1.0), fui(1.0)); + break; + } + + break; + } + default: + break; + } + } + + if (ir->instrs_count > 0) + ir->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY; + + if (ctx->last_input) + ctx->last_input->flags |= IR3_REG_EI; + + handle_last_rel(ctx); +} + +int +ir3_compile_shader_old(struct ir3_shader_variant *so, + const struct tgsi_token *tokens, struct ir3_shader_key key) +{ + struct ir3_compile_context ctx; + + assert(!so->ir); + + so->ir = ir3_create(); + + assert(so->ir); + + if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) + return -1; + + compile_instructions(&ctx); + + compile_free(&ctx); + + return 0; +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c 
b/src/gallium/drivers/freedreno/ir3/ir3_cp.c new file mode 100644 index 00000000000..73c2a27c6eb --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -0,0 +1,158 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include "ir3.h" + +/* + * Copy Propagate: + * + * TODO probably want some sort of visitor sort of interface to + * avoid duplicating the same graph traversal logic everywhere.. + * + */ + +static void block_cp(struct ir3_block *block); +static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, bool keep); + +static bool is_eligible_mov(struct ir3_instruction *instr) +{ + if ((instr->category == 1) && + (instr->cat1.src_type == instr->cat1.dst_type)) { + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + if (dst->flags & IR3_REG_ADDR) + return false; + if ((src->flags & IR3_REG_SSA) && + /* TODO: propagate abs/neg modifiers if possible */ + !(src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV))) + return true; + } + return false; +} + +static void walk_children(struct ir3_instruction *instr, bool keep) +{ + unsigned i; + + /* walk down the graph from each src: */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *src = instr->regs[i]; + if (src->flags & IR3_REG_SSA) + src->instr = instr_cp(src->instr, keep); + } +} + +static struct ir3_instruction * +instr_cp_fanin(struct ir3_instruction *instr) +{ + unsigned i; + + /* we need to handle fanin specially, to detect cases + * when we need to keep a mov + */ + + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *src = instr->regs[i]; + if (src->flags & IR3_REG_SSA) { + struct ir3_instruction *cand = + instr_cp(src->instr, false); + + /* if the candidate is a fanout, then keep + * the move. + * + * This is a bit, um, fragile, but it should + * catch the extra mov's that the front-end + * puts in for us already in these cases. 
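+			 *
+			 * ie. for a fanin <- mov <- fanout chain, dropping
+			 * the mov would feed the fanout straight into the
+			 * fanin, so re-run with keep=true to hang on to it.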
+			 */
+			if (is_meta(cand) && (cand->opc == OPC_META_FO))
+				cand = instr_cp(src->instr, true);
+
+			src->instr = cand;
+		}
+	}
+
+	walk_children(instr, false);
+
+	return instr;
+
+}
+
+static struct ir3_instruction *
+instr_cp(struct ir3_instruction *instr, bool keep)
+{
+	/* if we've already visited this instruction, bail now: */
+	if (ir3_instr_check_mark(instr))
+		return instr;
+
+	if (is_meta(instr) && (instr->opc == OPC_META_FI))
+		return instr_cp_fanin(instr);
+
+	if (is_eligible_mov(instr) && !keep) {
+		struct ir3_register *src = instr->regs[1];
+		return instr_cp(src->instr, false);
+	}
+
+	walk_children(instr, false);
+
+	return instr;
+}
+
+static void block_cp(struct ir3_block *block)
+{
+	unsigned i, j;
+
+	for (i = 0; i < block->noutputs; i++) {
+		if (block->outputs[i]) {
+			struct ir3_instruction *out =
+					instr_cp(block->outputs[i], false);
+
+			/* To deal with things like this:
+			 *
+			 *   43: MOV OUT[2], TEMP[5]
+			 *   44: MOV OUT[0], TEMP[5]
+			 *
+			 * we need to ensure that no two outputs point to
+			 * the same instruction
+			 */
+			for (j = 0; j < i; j++) {
+				if (block->outputs[j] == out) {
+					out = instr_cp(block->outputs[i], true);
+					break;
+				}
+			}
+
+			block->outputs[i] = out;
+		}
+	}
+}
+
+void ir3_block_cp(struct ir3_block *block)
+{
+	ir3_clear_mark(block->shader);
+	block_cp(block);
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
new file mode 100644
index 00000000000..dcc0362f0c8
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -0,0 +1,159 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+/*
+ * Instruction Depth:
+ *
+ * Calculates weighted instruction depth, ie. the sum of # of needed
+ * instructions plus delay slots back to original input (ie INPUT or
+ * CONST). That is to say, an instruction's depth is:
+ *
+ *   depth(instr) {
+ *     d = 0;
+ *     // for each src register:
+ *     foreach (src in instr->regs[1..n])
+ *       d = max(d, delayslots(src->instr, n) + depth(src->instr));
+ *     return d + 1;
+ *   }
+ *
+ * After an instruction's depth is calculated, it is inserted into the
+ * block's depth-sorted list, which is used by the scheduling pass.
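+ *
+ * e.g. for a simple INPUT -> mul.f -> add.f chain, with the normal
+ * alu->alu delay of 3 cycles (see ir3_delayslots() below):
+ *
+ *   depth(mul.f) = 0 + 1 = 1     (the meta INPUT adds nothing)
+ *   depth(add.f) = (3 + 1) + 1 = 5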
+ */
+
+/* calculate required # of delay slots between the instruction that
+ * assigns a value and the one that consumes it
+ */
+int ir3_delayslots(struct ir3_instruction *assigner,
+		struct ir3_instruction *consumer, unsigned n)
+{
+	/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
+	 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
+	 * handled with sync bits
+	 */
+
+	if (is_meta(assigner))
+		return 0;
+
+	if (writes_addr(assigner))
+		return 6;
+
+	/* handled via sync flags: */
+	if (is_sfu(assigner) || is_tex(assigner))
+		return 0;
+
+	/* assigner must be alu: */
+	if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer)) {
+		return 6;
+	} else if ((consumer->category == 3) &&
+			is_mad(consumer->opc) && (n == 2)) {
+		/* special case, 3rd src to cat3 not required on first cycle */
+		return 1;
+	} else {
+		return 3;
+	}
+}
+
+static void insert_by_depth(struct ir3_instruction *instr)
+{
+	struct ir3_block *block = instr->block;
+	struct ir3_instruction *n = block->head;
+	struct ir3_instruction *p = NULL;
+
+	while (n && (n != instr) && (n->depth > instr->depth)) {
+		p = n;
+		n = n->next;
+	}
+
+	instr->next = n;
+	if (p)
+		p->next = instr;
+	else
+		block->head = instr;
+}
+
+static void ir3_instr_depth(struct ir3_instruction *instr)
+{
+	unsigned i;
+
+	/* if we've already visited this instruction, bail now: */
+	if (ir3_instr_check_mark(instr))
+		return;
+
+	instr->depth = 0;
+
+	for (i = 1; i < instr->regs_count; i++) {
+		struct ir3_register *src = instr->regs[i];
+		if (src->flags & IR3_REG_SSA) {
+			unsigned sd;
+
+			/* visit child to compute its depth: */
+			ir3_instr_depth(src->instr);
+
+			sd = ir3_delayslots(src->instr, instr, i-1) +
+					src->instr->depth;
+
+			instr->depth = MAX2(instr->depth, sd);
+		}
+	}
+
+	/* meta-instructions don't add cycles, other than PHI.. which
+	 * might translate to a real instruction..
+	 *
+	 * well, not entirely true, fan-in/out, etc might need to
+	 * generate some extra mov's in edge cases, etc.. probably
+	 * we might want to do depth calculation considering the worst
+	 * case for these??
+ */ + if (!is_meta(instr)) + instr->depth++; + + insert_by_depth(instr); +} + +void ir3_block_depth(struct ir3_block *block) +{ + unsigned i; + + block->head = NULL; + + ir3_clear_mark(block->shader); + for (i = 0; i < block->noutputs; i++) + if (block->outputs[i]) + ir3_instr_depth(block->outputs[i]); + + /* at this point, any unvisited input is unused: */ + for (i = 0; i < block->ninputs; i++) { + struct ir3_instruction *in = block->inputs[i]; + if (in && !ir3_instr_check_mark(in)) + block->inputs[i] = NULL; + } +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_dump.c b/src/gallium/drivers/freedreno/ir3/ir3_dump.c new file mode 100644 index 00000000000..1a6f49d51cd --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_dump.c @@ -0,0 +1,425 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include <stdarg.h> + +#include "ir3.h" + +#define PTRID(x) ((unsigned long)(x)) + +struct ir3_dump_ctx { + FILE *f; + bool verbose; +}; + +static void dump_instr_name(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr) +{ + /* for debugging: */ + if (ctx->verbose) { +#ifdef DEBUG + fprintf(ctx->f, "%04u:", instr->serialno); +#endif + fprintf(ctx->f, "%03u: ", instr->depth); + } + + if (instr->flags & IR3_INSTR_SY) + fprintf(ctx->f, "(sy)"); + if (instr->flags & IR3_INSTR_SS) + fprintf(ctx->f, "(ss)"); + + if (is_meta(instr)) { + switch(instr->opc) { + case OPC_META_PHI: + fprintf(ctx->f, "Φ"); + break; + case OPC_META_DEREF: + fprintf(ctx->f, "(*)"); + break; + default: + /* shouldn't hit here.. 
just for debugging: */ + switch (instr->opc) { + case OPC_META_INPUT: fprintf(ctx->f, "_meta:in"); break; + case OPC_META_OUTPUT: fprintf(ctx->f, "_meta:out"); break; + case OPC_META_FO: fprintf(ctx->f, "_meta:fo"); break; + case OPC_META_FI: fprintf(ctx->f, "_meta:fi"); break; + case OPC_META_FLOW: fprintf(ctx->f, "_meta:flow"); break; + + default: fprintf(ctx->f, "_meta:%d", instr->opc); break; + } + break; + } + } else if (instr->category == 1) { + static const char *type[] = { + [TYPE_F16] = "f16", + [TYPE_F32] = "f32", + [TYPE_U16] = "u16", + [TYPE_U32] = "u32", + [TYPE_S16] = "s16", + [TYPE_S32] = "s32", + [TYPE_U8] = "u8", + [TYPE_S8] = "s8", + }; + if (instr->cat1.src_type == instr->cat1.dst_type) + fprintf(ctx->f, "mov"); + else + fprintf(ctx->f, "cov"); + fprintf(ctx->f, ".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]); + } else { + fprintf(ctx->f, "%s", ir3_instr_name(instr)); + if (instr->flags & IR3_INSTR_3D) + fprintf(ctx->f, ".3d"); + if (instr->flags & IR3_INSTR_A) + fprintf(ctx->f, ".a"); + if (instr->flags & IR3_INSTR_O) + fprintf(ctx->f, ".o"); + if (instr->flags & IR3_INSTR_P) + fprintf(ctx->f, ".p"); + if (instr->flags & IR3_INSTR_S) + fprintf(ctx->f, ".s"); + if (instr->flags & IR3_INSTR_S2EN) + fprintf(ctx->f, ".s2en"); + } +} + +static void dump_reg_name(struct ir3_dump_ctx *ctx, + struct ir3_register *reg) +{ + if ((reg->flags & IR3_REG_ABS) && (reg->flags & IR3_REG_NEGATE)) + fprintf(ctx->f, "(absneg)"); + else if (reg->flags & IR3_REG_NEGATE) + fprintf(ctx->f, "(neg)"); + else if (reg->flags & IR3_REG_ABS) + fprintf(ctx->f, "(abs)"); + + if (reg->flags & IR3_REG_IMMED) { + fprintf(ctx->f, "imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); + } else if (reg->flags & IR3_REG_SSA) { + if (ctx->verbose) { + fprintf(ctx->f, "_["); + dump_instr_name(ctx, reg->instr); + fprintf(ctx->f, "]"); + } + } else { + if (reg->flags & IR3_REG_HALF) + fprintf(ctx->f, "h"); + if (reg->flags & IR3_REG_CONST) + fprintf(ctx->f, "c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]); + else + fprintf(ctx->f, "r%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]); + } +} + +static void ir3_instr_dump(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr); +static void ir3_block_dump(struct ir3_dump_ctx *ctx, + struct ir3_block *block, const char *name); + +static void dump_instr(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr) +{ + /* if we've already visited this instruction, bail now: */ + if (ir3_instr_check_mark(instr)) + return; + + /* some meta-instructions need to be handled specially: */ + if (is_meta(instr)) { + if ((instr->opc == OPC_META_FO) || + (instr->opc == OPC_META_FI)) { + unsigned i; + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if (reg->flags & IR3_REG_SSA) + dump_instr(ctx, reg->instr); + } + } else if (instr->opc == OPC_META_FLOW) { + struct ir3_register *reg = instr->regs[1]; + ir3_block_dump(ctx, instr->flow.if_block, "if"); + if (instr->flow.else_block) + ir3_block_dump(ctx, instr->flow.else_block, "else"); + if (reg->flags & IR3_REG_SSA) + dump_instr(ctx, reg->instr); + } else if ((instr->opc == OPC_META_PHI) || + (instr->opc == OPC_META_DEREF)) { + /* treat like a normal instruction: */ + ir3_instr_dump(ctx, instr); + } + } else { + ir3_instr_dump(ctx, instr); + } +} + +/* arrarraggh! 
if link is to something outside of the current block, we + * need to defer emitting the link until the end of the block, since the + * edge triggers pre-creation of the node it links to inside the cluster, + * even though it is meant to be outside.. + */ +static struct { + char buf[40960]; + unsigned n; +} edge_buf; + +/* helper to print or defer: */ +static void printdef(struct ir3_dump_ctx *ctx, + bool defer, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + if (defer) { + unsigned n = edge_buf.n; + n += vsnprintf(&edge_buf.buf[n], sizeof(edge_buf.buf) - n, + fmt, ap); + edge_buf.n = n; + } else { + vfprintf(ctx->f, fmt, ap); + } + va_end(ap); +} + +static void dump_link2(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr, const char *target, bool defer) +{ + /* some meta-instructions need to be handled specially: */ + if (is_meta(instr)) { + if (instr->opc == OPC_META_INPUT) { + printdef(ctx, defer, "input%lx:<in%u>:w -> %s", + PTRID(instr->inout.block), + instr->regs[0]->num, target); + } else if (instr->opc == OPC_META_FO) { + struct ir3_register *reg = instr->regs[1]; + dump_link2(ctx, reg->instr, target, defer); + printdef(ctx, defer, "[label=\".%c\"]", + "xyzw"[instr->fo.off & 0x3]); + } else if (instr->opc == OPC_META_FI) { + unsigned i; + + /* recursively dump all parents and links */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if (reg->flags & IR3_REG_SSA) { + dump_link2(ctx, reg->instr, target, defer); + printdef(ctx, defer, "[label=\".%c\"]", + "xyzw"[(i - 1) & 0x3]); + } + } + } else if (instr->opc == OPC_META_OUTPUT) { + printdef(ctx, defer, "output%lx:<out%u>:w -> %s", + PTRID(instr->inout.block), + instr->regs[0]->num, target); + } else if ((instr->opc == OPC_META_PHI) || + (instr->opc == OPC_META_DEREF)) { + /* treat like a normal instruction: */ + printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target); + } + } else { + printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target); + } +} + +static void dump_link(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr, + struct ir3_block *block, const char *target) +{ + bool defer = instr->block != block; + dump_link2(ctx, instr, target, defer); + printdef(ctx, defer, "\n"); +} + +static struct ir3_register *follow_flow(struct ir3_register *reg) +{ + if (reg->flags & IR3_REG_SSA) { + struct ir3_instruction *instr = reg->instr; + /* go with the flow.. 
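+		 * (a flow meta just wraps the value in its first src,
+		 * so link to that instead)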
*/ + if (is_meta(instr) && (instr->opc == OPC_META_FLOW)) + return instr->regs[1]; + } + return reg; +} + +static void ir3_instr_dump(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr) +{ + unsigned i; + + fprintf(ctx->f, "instr%lx [shape=record,style=filled,fillcolor=lightgrey,label=\"{", + PTRID(instr)); + dump_instr_name(ctx, instr); + + /* destination register: */ + fprintf(ctx->f, "|<dst0>"); + + /* source register(s): */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = follow_flow(instr->regs[i]); + + fprintf(ctx->f, "|"); + + if (reg->flags & IR3_REG_SSA) + fprintf(ctx->f, "<src%u> ", (i - 1)); + + dump_reg_name(ctx, reg); + } + + fprintf(ctx->f, "}\"];\n"); + + /* and recursively dump dependent instructions: */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + char target[32]; /* link target */ + + if (!(reg->flags & IR3_REG_SSA)) + continue; + + snprintf(target, sizeof(target), "instr%lx:<src%u>", + PTRID(instr), (i - 1)); + + dump_instr(ctx, reg->instr); + dump_link(ctx, follow_flow(reg)->instr, instr->block, target); + } +} + +static void ir3_block_dump(struct ir3_dump_ctx *ctx, + struct ir3_block *block, const char *name) +{ + unsigned i, n; + + n = edge_buf.n; + + fprintf(ctx->f, "subgraph cluster%lx {\n", PTRID(block)); + fprintf(ctx->f, "label=\"%s\";\n", name); + + /* draw inputs: */ + fprintf(ctx->f, "input%lx [shape=record,label=\"inputs", PTRID(block)); + for (i = 0; i < block->ninputs; i++) + if (block->inputs[i]) + fprintf(ctx->f, "|<in%u> i%u.%c", i, (i >> 2), "xyzw"[i & 0x3]); + fprintf(ctx->f, "\"];\n"); + + /* draw instruction graph: */ + for (i = 0; i < block->noutputs; i++) + dump_instr(ctx, block->outputs[i]); + + /* draw outputs: */ + fprintf(ctx->f, "output%lx [shape=record,label=\"outputs", PTRID(block)); + for (i = 0; i < block->noutputs; i++) + fprintf(ctx->f, "|<out%u> o%u.%c", i, (i >> 2), "xyzw"[i & 0x3]); + fprintf(ctx->f, "\"];\n"); + + /* and links to outputs: */ + for (i = 0; i < block->noutputs; i++) { + char target[32]; /* link target */ + + /* NOTE: there could be outputs that are never assigned, + * so skip them + */ + if (!block->outputs[i]) + continue; + + snprintf(target, sizeof(target), "output%lx:<out%u>:e", + PTRID(block), i); + + dump_link(ctx, block->outputs[i], block, target); + } + + fprintf(ctx->f, "}\n"); + + /* and links to inputs: */ + if (block->parent) { + for (i = 0; i < block->ninputs; i++) { + char target[32]; /* link target */ + + if (!block->inputs[i]) + continue; + + dump_instr(ctx, block->inputs[i]); + + snprintf(target, sizeof(target), "input%lx:<in%u>:e", + PTRID(block), i); + + dump_link(ctx, block->inputs[i], block, target); + } + } + + /* dump deferred edges: */ + if (edge_buf.n > n) { + fprintf(ctx->f, "%*s", edge_buf.n - n, &edge_buf.buf[n]); + edge_buf.n = n; + } +} + +void ir3_dump(struct ir3 *shader, const char *name, + struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? 
*/,
+		FILE *f)
+{
+	struct ir3_dump_ctx ctx = {
+			.f = f,
+	};
+	ir3_clear_mark(shader);
+	fprintf(ctx.f, "digraph G {\n");
+	fprintf(ctx.f, "rankdir=RL;\n");
+	fprintf(ctx.f, "nodesep=0.25;\n");
+	fprintf(ctx.f, "ranksep=1.5;\n");
+	ir3_block_dump(&ctx, block, name);
+	fprintf(ctx.f, "}\n");
+}
+
+/*
+ * For Debugging:
+ */
+
+void
+ir3_dump_instr_single(struct ir3_instruction *instr)
+{
+	struct ir3_dump_ctx ctx = {
+			.f = stdout,
+			.verbose = true,
+	};
+	unsigned i;
+
+	dump_instr_name(&ctx, instr);
+	for (i = 0; i < instr->regs_count; i++) {
+		struct ir3_register *reg = instr->regs[i];
+		printf(i ? ", " : " ");
+		dump_reg_name(&ctx, reg);
+	}
+	printf("\n");
+}
+
+void
+ir3_dump_instr_list(struct ir3_instruction *instr)
+{
+	unsigned n = 0;
+
+	while (instr) {
+		ir3_dump_instr_single(instr);
+		if (!is_meta(instr))
+			n++;
+		instr = instr->next;
+	}
+	printf("%u instructions\n", n);
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_flatten.c b/src/gallium/drivers/freedreno/ir3/ir3_flatten.c
new file mode 100644
index 00000000000..9389227034c
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_flatten.c
@@ -0,0 +1,155 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include <stdarg.h>
+
+#include "ir3.h"
+
+/*
+ * Flatten: flatten out legs of if/else, etc
+ *
+ * TODO probably should use some heuristic to decide to not flatten
+ * if one side or the other is too large / deeply nested / whatever?
+ */
+
+struct ir3_flatten_ctx {
+	struct ir3_block *block;
+	unsigned cnt;
+};
+
+static struct ir3_register *unwrap(struct ir3_register *reg)
+{
+
+	if (reg->flags & IR3_REG_SSA) {
+		struct ir3_instruction *instr = reg->instr;
+		if (is_meta(instr)) {
+			switch (instr->opc) {
+			case OPC_META_OUTPUT:
+			case OPC_META_FLOW:
+				if (instr->regs_count > 1)
+					return instr->regs[1];
+				return NULL;
+			default:
+				break;
+			}
+		}
+	}
+	return reg;
+}
+
+static void ir3_instr_flatten(struct ir3_flatten_ctx *ctx,
+		struct ir3_instruction *instr)
+{
+	unsigned i;
+
+	/* if we've already visited this instruction, bail now: */
+	if (ir3_instr_check_mark(instr))
+		return;
+
+	instr->block = ctx->block;
+
+	/* TODO: maybe some threshold to decide whether to
+	 * flatten or not??
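+	 *
+	 * e.g. the PHI for "x = p ? t : f" becomes "sel.b32 x, t, p, f"
+	 * below, so both legs get executed and the result selected.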
+	 */
+	if (is_meta(instr)) {
+		if (instr->opc == OPC_META_PHI) {
+			struct ir3_register *cond, *t, *f;
+
+			cond = unwrap(instr->regs[1]);
+			t    = unwrap(instr->regs[2]);  /* true val */
+			f    = unwrap(instr->regs[3]);  /* false val */
+
+			/* must have cond, but t or f may be null if only written
+			 * on one side of the if/else (in which case we can just
+			 * convert the PHI to a simple move).
+			 */
+			assert(cond);
+			assert(t || f);
+
+			if (t && f) {
+				/* convert the PHI instruction to sel.{b16,b32} */
+				instr->category = 3;
+
+				/* instruction type based on dst size: */
+				if (instr->regs[0]->flags & IR3_REG_HALF)
+					instr->opc = OPC_SEL_B16;
+				else
+					instr->opc = OPC_SEL_B32;
+
+				instr->regs[1] = t;
+				instr->regs[2] = cond;
+				instr->regs[3] = f;
+			} else {
+				/* convert to simple mov: */
+				instr->category = 1;
+				instr->cat1.dst_type = TYPE_F32;
+				instr->cat1.src_type = TYPE_F32;
+				instr->regs_count = 2;
+				instr->regs[1] = t ? t : f;
+			}
+
+			ctx->cnt++;
+		} else if ((instr->opc == OPC_META_INPUT) &&
+				(instr->regs_count == 2)) {
+			type_t ftype;
+
+			if (instr->regs[0]->flags & IR3_REG_HALF)
+				ftype = TYPE_F16;
+			else
+				ftype = TYPE_F32;
+
+			/* convert meta:input to mov: */
+			instr->category = 1;
+			instr->cat1.src_type = ftype;
+			instr->cat1.dst_type = ftype;
+		}
+	}
+
+	/* recursively visit children: */
+	for (i = 1; i < instr->regs_count; i++) {
+		struct ir3_register *src = instr->regs[i];
+		if (src->flags & IR3_REG_SSA)
+			ir3_instr_flatten(ctx, src->instr);
+	}
+}
+
+/* return >= 0 is # of phi's flattened, < 0 is error */
+int ir3_block_flatten(struct ir3_block *block)
+{
+	struct ir3_flatten_ctx ctx = {
+			.block = block,
+	};
+	unsigned i;
+
+	ir3_clear_mark(block->shader);
+	for(i = 0; i < block->noutputs; i++)
+		if (block->outputs[i])
+			ir3_instr_flatten(&ctx, block->outputs[i]);
+
+	return ctx.cnt;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
new file mode 100644
index 00000000000..b916dd51393
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -0,0 +1,790 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+
+#include "ir3.h"
+#include "ir3_visitor.h"
+
+/*
+ * Register Assignment:
+ *
+ * NOTE: currently only works on a single basic block.. need to think
+ * about how multiple basic blocks are going to get scheduled. But
+ * I think I want to re-arrange how blocks work, ie. get rid of the
+ * block nesting thing..
+ *
+ * NOTE: we could do register coalescing (eliminate moves) as part of
+ * the RA step.. OTOH I think we need to do scheduling before register
+ * assignment. And removing a mov affects scheduling (unless we leave
+ * a placeholder nop, which seems lame), so I'm not really sure how
+ * practical it is to do both in a single stage. But OTOH I'm not
+ * really sure of a sane way for the CP stage to realize when it
+ * cannot remove a mov due to multi-register constraints..
+ *
+ */
+
+struct ir3_ra_ctx {
+	struct ir3_block *block;
+	enum shader_t type;
+	bool half_precision;
+	bool frag_coord;
+	bool frag_face;
+	bool has_samp;
+	int cnt;
+	bool error;
+};
+
+/* sorta ugly way to retrofit half-precision support.. rather than
+ * passing extra param around, just OR in a high bit. All the low
+ * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
+ * will continue to work as long as you don't underflow (and that
+ * would go badly anyways).
+ */
+#define REG_HALF  0x8000
+
+struct ir3_ra_assignment {
+	int8_t  off;       /* offset of instruction dst within range */
+	uint8_t num;       /* number of components for the range */
+};
+
+static void ra_assign(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *assigner, int num);
+static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr);
+
+/*
+ * Register Allocation:
+ */
+
+#define REG(n, wm, f) (struct ir3_register){ \
+		.flags  = (f), \
+		.num    = (n), \
+		.wrmask = TGSI_WRITEMASK_ ## wm, \
+	}
+
+/* check that the register exists, is a GPR and is not special (a0/p0) */
+static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n)
+{
+	if ((n < instr->regs_count) && reg_gpr(instr->regs[n]))
+		return instr->regs[n];
+	return NULL;
+}
+
+static int output_base(struct ir3_ra_ctx *ctx)
+{
+	/* ugg, for fragment shader we need to have input at r0.x
+	 * (or at least if there is a way to configure it, I can't
+	 * see how, because the blob driver always uses r0.x (ie.
+	 * all zeros))
+	 */
+	if (ctx->type == SHADER_FRAGMENT) {
+		if (ctx->half_precision)
+			return ctx->frag_face ? 4 : 3;
+		return ctx->frag_coord ? 8 : 4;
+	}
+	return 0;
+}
+
+/* live means read before written */
+static void compute_liveregs(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr, regmask_t *liveregs)
+{
+	struct ir3_block *block = instr->block;
+	regmask_t written;
+	unsigned i, j;
+
+	regmask_init(liveregs);
+	regmask_init(&written);
+
+	for (instr = instr->next; instr; instr = instr->next) {
+		struct ir3_register *r;
+
+		if (is_meta(instr))
+			continue;
+
+		/* check first src's read: */
+		for (j = 1; j < instr->regs_count; j++) {
+			r = reg_check(instr, j);
+			if (r)
+				regmask_set_if_not(liveregs, r, &written);
+		}
+
+		/* then dst written (if assigned already): */
+		if (instr->flags & IR3_INSTR_MARK) {
+			r = reg_check(instr, 0);
+			if (r)
+				regmask_set(&written, r);
+		}
+	}
+
+	/* be sure to account for output registers too: */
+	for (i = 0; i < block->noutputs; i++) {
+		struct ir3_register reg = REG(output_base(ctx) + i, X, 0);
+		regmask_set_if_not(liveregs, &reg, &written);
+	}
+}
+
+/* calculate registers that are clobbered before last use of 'assigner'.
+ * This needs to be done backwards, although it could possibly be
+ * combined into compute_liveregs(). (Ie. compute_liveregs() could
+ * reverse the list, then do this part backwards reversing the list
+ * again back to original order.) Otoh, probably I should try to
+ * construct a proper interference graph instead.
+ *
+ * XXX this needs to follow the same recursion path that is used
+ * to rename/assign registers (ie. ra_assign_src()).. this is a bit
+ * ugly right now, maybe refactor into node iterator sort of thing
+ * that iterates nodes in the correct order?
+ */
+static bool compute_clobbers(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr, struct ir3_instruction *assigner,
+		regmask_t *liveregs)
+{
+	unsigned i;
+	bool live = false, was_live = false;
+
+	if (instr == NULL) {
+		struct ir3_block *block = ctx->block;
+
+		/* if at the end, check outputs: */
+		for (i = 0; i < block->noutputs; i++)
+			if (block->outputs[i] == assigner)
+				return true;
+		return false;
+	}
+
+	for (i = 1; i < instr->regs_count; i++) {
+		struct ir3_register *reg = instr->regs[i];
+		if ((reg->flags & IR3_REG_SSA) && (reg->instr == assigner)) {
+			if (is_meta(instr)) {
+				switch (instr->opc) {
+				case OPC_META_INPUT:
+					// TODO
+					assert(0);
+					break;
+				case OPC_META_FO:
+				case OPC_META_FI:
+					was_live |= compute_clobbers(ctx, instr->next,
+							instr, liveregs);
+					break;
+				default:
+					break;
+				}
+			}
+			live = true;
+			break;
+		}
+	}
+
+	was_live |= compute_clobbers(ctx, instr->next, assigner, liveregs);
+
+	if (was_live && (instr->regs_count > 0) &&
+			(instr->flags & IR3_INSTR_MARK) &&
+			!is_meta(instr))
+		regmask_set(liveregs, instr->regs[0]);
+
+	return live || was_live;
+}
+
+static int find_available(regmask_t *liveregs, int size, bool half)
+{
+	unsigned i;
+	unsigned f = half ? IR3_REG_HALF : 0;
+	for (i = 0; i < MAX_REG - size; i++) {
+		if (!regmask_get(liveregs, &REG(i, X, f))) {
+			unsigned start = i++;
+			for (; (i < MAX_REG) && ((i - start) < size); i++)
+				if (regmask_get(liveregs, &REG(i, X, f)))
+					break;
+			if ((i - start) >= size)
+				return start;
+		}
+	}
+	assert(0);
+	return -1;
+}
+
+static int alloc_block(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr, int size)
+{
+	if (!instr) {
+		/* special case, allocating shader outputs. At this
+		 * point, nothing is allocated, just start the shader
+		 * outputs at r0.x and let compute_liveregs() take
+		 * care of the rest from here:
+		 */
+		return 0;
+	} else {
+		struct ir3_register *dst = instr->regs[0];
+		regmask_t liveregs;
+
+		compute_liveregs(ctx, instr, &liveregs);
+
+		// XXX XXX XXX XXX XXX XXX XXX XXX XXX
+		// XXX hack.. maybe ra_calc should give us a list of
+		// instrs to compute_clobbers() on?
+		if (is_meta(instr) && (instr->opc == OPC_META_INPUT) &&
+				(instr->regs_count == 1)) {
+			unsigned i, base = instr->regs[0]->num & ~0x3;
+			for (i = 0; i < 4; i++) {
+				struct ir3_instruction *in = ctx->block->inputs[base + i];
+				if (in)
+					compute_clobbers(ctx, in->next, in, &liveregs);
+			}
+		} else
+		// XXX XXX XXX XXX XXX XXX XXX XXX XXX
+			compute_clobbers(ctx, instr->next, instr, &liveregs);
+
+		return find_available(&liveregs, size,
+				!!(dst->flags & IR3_REG_HALF));
+	}
+}
+
+/*
+ * Constraint Calculation:
+ */
+
+struct ra_calc_visitor {
+	struct ir3_visitor base;
+	struct ir3_ra_assignment a;
+};
+
+static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v)
+{
+	return (struct ra_calc_visitor *)v;
+}
+
+/* calculate register assignment for the instruction. If the register
+ * written by this instruction is required to be part of a range, to
+ * handle other (input/output/sam/bary.f/etc) contiguous register range
+ * constraints, that is calculated and handled here.
+ */
+static void ra_calc_dst(struct ir3_visitor *v,
+		struct ir3_instruction *instr, struct ir3_register *reg)
+{
+	struct ra_calc_visitor *c = ra_calc_visitor(v);
+	if (is_tex(instr)) {
+		c->a.off = 0;
+		c->a.num = 4;
+	} else {
+		c->a.off = 0;
+		c->a.num = 1;
+	}
+}
+
+static void
+ra_calc_dst_shader_input(struct ir3_visitor *v,
+		struct ir3_instruction *instr, struct ir3_register *reg)
+{
+	struct ra_calc_visitor *c = ra_calc_visitor(v);
+	struct ir3_block *block = instr->block;
+	struct ir3_register *dst = instr->regs[0];
+	unsigned base = dst->num & ~0x3;
+	unsigned i, num = 0;
+
+	assert(!(dst->flags & IR3_REG_IA));
+
+	/* check what input components we need: */
+	for (i = 0; i < 4; i++) {
+		unsigned idx = base + i;
+		if ((idx < block->ninputs) && block->inputs[idx])
+			num = i + 1;
+	}
+
+	c->a.off = dst->num - base;
+	c->a.num = num;
+}
+
+static void ra_calc_src_fanin(struct ir3_visitor *v,
+		struct ir3_instruction *instr, struct ir3_register *reg)
+{
+	struct ra_calc_visitor *c = ra_calc_visitor(v);
+	unsigned srcn = ir3_instr_regno(instr, reg) - 1;
+	c->a.off += srcn;
+	c->a.num += srcn;
+	c->a.num = MAX2(c->a.num, instr->regs_count - 1);
+}
+
+static const struct ir3_visitor_funcs calc_visitor_funcs = {
+	.instr = ir3_visit_instr,
+	.dst_shader_input = ra_calc_dst_shader_input,
+	.dst_fanout = ra_calc_dst,
+	.dst_fanin = ra_calc_dst,
+	.dst = ra_calc_dst,
+	.src_fanout = ir3_visit_reg,
+	.src_fanin = ra_calc_src_fanin,
+	.src = ir3_visit_reg,
+};
+
+static struct ir3_ra_assignment ra_calc(struct ir3_instruction *assigner)
+{
+	struct ra_calc_visitor v = {
+			.base.funcs = &calc_visitor_funcs,
+	};
+
+	ir3_visit_instr(&v.base, assigner);
+
+	return v.a;
+}
+
+/*
+ * Register Assignment:
+ */
+
+struct ra_assign_visitor {
+	struct ir3_visitor base;
+	struct ir3_ra_ctx *ctx;
+	int num;
+};
+
+static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
+{
+	return (struct ra_assign_visitor *)v;
+}
+
+static type_t half_type(type_t type)
+{
+	switch (type) {
+	case TYPE_F32: return TYPE_F16;
+	case TYPE_U32: return TYPE_U16;
+	case TYPE_S32: return TYPE_S16;
+	/*
instructions may already be fixed up: */ + case TYPE_F16: + case TYPE_U16: + case TYPE_S16: + return type; + default: + assert(0); + return ~0; + } +} + +/* some instructions need fix-up if dst register is half precision: */ +static void fixup_half_instr_dst(struct ir3_instruction *instr) +{ + switch (instr->category) { + case 1: /* move instructions */ + instr->cat1.dst_type = half_type(instr->cat1.dst_type); + break; + case 3: + switch (instr->opc) { + case OPC_MAD_F32: + instr->opc = OPC_MAD_F16; + break; + case OPC_SEL_B32: + instr->opc = OPC_SEL_B16; + break; + case OPC_SEL_S32: + instr->opc = OPC_SEL_S16; + break; + case OPC_SEL_F32: + instr->opc = OPC_SEL_F16; + break; + case OPC_SAD_S32: + instr->opc = OPC_SAD_S16; + break; + /* instructions may already be fixed up: */ + case OPC_MAD_F16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + break; + default: + assert(0); + break; + } + break; + case 5: + instr->cat5.type = half_type(instr->cat5.type); + break; + } +} +/* some instructions need fix-up if src register is half precision: */ +static void fixup_half_instr_src(struct ir3_instruction *instr) +{ + switch (instr->category) { + case 1: /* move instructions */ + instr->cat1.src_type = half_type(instr->cat1.src_type); + break; + } +} + +static void ra_assign_reg(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + + if (is_flow(instr) && (instr->opc == OPC_KILL)) + return; + + reg->flags &= ~IR3_REG_SSA; + reg->num = a->num & ~REG_HALF; + + assert(reg->num >= 0); + + if (a->num & REG_HALF) { + reg->flags |= IR3_REG_HALF; + /* if dst reg being assigned, patch up the instr: */ + if (reg == instr->regs[0]) + fixup_half_instr_dst(instr); + else + fixup_half_instr_src(instr); + } +} + +static void ra_assign_dst_shader_input(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + unsigned i, base = reg->num & ~0x3; + int off = base - reg->num; + + ra_assign_reg(v, instr, reg); + reg->flags |= IR3_REG_IA; + + /* trigger assignment of all our companion input components: */ + for (i = 0; i < 4; i++) { + struct ir3_instruction *in = instr->block->inputs[i+base]; + if (in && is_meta(in) && (in->opc == OPC_META_INPUT)) + ra_assign(a->ctx, in, a->num + off + i); + } +} + +static void ra_assign_dst_fanout(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + struct ir3_register *src = instr->regs[1]; + ra_assign_reg(v, instr, reg); + if (src->flags & IR3_REG_SSA) + ra_assign(a->ctx, src->instr, a->num - instr->fo.off); +} + +static void ra_assign_src_fanout(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + ra_assign_reg(v, instr, reg); + ra_assign(a->ctx, instr, a->num + instr->fo.off); +} + + +static void ra_assign_src_fanin(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + unsigned j, srcn = ir3_instr_regno(instr, reg) - 1; + ra_assign_reg(v, instr, reg); + ra_assign(a->ctx, instr, a->num - srcn); + for (j = 1; j < instr->regs_count; j++) { + struct ir3_register *reg = instr->regs[j]; + if (reg->flags & IR3_REG_SSA) /* could be renamed already */ + ra_assign(a->ctx, reg->instr, a->num - srcn + j - 1); + } +} + +static const 
struct ir3_visitor_funcs assign_visitor_funcs = { + .instr = ir3_visit_instr, + .dst_shader_input = ra_assign_dst_shader_input, + .dst_fanout = ra_assign_dst_fanout, + .dst_fanin = ra_assign_reg, + .dst = ra_assign_reg, + .src_fanout = ra_assign_src_fanout, + .src_fanin = ra_assign_src_fanin, + .src = ra_assign_reg, +}; + +static void ra_assign(struct ir3_ra_ctx *ctx, + struct ir3_instruction *assigner, int num) +{ + struct ra_assign_visitor v = { + .base.funcs = &assign_visitor_funcs, + .ctx = ctx, + .num = num, + }; + + /* if we've already visited this instruction, bail now: */ + if (ir3_instr_check_mark(assigner)) { + debug_assert(assigner->regs[0]->num == (num & ~REG_HALF)); + if (assigner->regs[0]->num != (num & ~REG_HALF)) { + /* impossible situation, should have been resolved + * at an earlier stage by inserting extra mov's: + */ + ctx->error = true; + } + return; + } + + ir3_visit_instr(&v.base, assigner); +} + +/* + * + */ + +static void ir3_instr_ra(struct ir3_ra_ctx *ctx, + struct ir3_instruction *instr) +{ + struct ir3_register *dst; + unsigned num; + + /* skip over nop's */ + if (instr->regs_count == 0) + return; + + dst = instr->regs[0]; + + /* if we've already visited this instruction, bail now: */ + if (instr->flags & IR3_INSTR_MARK) + return; + + /* allocate register(s): */ + if (is_addr(instr)) { + num = instr->regs[2]->num; + } else if (reg_gpr(dst)) { + struct ir3_ra_assignment a; + a = ra_calc(instr); + num = alloc_block(ctx, instr, a.num) + a.off; + } else if (dst->flags & IR3_REG_ADDR) { + dst->flags &= ~IR3_REG_ADDR; + num = regid(REG_A0, 0) | REG_HALF; + } else { + /* predicate register (p0).. etc */ + return; + } + + ra_assign(ctx, instr, num); +} + +/* flatten into shader: */ +// XXX this should probably be somewhere else: +static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + struct ir3_instruction *n; + struct ir3 *shader = block->shader; + struct ir3_instruction *end = + ir3_instr_create(block, 0, OPC_END); + struct ir3_instruction *last_input = NULL; + struct ir3_instruction *last_rel = NULL; + regmask_t needs_ss_war; /* write after read */ + regmask_t needs_ss; + regmask_t needs_sy; + + regmask_init(&needs_ss_war); + regmask_init(&needs_ss); + regmask_init(&needs_sy); + + shader->instrs_count = 0; + + for (n = block->head; n; n = n->next) { + struct ir3_register *reg; + unsigned i; + + if (is_meta(n)) + continue; + + for (i = 1; i < n->regs_count; i++) { + reg = n->regs[i]; + + if (reg_gpr(reg)) { + + /* TODO: we probably only need (ss) for alu + * instr consuming sfu result.. need to make + * some tests for both this and (sy).. + */ + if (regmask_get(&needs_ss, reg)) { + n->flags |= IR3_INSTR_SS; + regmask_init(&needs_ss); + } + + if (regmask_get(&needs_sy, reg)) { + n->flags |= IR3_INSTR_SY; + regmask_init(&needs_sy); + } + } + + /* TODO: is it valid to have address reg loaded from a + * relative src (ie. mova a0, c<a0.x+4>)? If so, the + * last_rel check below should be moved ahead of this: + */ + if (reg->flags & IR3_REG_RELATIV) + last_rel = n; + } + + if (n->regs_count > 0) { + reg = n->regs[0]; + if (regmask_get(&needs_ss_war, reg)) { + n->flags |= IR3_INSTR_SS; + regmask_init(&needs_ss_war); // ??? I assume? + } + + if (last_rel && (reg->num == regid(REG_A0, 0))) { + last_rel->flags |= IR3_INSTR_UL; + last_rel = NULL; + } + } + + /* cat5+ does not have an (ss) bit, if needed we need to + * insert a nop to carry the sync flag. 
Would be kinda + * clever if we were aware of this during scheduling, but + * this should be a pretty rare case: + */ + if ((n->flags & IR3_INSTR_SS) && (n->category >= 5)) { + struct ir3_instruction *nop; + nop = ir3_instr_create(block, 0, OPC_NOP); + nop->flags |= IR3_INSTR_SS; + n->flags &= ~IR3_INSTR_SS; + } + + /* need to be able to set (ss) on first instruction: */ + if ((shader->instrs_count == 0) && (n->category >= 5)) + ir3_instr_create(block, 0, OPC_NOP); + + if (is_nop(n) && shader->instrs_count) { + struct ir3_instruction *last = + shader->instrs[shader->instrs_count-1]; + if (is_nop(last) && (last->repeat < 5)) { + last->repeat++; + last->flags |= n->flags; + continue; + } + } + + shader->instrs[shader->instrs_count++] = n; + + if (is_sfu(n)) + regmask_set(&needs_ss, n->regs[0]); + + if (is_tex(n)) { + /* this ends up being the # of samp instructions.. but that + * is ok, everything else only cares whether it is zero or + * not. We do this here, rather than when we encounter a + * SAMP decl, because (especially in binning pass shader) + * the samp instruction(s) could get eliminated if the + * result is not used. + */ + ctx->has_samp = true; + regmask_set(&needs_sy, n->regs[0]); + } + + /* both tex/sfu appear to not always immediately consume + * their src register(s): + */ + if (is_tex(n) || is_sfu(n)) { + for (i = 1; i < n->regs_count; i++) { + reg = n->regs[i]; + if (reg_gpr(reg)) + regmask_set(&needs_ss_war, reg); + } + } + + if (is_input(n)) + last_input = n; + } + + if (last_input) + last_input->regs[0]->flags |= IR3_REG_EI; + + if (last_rel) + last_rel->flags |= IR3_INSTR_UL; + + shader->instrs[shader->instrs_count++] = end; + + shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY; +} + +static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + struct ir3_instruction *n; + + if (!block->parent) { + unsigned i, j; + int base, off = output_base(ctx); + + base = alloc_block(ctx, NULL, block->noutputs + off); + + if (ctx->half_precision) + base |= REG_HALF; + + for (i = 0; i < block->noutputs; i++) + if (block->outputs[i] && !is_kill(block->outputs[i])) + ra_assign(ctx, block->outputs[i], base + i + off); + + if (ctx->type == SHADER_FRAGMENT) { + i = 0; + if (ctx->frag_face) { + /* if we have frag_face, it gets hr0.x */ + ra_assign(ctx, block->inputs[i], REG_HALF | 0); + i += 4; + } + for (j = 0; i < block->ninputs; i++, j++) + if (block->inputs[i]) + ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + j); + } else { + for (i = 0; i < block->ninputs; i++) + if (block->inputs[i]) + ir3_instr_ra(ctx, block->inputs[i]); + } + } + + /* then loop over instruction list and assign registers: + */ + n = block->head; + while (n) { + ir3_instr_ra(ctx, n); + if (ctx->error) + return -1; + n = n->next; + } + + legalize(ctx, block); + + return 0; +} + +int ir3_block_ra(struct ir3_block *block, enum shader_t type, + bool half_precision, bool frag_coord, bool frag_face, + bool *has_samp) +{ + struct ir3_ra_ctx ctx = { + .block = block, + .type = type, + .half_precision = half_precision, + .frag_coord = frag_coord, + .frag_face = frag_face, + }; + int ret; + + ir3_clear_mark(block->shader); + ret = block_ra(&ctx, block); + *has_samp = ctx.has_samp; + + return ret; +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c new file mode 100644 index 00000000000..3ef67731926 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -0,0 +1,401 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
new file mode 100644
index 00000000000..3ef67731926
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -0,0 +1,401 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+enum {
+    SCHEDULED = -1,
+    DELAYED = -2,
+};
+
+/*
+ * Instruction Scheduling:
+ *
+ * Using the depth sorted list from the depth pass, attempt to recursively
+ * schedule the deepest unscheduled path.  The first instruction that cannot
+ * be scheduled returns the number of delay slots it requires, at which
+ * point we return back up to the top and attempt to schedule by the next
+ * highest depth.  After a sufficient number of instructions have been
+ * scheduled, return back to the beginning of the list and start again.  If
+ * you reach the end of the depth sorted list without being able to insert
+ * any instruction, insert nops.  Repeat until there are no more unscheduled
+ * instructions.
+ *
+ * There are a few special cases that need to be handled, since sched
+ * is currently independent of register allocation.  Usages of the address
+ * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
+ * if you have two pairs of instructions that write, then read, the same
+ * special register, those pairs cannot be interleaved.  To solve this,
+ * when we are in such a scheduling "critical section", and we encounter
+ * a conflicting write to a special register, we try to schedule any
+ * remaining instructions that use that value first.
+ */
+
+struct ir3_sched_ctx {
+    struct ir3_instruction *scheduled; /* last scheduled instr */
+    struct ir3_instruction *addr;      /* current a0.x user, if any */
+    struct ir3_instruction *pred;      /* current p0.x user, if any */
+    unsigned cnt;
+};
+
+static struct ir3_instruction *
+deepest(struct ir3_instruction **srcs, unsigned nsrcs)
+{
+    struct ir3_instruction *d = NULL;
+    unsigned i = 0, id = 0;
+
+    while ((i < nsrcs) && !(d = srcs[id = i]))
+        i++;
+
+    if (!d)
+        return NULL;
+
+    for (; i < nsrcs; i++)
+        if (srcs[i] && (srcs[i]->depth > d->depth))
+            d = srcs[id = i];
+
+    srcs[id] = NULL;
+
+    return d;
+}
+
+static unsigned distance(struct ir3_sched_ctx *ctx,
+        struct ir3_instruction *instr, unsigned maxd)
+{
+    struct ir3_instruction *n = ctx->scheduled;
+    unsigned d = 0;
+    while (n && (n != instr) && (d < maxd)) {
+        if (is_alu(n) || is_flow(n))
+            d++;
+        n = n->next;
+    }
+    return d;
+}
+
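
The comment block above describes the scheduling loop compactly: always try
the deepest ready instruction, fall back to shallower ones while delay slots
are unfilled, and emit a nop only when nothing is ready.  A toy model of that
policy (hypothetical node type with precomputed depth and ready-cycle, not
ir3's structures):

    #include <stdio.h>

    struct node { const char *name; int depth; int ready; int done; };

    int main(void)
    {
        struct node n[] = {
            { "a", 3, 0, 0 }, { "b", 2, 3, 0 }, { "c", 1, 0, 0 },
        };
        int cycle = 0, remaining = 3;

        while (remaining) {
            int best = -1, i;
            /* deepest node whose delay slots are already satisfied: */
            for (i = 0; i < 3; i++)
                if (!n[i].done && n[i].ready <= cycle &&
                        (best < 0 || n[i].depth > n[best].depth))
                    best = i;
            if (best >= 0) {
                printf("cycle %d: %s\n", cycle, n[best].name);
                n[best].done = 1;
                remaining--;
            } else {
                printf("cycle %d: nop\n", cycle);  /* nothing ready */
            }
            cycle++;
        }
        return 0;   /* prints: a, c, nop, b */
    }
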
+/* TODO maybe we want a doubly linked list? */
+static struct ir3_instruction * prev(struct ir3_instruction *instr)
+{
+    struct ir3_instruction *p = instr->block->head;
+    while (p && (p->next != instr))
+        p = p->next;
+    return p;
+}
+
+static void schedule(struct ir3_sched_ctx *ctx,
+        struct ir3_instruction *instr, bool remove)
+{
+    struct ir3_block *block = instr->block;
+
+    /* maybe there is a better way to handle this than just stuffing
+     * a nop.. ideally we'd know about this constraint in the
+     * scheduling and depth calculation..
+     */
+    if (ctx->scheduled && is_sfu(ctx->scheduled) && is_sfu(instr))
+        schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
+
+    /* remove from depth list:
+     */
+    if (remove) {
+        struct ir3_instruction *p = prev(instr);
+
+        /* NOTE: this can happen for inputs which are not
+         * read.. in that case there is no need to schedule
+         * the input, so just bail:
+         */
+        if (instr != (p ? p->next : block->head))
+            return;
+
+        if (p)
+            p->next = instr->next;
+        else
+            block->head = instr->next;
+    }
+
+    if (writes_addr(instr)) {
+        assert(ctx->addr == NULL);
+        ctx->addr = instr;
+    }
+
+    if (writes_pred(instr)) {
+        assert(ctx->pred == NULL);
+        ctx->pred = instr;
+    }
+
+    instr->flags |= IR3_INSTR_MARK;
+
+    instr->next = ctx->scheduled;
+    ctx->scheduled = instr;
+
+    ctx->cnt++;
+}
+
+/*
+ * Delay-slot calculation.  Follows fanin/fanout.
+ */
+
+static unsigned delay_calc2(struct ir3_sched_ctx *ctx,
+        struct ir3_instruction *assigner,
+        struct ir3_instruction *consumer, unsigned srcn)
+{
+    unsigned delay = 0;
+
+    if (is_meta(assigner)) {
+        unsigned i;
+        for (i = 1; i < assigner->regs_count; i++) {
+            struct ir3_register *reg = assigner->regs[i];
+            if (reg->flags & IR3_REG_SSA) {
+                unsigned d = delay_calc2(ctx, reg->instr,
+                        consumer, srcn);
+                delay = MAX2(delay, d);
+            }
+        }
+    } else {
+        delay = ir3_delayslots(assigner, consumer, srcn);
+        delay -= distance(ctx, assigner, delay);
+    }
+
+    return delay;
+}
+
+static unsigned delay_calc(struct ir3_sched_ctx *ctx,
+        struct ir3_instruction *instr)
+{
+    unsigned i, delay = 0;
+
+    for (i = 1; i < instr->regs_count; i++) {
+        struct ir3_register *reg = instr->regs[i];
+        if (reg->flags & IR3_REG_SSA) {
+            unsigned d = delay_calc2(ctx, reg->instr,
+                    instr, i - 1);
+            delay = MAX2(delay, d);
+        }
+    }
+
+    return delay;
+}
+
+/* A negative return value (SCHEDULED or DELAYED) signals that an
+ * instruction was handled and we should return back up to the top of
+ * the stack (to block_sched()); a positive value is the number of
+ * delay slots still required:
+ */
+static int trysched(struct ir3_sched_ctx *ctx,
+        struct ir3_instruction *instr)
+{
+    struct ir3_instruction *srcs[ARRAY_SIZE(instr->regs) - 1];
+    struct ir3_instruction *src;
+    unsigned i, delay, nsrcs = 0;
+
+    /* if already scheduled: */
+    if (instr->flags & IR3_INSTR_MARK)
+        return 0;
+
+    /* figure out our src's: */
+    for (i = 1; i < instr->regs_count; i++) {
+        struct ir3_register *reg = instr->regs[i];
+        if (reg->flags & IR3_REG_SSA)
+            srcs[nsrcs++] = reg->instr;
+    }
+
+    /* for each src register in sorted order:
+     */
+    delay = 0;
+    while ((src = deepest(srcs, nsrcs))) {
+        delay = trysched(ctx, src);
+        if (delay)
+            return delay;
+    }
+
+    /* all our dependencies are scheduled, figure out if
+     * we have enough delay slots to schedule ourself:
+     */
+    delay = delay_calc(ctx, instr);
+    if (delay)
+        return delay;
+
+    /* if this is a write to address/predicate register, and that
+     * register is currently in use, we need to defer until it is
+     * free:
+     */
+    if (writes_addr(instr) && ctx->addr) {
+        assert(ctx->addr != instr);
+        return DELAYED;
+    }
+    if (writes_pred(instr) && ctx->pred) {
+        assert(ctx->pred != instr);
+        return DELAYED;
+    }
+
+    schedule(ctx, instr, true);
+    return SCHEDULED;
+}
+
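
The arithmetic in delay_calc2() is worth spelling out: the architectural delay
for a producer/consumer pair is reduced by however many instructions the
scheduler has already placed between them, and only the remainder still needs
filling.  A sketch with made-up slot counts (the real per-opcode numbers live
in ir3_delayslots()):

    #include <stdio.h>

    static unsigned effective_delay(unsigned needed, unsigned distance)
    {
        return (distance >= needed) ? 0 : needed - distance;
    }

    int main(void)
    {
        /* e.g. a result needing 3 slots, with 1 instruction in between: */
        printf("%u more slots to fill\n", effective_delay(3, 1));  /* 2 */
        return 0;
    }
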
+static struct ir3_instruction * reverse(struct ir3_instruction *instr)
+{
+    struct ir3_instruction *reversed = NULL;
+    while (instr) {
+        struct ir3_instruction *next = instr->next;
+        instr->next = reversed;
+        reversed = instr;
+        instr = next;
+    }
+    return reversed;
+}
+
+static bool uses_current_addr(struct ir3_sched_ctx *ctx,
+        struct ir3_instruction *instr)
+{
+    unsigned i;
+    for (i = 1; i < instr->regs_count; i++) {
+        struct ir3_register *reg = instr->regs[i];
+        if (reg->flags & IR3_REG_SSA) {
+            if (is_addr(reg->instr)) {
+                struct ir3_instruction *addr;
+                addr = reg->instr->regs[1]->instr; /* the mova */
+                if (ctx->addr == addr)
+                    return true;
+            }
+        }
+    }
+    return false;
+}
+
+static bool uses_current_pred(struct ir3_sched_ctx *ctx,
+        struct ir3_instruction *instr)
+{
+    unsigned i;
+    for (i = 1; i < instr->regs_count; i++) {
+        struct ir3_register *reg = instr->regs[i];
+        if ((reg->flags & IR3_REG_SSA) && (ctx->pred == reg->instr))
+            return true;
+    }
+    return false;
+}
+
+/* when we encounter an instruction that writes to the address register
+ * while it is in use, we delay that instruction and try to schedule all
+ * other instructions using the current address register first:
+ */
+static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
+        struct ir3_block *block)
+{
+    struct ir3_instruction *instr = block->head;
+    bool addr_in_use = false;
+    bool pred_in_use = false;
+    unsigned cnt = ~0;
+
+    while (instr) {
+        struct ir3_instruction *next = instr->next;
+        bool addr = uses_current_addr(ctx, instr);
+        bool pred = uses_current_pred(ctx, instr);
+
+        if (addr || pred) {
+            int ret = trysched(ctx, instr);
+            if (ret == SCHEDULED)
+                cnt = 0;
+            else if (ret > 0)
+                cnt = MIN2(cnt, ret);
+            if (addr)
+                addr_in_use = true;
+            if (pred)
+                pred_in_use = true;
+        }
+
+        instr = next;
+    }
+
+    if (!addr_in_use)
+        ctx->addr = NULL;
+
+    if (!pred_in_use)
+        ctx->pred = NULL;
+
+    return cnt;
+}
+
+static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+    struct ir3_instruction *instr;
+
+    /* schedule all the shader inputs (meta-instr) first so that
+     * the RA step sees that the input registers contain a value
+     * from the start of the shader:
+     */
+    if (!block->parent) {
+        unsigned i;
+        for (i = 0; i < block->ninputs; i++) {
+            struct ir3_instruction *in = block->inputs[i];
+            if (in)
+                schedule(ctx, in, true);
+        }
+    }
+
+    while ((instr = block->head)) {
+        /* NOTE: always grab next *before* trysched(), in case the
+         * instruction is actually scheduled (and therefore moved
+         * from depth list into scheduled list)
+         */
+        struct ir3_instruction *next = instr->next;
+        int cnt = trysched(ctx, instr);
+
+        if (cnt == DELAYED)
+            cnt = block_sched_undelayed(ctx, block);
+
+        /* -1 is signal to return up stack, but to us means same as 0: */
+        cnt = MAX2(0, cnt);
+        cnt += ctx->cnt;
+        instr = next;
+
+        /* if the deepest remaining instruction cannot be scheduled, try
+         * increasingly shallower instructions until the needed number
+         * of delay slots is filled:
+         */
+        while (instr && (cnt > ctx->cnt)) {
+            next = instr->next;
+            trysched(ctx, instr);
+            instr = next;
+        }
+
+        /* and if we run out of instructions that can be scheduled,
+         * then it is time for nops:
+         */
+        while (cnt > ctx->cnt)
+            schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
+    }
+
+    /* at this point, the scheduled list is in reverse order, so fix that: */
+    block->head = reverse(ctx->scheduled);
+}
+
+void ir3_block_sched(struct ir3_block *block)
+{
+    struct ir3_sched_ctx ctx = {0};
+    ir3_clear_mark(block->shader);
+    block_sched(&ctx, block);
+}
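
Note the list discipline here: schedule() pushes each chosen instruction onto
the front of ctx->scheduled, so the finished list is in reverse program order,
and reverse() restores it in one O(n) pass.  The same idiom on a generic
singly linked list (hypothetical node type, not ir3's):

    #include <stdio.h>

    struct node { int id; struct node *next; };

    static struct node * reverse(struct node *n)
    {
        struct node *out = NULL;
        while (n) {
            struct node *next = n->next;
            n->next = out;     /* push onto the reversed list */
            out = n;
            n = next;
        }
        return out;
    }

    int main(void)
    {
        struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
        struct node *n;
        for (n = reverse(&a); n; n = n->next)
            printf("%d\n", n->id);   /* 3 2 1 */
        return 0;
    }
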
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
new file mode 100644
index 00000000000..ddf99dbc46e
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -0,0 +1,211 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "pipe/p_state.h"
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "freedreno_context.h"
+#include "freedreno_lowering.h"
+#include "freedreno_util.h"
+
+#include "ir3_shader.h"
+#include "ir3_compiler.h"
+
+
+static void
+delete_variant(struct ir3_shader_variant *v)
+{
+    ir3_destroy(v->ir);
+    fd_bo_del(v->bo);
+    free(v);
+}
+
+static void
+assemble_variant(struct ir3_shader_variant *v)
+{
+    struct fd_context *ctx = fd_context(v->shader->pctx);
+    uint32_t sz, *bin;
+
+    bin = ir3_assemble(v->ir, &v->info);
+    sz = v->info.sizedwords * 4;
+
+    v->bo = fd_bo_new(ctx->dev, sz,
+            DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
+            DRM_FREEDRENO_GEM_TYPE_KMEM);
+
+    memcpy(fd_bo_map(v->bo), bin, sz);
+
+    free(bin);
+
+    v->instrlen = v->info.sizedwords / 8;
+    v->constlen = v->info.max_const + 1;
+}
+
+/* for vertex shader, the inputs are loaded into registers before the shader
+ * is executed, so max_regs from the shader instructions might not properly
+ * reflect the # of registers actually used:
+ */
+static void
+fixup_vp_regfootprint(struct ir3_shader_variant *v)
+{
+    unsigned i;
+    for (i = 0; i < v->inputs_count; i++) {
+        if (v->inputs[i].compmask) {
+            uint32_t regid = (v->inputs[i].regid + 3) >> 2;
+            v->info.max_reg = MAX2(v->info.max_reg, regid);
+        }
+    }
+    for (i = 0; i < v->outputs_count; i++) {
+        uint32_t regid = (v->outputs[i].regid + 3) >> 2;
+        v->info.max_reg = MAX2(v->info.max_reg, regid);
+    }
+}
+
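
The footprint math above assumes each input/output occupies a full vec4
starting at its scalar regid: the last scalar it could touch is regid+3, so
(regid + 3) >> 2 is the vec4 register holding that last component.  A quick
standalone check of the arithmetic:

    #include <stdio.h>

    int main(void)
    {
        unsigned regid;
        for (regid = 0; regid < 8; regid++)
            printf("vec4 starting at r%u.%c -> footprint up to vec4 r%u\n",
                    regid >> 2, "xyzw"[regid & 3], (regid + 3) >> 2);
        return 0;
    }
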
+static struct ir3_shader_variant *
+create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
+{
+    struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant);
+    const struct tgsi_token *tokens = shader->tokens;
+    int ret;
+
+    if (!v)
+        return NULL;
+
+    v->shader = shader;
+    v->key = key;
+    v->type = shader->type;
+
+    if (fd_mesa_debug & FD_DBG_DISASM) {
+        DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", shader->type,
+            key.binning_pass, key.color_two_side, key.half_precision);
+        tgsi_dump(tokens, 0);
+    }
+
+    if (!(fd_mesa_debug & FD_DBG_NOOPT)) {
+        ret = ir3_compile_shader(v, tokens, key);
+        if (ret) {
+            debug_error("new compiler failed, trying fallback!");
+
+            v->inputs_count = 0;
+            v->outputs_count = 0;
+            v->total_in = 0;
+            v->has_samp = false;
+            v->immediates_count = 0;
+        }
+    } else {
+        ret = -1;  /* force fallback to old compiler */
+    }
+
+    if (ret)
+        ret = ir3_compile_shader_old(v, tokens, key);
+
+    if (ret) {
+        debug_error("compile failed!");
+        goto fail;
+    }
+
+    assemble_variant(v);
+    if (!v->bo) {
+        debug_error("assemble failed!");
+        goto fail;
+    }
+
+    if (shader->type == SHADER_VERTEX)
+        fixup_vp_regfootprint(v);
+
+    if (fd_mesa_debug & FD_DBG_DISASM) {
+        DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
+            key.binning_pass, key.color_two_side, key.half_precision);
+        disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type);
+    }
+
+    return v;
+
+fail:
+    delete_variant(v);
+    return NULL;
+}
+
+struct ir3_shader_variant *
+ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key)
+{
+    struct ir3_shader_variant *v;
+
+    /* some shader key values only apply to vertex or frag shader,
+     * so normalize the key to avoid constructing multiple identical
+     * variants:
+     */
+    if (shader->type == SHADER_FRAGMENT) {
+        key.binning_pass = false;
+    }
+    if (shader->type == SHADER_VERTEX) {
+        key.color_two_side = false;
+        key.half_precision = false;
+    }
+
+    for (v = shader->variants; v; v = v->next)
+        if (!memcmp(&key, &v->key, sizeof(key)))
+            return v;
+
+    /* compile new variant if it doesn't exist already (note that
+     * create_variant() can fail, so check before linking it in):
+     */
+    v = create_variant(shader, key);
+    if (v) {
+        v->next = shader->variants;
+        shader->variants = v;
+    }
+
+    return v;
+}
+
+
+void
+ir3_shader_destroy(struct ir3_shader *shader)
+{
+    struct ir3_shader_variant *v, *t;
+    for (v = shader->variants; v; ) {
+        t = v;
+        v = v->next;
+        delete_variant(t);
+    }
+    free((void *)shader->tokens);
+    free(shader);
+}
+
+struct ir3_shader *
+ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens,
+        enum shader_t type)
+{
+    struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
+    shader->pctx = pctx;
+    shader->type = type;
+    shader->tokens = tgsi_dup_tokens(tokens);
+    return shader;
+}
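
ir3_shader_variant() above is a small keyed cache: normalize the key, then
memcmp() it against each existing variant before compiling a new one.  The
normalization step is what makes the memcmp() valid -- bits that don't apply
to the shader stage must be zeroed, or identical variants would be compiled
twice.  A stripped-down sketch with a hypothetical two-flag key (memset()
used so the comparison never sees stale bits):

    #include <stdio.h>
    #include <string.h>

    struct key { unsigned binning_pass : 1; unsigned half_precision : 1; };
    struct variant { struct key key; struct variant *next; };

    static struct variant * lookup(struct variant *list, struct key k)
    {
        for (; list; list = list->next)
            if (memcmp(&k, &list->key, sizeof(k)) == 0)
                return list;
        return NULL;
    }

    int main(void)
    {
        struct variant v0, v1;
        struct key k;

        memset(&v0, 0, sizeof(v0));   /* variant with the default key */
        memset(&v1, 0, sizeof(v1));
        v1.key.binning_pass = 1;      /* binning-pass variant */
        v0.next = &v1;

        memset(&k, 0, sizeof(k));
        k.binning_pass = 1;

        printf("cache hit: %s\n", lookup(&v0, k) ? "yes" : "no");  /* yes */
        return 0;
    }
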
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
new file mode 100644
index 00000000000..1a91fcbcb13
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -0,0 +1,163 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#ifndef IR3_SHADER_H_
+#define IR3_SHADER_H_
+
+#include "ir3.h"
+#include "disasm.h"
+
+typedef uint16_t ir3_semantic;  /* semantic name + index */
+static inline ir3_semantic
+ir3_semantic_name(uint8_t name, uint16_t index)
+{
+    return (name << 8) | (index & 0xff);
+}
+
+static inline uint8_t sem2name(ir3_semantic sem)
+{
+    return sem >> 8;
+}
+
+static inline uint16_t sem2idx(ir3_semantic sem)
+{
+    return sem & 0xff;
+}
+
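
The helpers above pack a semantic name and index into 16 bits, name in the
high byte and index in the low byte -- which also means the index is limited
to 255.  A round-trip check of the packing (a standalone copy of the same
shifts, not the header itself):

    #include <assert.h>
    #include <stdint.h>

    typedef uint16_t ir3_semantic;

    static ir3_semantic pack(uint8_t name, uint16_t index)
    {
        return (name << 8) | (index & 0xff);
    }

    int main(void)
    {
        ir3_semantic sem = pack(5 /* e.g. a GENERIC name */, 3);
        assert((sem >> 8) == 5);     /* what sem2name() returns */
        assert((sem & 0xff) == 3);   /* what sem2idx() returns */
        return 0;
    }
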
+/* Configuration key used to identify a shader variant.. different
+ * shader variants can be used to implement features not supported
+ * in hw (two sided color), binning-pass vertex shader, etc.
+ */
+struct ir3_shader_key {
+    /* vertex shader variant parameters: */
+    unsigned binning_pass : 1;
+
+    /* fragment shader variant parameters: */
+    unsigned color_two_side : 1;
+    unsigned half_precision : 1;
+};
+
+struct ir3_shader_variant {
+    struct fd_bo *bo;
+
+    struct ir3_shader_key key;
+
+    struct ir3_info info;
+    struct ir3 *ir;
+
+    /* the instructions length is in units of instruction groups
+     * (4 instructions, 8 dwords):
+     */
+    unsigned instrlen;
+
+    /* the constants length is in units of vec4's, and is the sum of
+     * the uniforms and the built-in compiler constants
+     */
+    unsigned constlen;
+
+    /* About Linkage:
+     *   + Let the frag shader determine the position/compmask for the
+     *     varyings, since it is the place where we know if the varying
+     *     is actually used, and if so, which components are used.  So
+     *     what the hw calls "outloc" is taken from the "inloc" of the
+     *     frag shader.
+     *   + From the vert shader, we only need the output regid
+     */
+
+    /* for frag shader, pos_regid holds the frag_pos, ie. what is passed
+     * to bary.f instructions
+     */
+    uint8_t pos_regid;
+    bool frag_coord, frag_face;
+
+    /* varyings/outputs: */
+    unsigned outputs_count;
+    struct {
+        ir3_semantic semantic;
+        uint8_t regid;
+    } outputs[16 + 2];  /* +POSITION +PSIZE */
+    bool writes_pos, writes_psize;
+
+    /* vertices/inputs: */
+    unsigned inputs_count;
+    struct {
+        ir3_semantic semantic;
+        uint8_t regid;
+        uint8_t compmask;
+        uint8_t ncomp;
+        /* in theory inloc of fs should match outloc of vs: */
+        uint8_t inloc;
+        uint8_t bary;
+    } inputs[16 + 2];  /* +POSITION +FACE */
+
+    unsigned total_in;  /* sum of inputs (scalar) */
+
+    /* do we have one or more texture sample instructions: */
+    bool has_samp;
+
+    /* const reg # of first immediate, ie. 1 == c1
+     * (not regid, because TGSI thinks in terms of vec4 registers,
+     * not scalar registers)
+     */
+    unsigned first_immediate;
+    unsigned immediates_count;
+    struct {
+        uint32_t val[4];
+    } immediates[64];
+
+    /* shader variants form a linked list: */
+    struct ir3_shader_variant *next;
+
+    /* replicated here to avoid passing extra ptrs everywhere: */
+    enum shader_t type;
+    struct ir3_shader *shader;
+};
+
+struct ir3_shader {
+    enum shader_t type;
+
+    struct pipe_context *pctx;
+    const struct tgsi_token *tokens;
+
+    struct ir3_shader_variant *variants;
+
+    /* so far, only used for blit_prog shader.. values for
+     * VPC_VARYING_INTERP[i].MODE and VPC_VARYING_PS_REPL[i].MODE
+     */
+    uint32_t vinterp[4], vpsrepl[4];
+};
+
+
+struct ir3_shader * ir3_shader_create(struct pipe_context *pctx,
+        const struct tgsi_token *tokens, enum shader_t type);
+void ir3_shader_destroy(struct ir3_shader *shader);
+
+struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
+        struct ir3_shader_key key);
+
+#endif /* IR3_SHADER_H_ */
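
The instrlen/constlen fields in the variant are in hardware units rather than
dwords: groups of 4 instructions (8 dwords), and vec4 constant slots.  The
conversions, as done by assemble_variant() in ir3_shader.c above (note the
integer division; sizes here are made up for illustration):

    #include <stdio.h>

    int main(void)
    {
        unsigned sizedwords = 96;  /* 48 64-bit instructions */
        unsigned max_const = 7;    /* highest vec4 const referenced */

        printf("instrlen = %u groups\n", sizedwords / 8);  /* 12 */
        printf("constlen = %u vec4s\n", max_const + 1);    /* 8 */
        return 0;
    }
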
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_visitor.h b/src/gallium/drivers/freedreno/ir3/ir3_visitor.h
new file mode 100644
index 00000000000..1c60d1620ca
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_visitor.h
@@ -0,0 +1,154 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#ifndef IR3_VISITOR_H_
+#define IR3_VISITOR_H_
+
+/**
+ * Visitor which follows dst to src relationships between instructions,
+ * first visiting the dst (writer) instruction, followed by the src
+ * (reader) instruction(s).
+ *
+ * TODO maybe we want multiple different visitors to walk the
+ * graph in different ways?
+ */
+
+struct ir3_visitor;
+
+typedef void (*ir3_visit_instr_func)(struct ir3_visitor *v,
+        struct ir3_instruction *instr);
+
+typedef void (*ir3_visit_reg_func)(struct ir3_visitor *v,
+        struct ir3_instruction *instr, struct ir3_register *reg);
+
+struct ir3_visitor_funcs {
+    ir3_visit_instr_func instr;  // TODO do we need??
+
+    ir3_visit_reg_func dst_shader_input;
+    ir3_visit_reg_func dst_block_input;
+    ir3_visit_reg_func dst_fanout;
+    ir3_visit_reg_func dst_fanin;
+    ir3_visit_reg_func dst;
+
+    ir3_visit_reg_func src_block_input;
+    ir3_visit_reg_func src_fanout;
+    ir3_visit_reg_func src_fanin;
+    ir3_visit_reg_func src;
+};
+
+struct ir3_visitor {
+    const struct ir3_visitor_funcs *funcs;
+    bool error;
+};
+
+#include "util/u_debug.h"
+
+static void visit_instr_dst(struct ir3_visitor *v,
+        struct ir3_instruction *instr)
+{
+    struct ir3_register *reg = instr->regs[0];
+
+    if (is_meta(instr)) {
+        switch (instr->opc) {
+        case OPC_META_INPUT:
+            if (instr->regs_count == 1)
+                v->funcs->dst_shader_input(v, instr, reg);
+            else
+                v->funcs->dst_block_input(v, instr, reg);
+            return;
+        case OPC_META_FO:
+            v->funcs->dst_fanout(v, instr, reg);
+            return;
+        case OPC_META_FI:
+            v->funcs->dst_fanin(v, instr, reg);
+            return;
+        default:
+            break;
+        }
+    }
+
+    v->funcs->dst(v, instr, reg);
+}
+
+static void visit_instr_src(struct ir3_visitor *v,
+        struct ir3_instruction *instr, struct ir3_register *reg)
+{
+    if (is_meta(instr)) {
+        switch (instr->opc) {
+        case OPC_META_INPUT:
+            /* shader-input does not have a src, only block input: */
+            debug_assert(instr->regs_count == 2);
+            v->funcs->src_block_input(v, instr, reg);
+            return;
+        case OPC_META_FO:
+            v->funcs->src_fanout(v, instr, reg);
+            return;
+        case OPC_META_FI:
+            v->funcs->src_fanin(v, instr, reg);
+            return;
+        default:
+            break;
+        }
+    }
+
+    v->funcs->src(v, instr, reg);
+}
+
+static void ir3_visit_instr(struct ir3_visitor *v,
+        struct ir3_instruction *instr)
+{
+    struct ir3_instruction *n;
+
+    /* visit instruction that assigns value: */
+    if (instr->regs_count > 0)
+        visit_instr_dst(v, instr);
+
+    /* and any following instructions which read that value: */
+    n = instr->next;
+    while (n && !v->error) {
+        unsigned i;
+
+        for (i = 1; i < n->regs_count; i++) {
+            struct ir3_register *reg = n->regs[i];
+            if ((reg->flags & IR3_REG_SSA) && (reg->instr == instr))
+                visit_instr_src(v, n, reg);
+        }
+
+        n = n->next;
+    }
+}
+
+static void ir3_visit_reg(struct ir3_visitor *v,
+        struct ir3_instruction *instr, struct ir3_register *reg)
+{
+    /* no-op */
+}
+
+#endif /* IR3_VISITOR_H_ */
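
The visitor is used by embedding struct ir3_visitor as the first member of a
larger context struct and downcasting in the callbacks, the way ir3_ra.c
builds its ra_assign_visitor.  A minimal standalone version of that shape
(toy types, a single callback instead of the full funcs table):

    #include <stdio.h>

    struct visitor;
    struct visitor_funcs { void (*reg)(struct visitor *v, int reg); };
    struct visitor { const struct visitor_funcs *funcs; };

    struct counting_visitor {
        struct visitor base;   /* first member, so the downcast is valid */
        int count;
    };

    static void count_reg(struct visitor *v, int reg)
    {
        struct counting_visitor *cv = (struct counting_visitor *)v;
        cv->count++;
        (void)reg;
    }

    static const struct visitor_funcs counting_funcs = { .reg = count_reg };

    int main(void)
    {
        struct counting_visitor cv = { { &counting_funcs }, 0 };
        cv.base.funcs->reg(&cv.base, 5);
        cv.base.funcs->reg(&cv.base, 7);
        printf("visited %d regs\n", cv.count);   /* 2 */
        return 0;
    }
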