author     Alyssa Rosenzweig <[email protected]>    2019-07-10 10:33:24 -0700
committer  Alyssa Rosenzweig <[email protected]>    2019-07-10 10:43:23 -0700
commit     ec2a59cd7aa42652645e76e29a72335370c80e50 (patch)
tree       08e75a12d073cc627307bc59ab3a8d057244b68e /src/panfrost
parent     a2d0ea92ba752c62e59aa681acda7b97fc86d100 (diff)
panfrost: Move non-Gallium files outside of Gallium
In preparation for a Panfrost-based non-Gallium driver (maybe
Vulkan...?), hoist everything except for the Gallium driver into a
shared src/panfrost. Practically, that means the compilers, the headers,
and pandecode.
Signed-off-by: Alyssa Rosenzweig <[email protected]>
Diffstat (limited to 'src/panfrost')
36 files changed, 15142 insertions, 1 deletion
diff --git a/src/panfrost/bifrost/bifrost.h b/src/panfrost/bifrost/bifrost.h new file mode 100644 index 00000000000..aa382b43be7 --- /dev/null +++ b/src/panfrost/bifrost/bifrost.h @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2019 Connor Abbott <[email protected]> + * Copyright (C) 2019 Lyude Paul <[email protected]> + * Copyright (C) 2019 Ryan Houdek <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __bifrost_h__ +#define __bifrost_h__ + +#include <stdint.h> +#include <stdbool.h> + +struct bifrost_header { + unsigned unk0 : 7; + // If true, convert any infinite result of any floating-point operation to + // the biggest representable number. + unsigned suppress_inf: 1; + // Convert any NaN results to 0. + unsigned suppress_nan : 1; + unsigned unk1 : 2; + // true if the execution mask of the next clause is the same as the mask of + // the current clause. + unsigned back_to_back : 1; + unsigned no_end_of_shader: 1; + unsigned unk2 : 2; + // Set to true for fragment shaders, to implement this bit of spec text + // from section 7.1.5 of the GLSL ES spec: + // + // "Stores to image and buffer variables performed by helper invocations + // have no effect on the underlying image or buffer memory." + // + // Helper invocations are threads (invocations) corresponding to pixels in + // a quad that aren't actually part of the triangle, but are included to + // make derivatives work correctly. They're usually turned on, but they + // need to be masked off for GLSL-level stores. This bit seems to be the + // only bit that's actually different between fragment shaders and other + // shaders, so this is probably what it's doing. + unsigned elide_writes : 1; + // If backToBack is off: + // - true for conditional branches and fallthrough + // - false for unconditional branches + // The blob seems to always set it to true if back-to-back is on. + unsigned branch_cond : 1; + // This bit is set when the next clause writes to the data register of some + // previous clause. + unsigned datareg_writebarrier: 1; + unsigned datareg : 6; + unsigned scoreboard_deps: 8; + unsigned scoreboard_index: 3; + unsigned clause_type: 4; + unsigned unk3 : 1; // part of clauseType? + unsigned next_clause_type: 4; + unsigned unk4 : 1; // part of nextClauseType? 
+}; + +struct bifrost_fma_inst { + unsigned src0 : 3; + unsigned op : 20; +}; + +struct bifrost_add_inst { + unsigned src0 : 3; + unsigned op : 17; +}; + +#endif diff --git a/src/panfrost/bifrost/cmdline.c b/src/panfrost/bifrost/cmdline.c new file mode 100644 index 00000000000..9fd7094ebe1 --- /dev/null +++ b/src/panfrost/bifrost/cmdline.c @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2019 Ryan Houdek <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "main/mtypes.h" +#include "compiler/glsl/standalone.h" +#include "compiler/glsl/glsl_to_nir.h" +#include "compiler/nir_types.h" +#include "disassemble.h" +#include "util/u_dynarray.h" + +static void +disassemble(const char *filename) +{ + FILE *fp = fopen(filename, "rb"); + assert(fp); + + fseek(fp, 0, SEEK_END); + int filesize = ftell(fp); + rewind(fp); + + unsigned char *code = malloc(filesize); + int res = fread(code, 1, filesize, fp); + if (res != filesize) { + printf("Couldn't read full file\n"); + } + fclose(fp); + + disassemble_bifrost(code, filesize, false); + free(code); +} + +int +main(int argc, char **argv) +{ + if (argc < 2) { + printf("Pass a command\n"); + exit(1); + } + if (strcmp(argv[1], "disasm") == 0) { + disassemble(argv[2]); + } + return 0; +} diff --git a/src/panfrost/bifrost/disassemble.c b/src/panfrost/bifrost/disassemble.c new file mode 100644 index 00000000000..03ade19a689 --- /dev/null +++ b/src/panfrost/bifrost/disassemble.c @@ -0,0 +1,2227 @@ +/* + * Copyright (C) 2019 Connor Abbott <[email protected]> + * Copyright (C) 2019 Lyude Paul <[email protected]> + * Copyright (C) 2019 Ryan Houdek <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdbool.h> +#include <stdio.h> +#include <stdint.h> +#include <assert.h> +#include <inttypes.h> +#include <string.h> + +#include "bifrost.h" +#include "disassemble.h" +#include "util/macros.h" + +// return bits (high, lo] +static uint64_t bits(uint32_t word, unsigned lo, unsigned high) +{ + if (high == 32) + return word >> lo; + return (word & ((1 << high) - 1)) >> lo; +} + +// each of these structs represents an instruction that's dispatched in one +// cycle. Note that these instructions are packed in funny ways within the +// clause, hence the need for a separate struct. +struct bifrost_alu_inst { + uint32_t fma_bits; + uint32_t add_bits; + uint64_t reg_bits; +}; + +struct bifrost_regs { + unsigned uniform_const : 8; + unsigned reg2 : 6; + unsigned reg3 : 6; + unsigned reg0 : 5; + unsigned reg1 : 6; + unsigned ctrl : 4; +}; + +static unsigned get_reg0(struct bifrost_regs regs) +{ + if (regs.ctrl == 0) + return regs.reg0 | ((regs.reg1 & 0x1) << 5); + + return regs.reg0 <= regs.reg1 ? regs.reg0 : 63 - regs.reg0; +} + +static unsigned get_reg1(struct bifrost_regs regs) +{ + return regs.reg0 <= regs.reg1 ? regs.reg1 : 63 - regs.reg1; +} + +enum bifrost_reg_write_unit { + REG_WRITE_NONE = 0, // don't write + REG_WRITE_TWO, // write using reg2 + REG_WRITE_THREE, // write using reg3 +}; + +// this represents the decoded version of the ctrl register field. +struct bifrost_reg_ctrl{ + bool read_reg0; + bool read_reg1; + bool read_reg3; + enum bifrost_reg_write_unit fma_write_unit; + enum bifrost_reg_write_unit add_write_unit; + bool clause_start; +}; + +enum fma_src_type { + FMA_ONE_SRC, + FMA_TWO_SRC, + FMA_FADD, + FMA_FMINMAX, + FMA_FADD16, + FMA_FMINMAX16, + FMA_FCMP, + FMA_FCMP16, + FMA_THREE_SRC, + FMA_FMA, + FMA_FMA16, + FMA_FOUR_SRC, + FMA_FMA_MSCALE, + FMA_SHIFT_ADD64, +}; + +struct fma_op_info { + unsigned op; + char name[30]; + enum fma_src_type src_type; +}; + +enum add_src_type { + ADD_ONE_SRC, + ADD_TWO_SRC, + ADD_FADD, + ADD_FMINMAX, + ADD_FADD16, + ADD_FMINMAX16, + ADD_THREE_SRC, + ADD_FADDMscale, + ADD_FCMP, + ADD_FCMP16, + ADD_TEX_COMPACT, // texture instruction with embedded sampler + ADD_TEX, // texture instruction with sampler/etc. 
in uniform port + ADD_VARYING_INTERP, + ADD_BLENDING, + ADD_LOAD_ATTR, + ADD_VARYING_ADDRESS, + ADD_BRANCH, +}; + +struct add_op_info { + unsigned op; + char name[30]; + enum add_src_type src_type; + bool has_data_reg; +}; + +struct bifrost_tex_ctrl { + unsigned sampler_index : 4; // also used to signal indirects + unsigned tex_index : 7; + bool no_merge_index : 1; // whether to merge (direct) sampler & texture indices + bool filter : 1; // use the usual filtering pipeline (0 for texelFetch & textureGather) + unsigned unk0 : 2; + bool texel_offset : 1; // *Offset() + bool is_shadow : 1; + bool is_array : 1; + unsigned tex_type : 2; // 2D, 3D, Cube, Buffer + bool compute_lod : 1; // 0 for *Lod() + bool not_supply_lod : 1; // 0 for *Lod() or when a bias is applied + bool calc_gradients : 1; // 0 for *Grad() + unsigned unk1 : 1; + unsigned result_type : 4; // integer, unsigned, float TODO: why is this 4 bits? + unsigned unk2 : 4; +}; + +struct bifrost_dual_tex_ctrl { + unsigned sampler_index0 : 2; + unsigned unk0 : 2; + unsigned tex_index0 : 2; + unsigned sampler_index1 : 2; + unsigned tex_index1 : 2; + unsigned unk1 : 22; +}; + +enum branch_cond { + BR_COND_LT = 0, + BR_COND_LE = 1, + BR_COND_GE = 2, + BR_COND_GT = 3, + // Equal vs. not-equal determined by src0/src1 comparison + BR_COND_EQ = 4, + // floating-point comparisons + // Becomes UNE when you flip the arguments + BR_COND_OEQ = 5, + // TODO what happens when you flip the arguments? + BR_COND_OGT = 6, + BR_COND_OLT = 7, +}; + +enum branch_bit_size { + BR_SIZE_32 = 0, + BR_SIZE_16XX = 1, + BR_SIZE_16YY = 2, + // For the above combinations of bitsize and location, an extra bit is + // encoded via comparing the sources. The only possible source of ambiguity + // would be if the sources were the same, but then the branch condition + // would be always true or always false anyways, so we can ignore it. But + // this no longer works when comparing the y component to the x component, + // since it's valid to compare the y component of a source against its own + // x component. Instead, the extra bit is encoded via an extra bitsize. + BR_SIZE_16YX0 = 3, + BR_SIZE_16YX1 = 4, + BR_SIZE_32_AND_16X = 5, + BR_SIZE_32_AND_16Y = 6, + // Used for comparisons with zero and always-true, see below. I think this + // only works for integer comparisons. 
+ BR_SIZE_ZERO = 7, +}; + +enum branch_code { + BR_ALWAYS = 63, +}; + +void dump_header(struct bifrost_header header, bool verbose); +void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_regs, uint64_t *consts, + unsigned data_reg, unsigned offset, bool verbose); +bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose); + +void dump_header(struct bifrost_header header, bool verbose) { + if (header.clause_type != 0) { + printf("id(%du) ", header.scoreboard_index); + } + + if (header.scoreboard_deps != 0) { + printf("next-wait("); + bool first = true; + for (unsigned i = 0; i < 8; i++) { + if (header.scoreboard_deps & (1 << i)) { + if (!first) { + printf(", "); + } + printf("%d", i); + first = false; + } + } + printf(") "); + } + + if (header.datareg_writebarrier) + printf("data-reg-barrier "); + + if (!header.no_end_of_shader) + printf("eos "); + + if (!header.back_to_back) { + printf("nbb "); + if (header.branch_cond) + printf("branch-cond "); + else + printf("branch-uncond "); + } + + if (header.elide_writes) + printf("we "); + + if (header.suppress_inf) + printf("suppress-inf "); + if (header.suppress_nan) + printf("suppress-nan "); + + if (header.unk0) + printf("unk0 "); + if (header.unk1) + printf("unk1 "); + if (header.unk2) + printf("unk2 "); + if (header.unk3) + printf("unk3 "); + if (header.unk4) + printf("unk4 "); + + printf("\n"); + + if (verbose) { + printf("# clause type %d, next clause type %d\n", + header.clause_type, header.next_clause_type); + } +} + +static struct bifrost_reg_ctrl DecodeRegCtrl(struct bifrost_regs regs) +{ + struct bifrost_reg_ctrl decoded = {}; + unsigned ctrl; + if (regs.ctrl == 0) { + ctrl = regs.reg1 >> 2; + decoded.read_reg0 = !(regs.reg1 & 0x2); + decoded.read_reg1 = false; + } else { + ctrl = regs.ctrl; + decoded.read_reg0 = decoded.read_reg1 = true; + } + switch (ctrl) { + case 1: + decoded.fma_write_unit = REG_WRITE_TWO; + break; + case 3: + decoded.fma_write_unit = REG_WRITE_TWO; + decoded.read_reg3 = true; + break; + case 4: + decoded.read_reg3 = true; + break; + case 5: + decoded.add_write_unit = REG_WRITE_TWO; + break; + case 6: + decoded.add_write_unit = REG_WRITE_TWO; + decoded.read_reg3 = true; + break; + case 8: + decoded.clause_start = true; + break; + case 9: + decoded.fma_write_unit = REG_WRITE_TWO; + decoded.clause_start = true; + break; + case 11: + break; + case 12: + decoded.read_reg3 = true; + decoded.clause_start = true; + break; + case 13: + decoded.add_write_unit = REG_WRITE_TWO; + decoded.clause_start = true; + break; + case 15: + decoded.fma_write_unit = REG_WRITE_THREE; + decoded.add_write_unit = REG_WRITE_TWO; + break; + default: + printf("# unknown reg ctrl %d\n", ctrl); + } + + return decoded; +} + +// Pass in the add_write_unit or fma_write_unit, and this returns which register +// the ADD/FMA units are writing to +static unsigned GetRegToWrite(enum bifrost_reg_write_unit unit, struct bifrost_regs regs) +{ + switch (unit) { + case REG_WRITE_TWO: + return regs.reg2; + case REG_WRITE_THREE: + return regs.reg3; + default: /* REG_WRITE_NONE */ + assert(0); + return 0; + } +} + +static void dump_regs(struct bifrost_regs srcs) +{ + struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(srcs); + printf("# "); + if (ctrl.read_reg0) + printf("port 0: R%d ", get_reg0(srcs)); + if (ctrl.read_reg1) + printf("port 1: R%d ", get_reg1(srcs)); + + if (ctrl.fma_write_unit == REG_WRITE_TWO) + printf("port 2: R%d (write FMA) ", srcs.reg2); + else if (ctrl.add_write_unit == REG_WRITE_TWO) + 
printf("port 2: R%d (write ADD) ", srcs.reg2); + + if (ctrl.fma_write_unit == REG_WRITE_THREE) + printf("port 3: R%d (write FMA) ", srcs.reg3); + else if (ctrl.add_write_unit == REG_WRITE_THREE) + printf("port 3: R%d (write ADD) ", srcs.reg3); + else if (ctrl.read_reg3) + printf("port 3: R%d (read) ", srcs.reg3); + + if (srcs.uniform_const) { + if (srcs.uniform_const & 0x80) { + printf("uniform: U%d", (srcs.uniform_const & 0x7f) * 2); + } + } + + printf("\n"); +} +static void dump_const_imm(uint32_t imm) +{ + union { + float f; + uint32_t i; + } fi; + fi.i = imm; + printf("0x%08x /* %f */", imm, fi.f); +} + +static uint64_t get_const(uint64_t *consts, struct bifrost_regs srcs) +{ + unsigned low_bits = srcs.uniform_const & 0xf; + uint64_t imm; + switch (srcs.uniform_const >> 4) { + case 4: imm = consts[0]; break; + case 5: imm = consts[1]; break; + case 6: imm = consts[2]; break; + case 7: imm = consts[3]; break; + case 2: imm = consts[4]; break; + case 3: imm = consts[5]; break; + default: assert(0); break; + } + return imm | low_bits; +} + +static void dump_uniform_const_src(struct bifrost_regs srcs, uint64_t *consts, bool high32) +{ + if (srcs.uniform_const & 0x80) { + unsigned uniform = (srcs.uniform_const & 0x7f) * 2; + printf("U%d", uniform + (high32 ? 1 : 0)); + } else if (srcs.uniform_const >= 0x20) { + uint64_t imm = get_const(consts, srcs); + if (high32) + dump_const_imm(imm >> 32); + else + dump_const_imm(imm); + } else { + switch (srcs.uniform_const) { + case 0: printf("0"); break; + case 5: printf("atest-data"); break; + case 6: printf("sample-ptr"); break; + case 8: + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + printf("blend-descriptor%u", (unsigned) srcs.uniform_const - 8); + break; + default: + printf("unkConst%u", (unsigned) srcs.uniform_const); + break; + } + + if (high32) + printf(".y"); + else + printf(".x"); + } +} + +static void dump_src(unsigned src, struct bifrost_regs srcs, uint64_t *consts, bool isFMA) +{ + switch (src) { + case 0: printf("R%d", get_reg0(srcs)); break; + case 1: printf("R%d", get_reg1(srcs)); break; + case 2: printf("R%d", srcs.reg3); break; + case 3: + if (isFMA) + printf("0"); + else + printf("T"); // i.e. the output of FMA this cycle + break; + case 4: + dump_uniform_const_src(srcs, consts, false); + break; + case 5: + dump_uniform_const_src(srcs, consts, true); + break; + case 6: printf("T0"); break; + case 7: printf("T1"); break; + } +} + +static void dump_output_mod(unsigned mod) +{ + switch (mod) { + case 0: + break; + case 1: + printf(".clamp_0_inf"); break; // max(out, 0) + case 2: + printf(".clamp_m1_1"); break; // clamp(out, -1, 1) + case 3: + printf(".clamp_0_1"); break; // clamp(out, 0, 1) + default: + break; + } +} + +static void dump_minmax_mode(unsigned mod) +{ + switch (mod) { + case 0: + /* Same as fmax() and fmin() -- return the other number if any + * number is NaN. Also always return +0 if one argument is +0 and + * the other is -0. + */ + break; + case 1: + /* Instead of never returning a NaN, always return one. The + * "greater"/"lesser" NaN is always returned, first by checking the + * sign and then the mantissa bits. + */ + printf(".nan_wins"); break; + case 2: + /* For max, implement src0 > src1 ? src0 : src1 + * For min, implement src0 < src1 ? src0 : src1 + * + * This includes handling NaN's and signedness of 0 differently + * from above, since +0 and -0 compare equal and comparisons always + * return false for NaN's. As a result, this mode is *not* + * commutative. 
+ */ + printf(".src1_wins"); break; + case 3: + /* For max, implement src0 < src1 ? src1 : src0 + * For min, implement src0 > src1 ? src1 : src0 + */ + printf(".src0_wins"); break; + default: + break; + } +} + +static void dump_round_mode(unsigned mod) +{ + switch (mod) { + case 0: + /* roundTiesToEven, the IEEE default. */ + break; + case 1: + /* roundTowardPositive in the IEEE spec. */ + printf(".round_pos"); break; + case 2: + /* roundTowardNegative in the IEEE spec. */ + printf(".round_neg"); break; + case 3: + /* roundTowardZero in the IEEE spec. */ + printf(".round_zero"); break; + default: + break; + } +} + +static const struct fma_op_info FMAOpInfos[] = { + { 0x00000, "FMA.f32", FMA_FMA }, + { 0x40000, "MAX.f32", FMA_FMINMAX }, + { 0x44000, "MIN.f32", FMA_FMINMAX }, + { 0x48000, "FCMP.GL", FMA_FCMP }, + { 0x4c000, "FCMP.D3D", FMA_FCMP }, + { 0x4ff98, "ADD.i32", FMA_TWO_SRC }, + { 0x4ffd8, "SUB.i32", FMA_TWO_SRC }, + { 0x4fff0, "SUBB.i32", FMA_TWO_SRC }, + { 0x50000, "FMA_MSCALE", FMA_FMA_MSCALE }, + { 0x58000, "ADD.f32", FMA_FADD }, + { 0x5c000, "CSEL.FEQ.f32", FMA_FOUR_SRC }, + { 0x5c200, "CSEL.FGT.f32", FMA_FOUR_SRC }, + { 0x5c400, "CSEL.FGE.f32", FMA_FOUR_SRC }, + { 0x5c600, "CSEL.IEQ.f32", FMA_FOUR_SRC }, + { 0x5c800, "CSEL.IGT.i32", FMA_FOUR_SRC }, + { 0x5ca00, "CSEL.IGE.i32", FMA_FOUR_SRC }, + { 0x5cc00, "CSEL.UGT.i32", FMA_FOUR_SRC }, + { 0x5ce00, "CSEL.UGE.i32", FMA_FOUR_SRC }, + { 0x5d8d0, "ICMP.D3D.GT.v2i16", FMA_TWO_SRC }, + { 0x5d9d0, "UCMP.D3D.GT.v2i16", FMA_TWO_SRC }, + { 0x5dad0, "ICMP.D3D.GE.v2i16", FMA_TWO_SRC }, + { 0x5dbd0, "UCMP.D3D.GE.v2i16", FMA_TWO_SRC }, + { 0x5dcd0, "ICMP.D3D.EQ.v2i16", FMA_TWO_SRC }, + { 0x5de40, "ICMP.GL.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? 1 : 0 + { 0x5de48, "ICMP.GL.GE.i32", FMA_TWO_SRC }, + { 0x5de50, "UCMP.GL.GT.i32", FMA_TWO_SRC }, + { 0x5de58, "UCMP.GL.GE.i32", FMA_TWO_SRC }, + { 0x5de60, "ICMP.GL.EQ.i32", FMA_TWO_SRC }, + { 0x5dec0, "ICMP.D3D.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? 
~0 : 0 + { 0x5dec8, "ICMP.D3D.GE.i32", FMA_TWO_SRC }, + { 0x5ded0, "UCMP.D3D.GT.i32", FMA_TWO_SRC }, + { 0x5ded8, "UCMP.D3D.GE.i32", FMA_TWO_SRC }, + { 0x5dee0, "ICMP.D3D.EQ.i32", FMA_TWO_SRC }, + { 0x60200, "RSHIFT_NAND.i32", FMA_THREE_SRC }, + { 0x603c0, "RSHIFT_NAND.v2i16", FMA_THREE_SRC }, + { 0x60e00, "RSHIFT_OR.i32", FMA_THREE_SRC }, + { 0x60fc0, "RSHIFT_OR.v2i16", FMA_THREE_SRC }, + { 0x61200, "RSHIFT_AND.i32", FMA_THREE_SRC }, + { 0x613c0, "RSHIFT_AND.v2i16", FMA_THREE_SRC }, + { 0x61e00, "RSHIFT_NOR.i32", FMA_THREE_SRC }, // ~((src0 << src2) | src1) + { 0x61fc0, "RSHIFT_NOR.v2i16", FMA_THREE_SRC }, // ~((src0 << src2) | src1) + { 0x62200, "LSHIFT_NAND.i32", FMA_THREE_SRC }, + { 0x623c0, "LSHIFT_NAND.v2i16", FMA_THREE_SRC }, + { 0x62e00, "LSHIFT_OR.i32", FMA_THREE_SRC }, // (src0 << src2) | src1 + { 0x62fc0, "LSHIFT_OR.v2i16", FMA_THREE_SRC }, // (src0 << src2) | src1 + { 0x63200, "LSHIFT_AND.i32", FMA_THREE_SRC }, // (src0 << src2) & src1 + { 0x633c0, "LSHIFT_AND.v2i16", FMA_THREE_SRC }, + { 0x63e00, "LSHIFT_NOR.i32", FMA_THREE_SRC }, + { 0x63fc0, "LSHIFT_NOR.v2i16", FMA_THREE_SRC }, + { 0x64200, "RSHIFT_XOR.i32", FMA_THREE_SRC }, + { 0x643c0, "RSHIFT_XOR.v2i16", FMA_THREE_SRC }, + { 0x64600, "RSHIFT_XNOR.i32", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) + { 0x647c0, "RSHIFT_XNOR.v2i16", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) + { 0x64a00, "LSHIFT_XOR.i32", FMA_THREE_SRC }, + { 0x64bc0, "LSHIFT_XOR.v2i16", FMA_THREE_SRC }, + { 0x64e00, "LSHIFT_XNOR.i32", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) + { 0x64fc0, "LSHIFT_XNOR.v2i16", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) + { 0x65200, "LSHIFT_ADD.i32", FMA_THREE_SRC }, + { 0x65600, "LSHIFT_SUB.i32", FMA_THREE_SRC }, // (src0 << src2) - src1 + { 0x65a00, "LSHIFT_RSUB.i32", FMA_THREE_SRC }, // src1 - (src0 << src2) + { 0x65e00, "RSHIFT_ADD.i32", FMA_THREE_SRC }, + { 0x66200, "RSHIFT_SUB.i32", FMA_THREE_SRC }, + { 0x66600, "RSHIFT_RSUB.i32", FMA_THREE_SRC }, + { 0x66a00, "ARSHIFT_ADD.i32", FMA_THREE_SRC }, + { 0x66e00, "ARSHIFT_SUB.i32", FMA_THREE_SRC }, + { 0x67200, "ARSHIFT_RSUB.i32", FMA_THREE_SRC }, + { 0x80000, "FMA.v2f16", FMA_FMA16 }, + { 0xc0000, "MAX.v2f16", FMA_FMINMAX16 }, + { 0xc4000, "MIN.v2f16", FMA_FMINMAX16 }, + { 0xc8000, "FCMP.GL", FMA_FCMP16 }, + { 0xcc000, "FCMP.D3D", FMA_FCMP16 }, + { 0xcf900, "ADD.v2i16", FMA_TWO_SRC }, + { 0xcfc10, "ADDC.i32", FMA_TWO_SRC }, + { 0xcfd80, "ADD.i32.i16.X", FMA_TWO_SRC }, + { 0xcfd90, "ADD.i32.u16.X", FMA_TWO_SRC }, + { 0xcfdc0, "ADD.i32.i16.Y", FMA_TWO_SRC }, + { 0xcfdd0, "ADD.i32.u16.Y", FMA_TWO_SRC }, + { 0xd8000, "ADD.v2f16", FMA_FADD16 }, + { 0xdc000, "CSEL.FEQ.v2f16", FMA_FOUR_SRC }, + { 0xdc200, "CSEL.FGT.v2f16", FMA_FOUR_SRC }, + { 0xdc400, "CSEL.FGE.v2f16", FMA_FOUR_SRC }, + { 0xdc600, "CSEL.IEQ.v2f16", FMA_FOUR_SRC }, + { 0xdc800, "CSEL.IGT.v2i16", FMA_FOUR_SRC }, + { 0xdca00, "CSEL.IGE.v2i16", FMA_FOUR_SRC }, + { 0xdcc00, "CSEL.UGT.v2i16", FMA_FOUR_SRC }, + { 0xdce00, "CSEL.UGE.v2i16", FMA_FOUR_SRC }, + { 0xdd000, "F32_TO_F16", FMA_TWO_SRC }, + { 0xe0046, "F16_TO_I16.XX", FMA_ONE_SRC }, + { 0xe0047, "F16_TO_U16.XX", FMA_ONE_SRC }, + { 0xe004e, "F16_TO_I16.YX", FMA_ONE_SRC }, + { 0xe004f, "F16_TO_U16.YX", FMA_ONE_SRC }, + { 0xe0056, "F16_TO_I16.XY", FMA_ONE_SRC }, + { 0xe0057, "F16_TO_U16.XY", FMA_ONE_SRC }, + { 0xe005e, "F16_TO_I16.YY", FMA_ONE_SRC }, + { 0xe005f, "F16_TO_U16.YY", FMA_ONE_SRC }, + { 0xe00c0, "I16_TO_F16.XX", FMA_ONE_SRC }, + { 0xe00c1, "U16_TO_F16.XX", FMA_ONE_SRC }, + { 0xe00c8, "I16_TO_F16.YX", FMA_ONE_SRC }, + { 0xe00c9, "U16_TO_F16.YX", 
FMA_ONE_SRC }, + { 0xe00d0, "I16_TO_F16.XY", FMA_ONE_SRC }, + { 0xe00d1, "U16_TO_F16.XY", FMA_ONE_SRC }, + { 0xe00d8, "I16_TO_F16.YY", FMA_ONE_SRC }, + { 0xe00d9, "U16_TO_F16.YY", FMA_ONE_SRC }, + { 0xe0136, "F32_TO_I32", FMA_ONE_SRC }, + { 0xe0137, "F32_TO_U32", FMA_ONE_SRC }, + { 0xe0178, "I32_TO_F32", FMA_ONE_SRC }, + { 0xe0179, "U32_TO_F32", FMA_ONE_SRC }, + { 0xe0198, "I16_TO_I32.X", FMA_ONE_SRC }, + { 0xe0199, "U16_TO_U32.X", FMA_ONE_SRC }, + { 0xe019a, "I16_TO_I32.Y", FMA_ONE_SRC }, + { 0xe019b, "U16_TO_U32.Y", FMA_ONE_SRC }, + { 0xe019c, "I16_TO_F32.X", FMA_ONE_SRC }, + { 0xe019d, "U16_TO_F32.X", FMA_ONE_SRC }, + { 0xe019e, "I16_TO_F32.Y", FMA_ONE_SRC }, + { 0xe019f, "U16_TO_F32.Y", FMA_ONE_SRC }, + { 0xe01a2, "F16_TO_F32.X", FMA_ONE_SRC }, + { 0xe01a3, "F16_TO_F32.Y", FMA_ONE_SRC }, + { 0xe032c, "NOP", FMA_ONE_SRC }, + { 0xe032d, "MOV", FMA_ONE_SRC }, + { 0xe032f, "SWZ.YY.v2i16", FMA_ONE_SRC }, + // From the ARM patent US20160364209A1: + // "Decompose v (the input) into numbers x1 and s such that v = x1 * 2^s, + // and x1 is a floating point value in a predetermined range where the + // value 1 is within the range and not at one extremity of the range (e.g. + // choose a range where 1 is towards middle of range)." + // + // This computes x1. + { 0xe0345, "LOG_FREXPM", FMA_ONE_SRC }, + // Given a floating point number m * 2^e, returns m * 2^{-1}. This is + // exactly the same as the mantissa part of frexp(). + { 0xe0365, "FRCP_FREXPM", FMA_ONE_SRC }, + // Given a floating point number m * 2^e, returns m * 2^{-2} if e is even, + // and m * 2^{-1} if e is odd. In other words, scales by powers of 4 until + // within the range [0.25, 1). Used for square-root and reciprocal + // square-root. + { 0xe0375, "FSQRT_FREXPM", FMA_ONE_SRC }, + // Given a floating point number m * 2^e, computes -e - 1 as an integer. + // Zero and infinity/NaN return 0. + { 0xe038d, "FRCP_FREXPE", FMA_ONE_SRC }, + // Computes floor(e/2) + 1. + { 0xe03a5, "FSQRT_FREXPE", FMA_ONE_SRC }, + // Given a floating point number m * 2^e, computes -floor(e/2) - 1 as an + // integer. + { 0xe03ad, "FRSQ_FREXPE", FMA_ONE_SRC }, + { 0xe03c5, "LOG_FREXPE", FMA_ONE_SRC }, + { 0xe0b80, "IMAX3", FMA_THREE_SRC }, + { 0xe0bc0, "UMAX3", FMA_THREE_SRC }, + { 0xe0c00, "IMIN3", FMA_THREE_SRC }, + { 0xe0c40, "UMIN3", FMA_THREE_SRC }, + { 0xe0f40, "CSEL", FMA_THREE_SRC }, // src2 != 0 ? src1 : src0 + { 0xe0fc0, "MUX.i32", FMA_THREE_SRC }, // see ADD comment + { 0xe1845, "CEIL", FMA_ONE_SRC }, + { 0xe1885, "FLOOR", FMA_ONE_SRC }, + { 0xe19b0, "ATAN_LDEXP.Y.f32", FMA_TWO_SRC }, + { 0xe19b8, "ATAN_LDEXP.X.f32", FMA_TWO_SRC }, + // These instructions in the FMA slot, together with LSHIFT_ADD_HIGH32.i32 + // in the ADD slot, allow one to do a 64-bit addition with an extra small + // shift on one of the sources. There are three possible scenarios: + // + // 1) Full 64-bit addition. Do: + // out.x = LSHIFT_ADD_LOW32.i64 src1.x, src2.x, shift + // out.y = LSHIFT_ADD_HIGH32.i32 src1.y, src2.y + // + // The shift amount is applied to src2 before adding. The shift amount, and + // any extra bits from src2 plus the overflow bit, are sent directly from + // FMA to ADD instead of being passed explicitly. Hence, these two must be + // bundled together into the same instruction. + // + // 2) Add a 64-bit value src1 to a zero-extended 32-bit value src2. 
Do: + // out.x = LSHIFT_ADD_LOW32.u32 src1.x, src2, shift + // out.y = LSHIFT_ADD_HIGH32.i32 src1.x, 0 + // + // Note that in this case, the second argument to LSHIFT_ADD_HIGH32 is + // ignored, so it can actually be anything. As before, the shift is applied + // to src2 before adding. + // + // 3) Add a 64-bit value to a sign-extended 32-bit value src2. Do: + // out.x = LSHIFT_ADD_LOW32.i32 src1.x, src2, shift + // out.y = LSHIFT_ADD_HIGH32.i32 src1.x, 0 + // + // The only difference is the .i32 instead of .u32. Otherwise, this is + // exactly the same as before. + // + // In all these instructions, the shift amount is stored where the third + // source would be, so the shift has to be a small immediate from 0 to 7. + // This is fine for the expected use-case of these instructions, which is + // manipulating 64-bit pointers. + // + // These instructions can also be combined with various load/store + // instructions which normally take a 64-bit pointer in order to add a + // 32-bit or 64-bit offset to the pointer before doing the operation, + // optionally shifting the offset. The load/store op implicity does + // LSHIFT_ADD_HIGH32.i32 internally. Letting ptr be the pointer, and offset + // the desired offset, the cases go as follows: + // + // 1) Add a 64-bit offset: + // LSHIFT_ADD_LOW32.i64 ptr.x, offset.x, shift + // ld_st_op ptr.y, offset.y, ... + // + // Note that the output of LSHIFT_ADD_LOW32.i64 is not used, instead being + // implicitly sent to the load/store op to serve as the low 32 bits of the + // pointer. + // + // 2) Add a 32-bit unsigned offset: + // temp = LSHIFT_ADD_LOW32.u32 ptr.x, offset, shift + // ld_st_op temp, ptr.y, ... + // + // Now, the low 32 bits of offset << shift + ptr are passed explicitly to + // the ld_st_op, to match the case where there is no offset and ld_st_op is + // called directly. + // + // 3) Add a 32-bit signed offset: + // temp = LSHIFT_ADD_LOW32.i32 ptr.x, offset, shift + // ld_st_op temp, ptr.y, ... + // + // Again, the same as the unsigned case except for the offset. 
+ { 0xe1c80, "LSHIFT_ADD_LOW32.u32", FMA_SHIFT_ADD64 }, + { 0xe1cc0, "LSHIFT_ADD_LOW32.i64", FMA_SHIFT_ADD64 }, + { 0xe1d80, "LSHIFT_ADD_LOW32.i32", FMA_SHIFT_ADD64 }, + { 0xe1e00, "SEL.XX.i16", FMA_TWO_SRC }, + { 0xe1e08, "SEL.YX.i16", FMA_TWO_SRC }, + { 0xe1e10, "SEL.XY.i16", FMA_TWO_SRC }, + { 0xe1e18, "SEL.YY.i16", FMA_TWO_SRC }, + { 0xe7800, "IMAD", FMA_THREE_SRC }, + { 0xe78db, "POPCNT", FMA_ONE_SRC }, +}; + +static struct fma_op_info find_fma_op_info(unsigned op) +{ + for (unsigned i = 0; i < ARRAY_SIZE(FMAOpInfos); i++) { + unsigned opCmp = ~0; + switch (FMAOpInfos[i].src_type) { + case FMA_ONE_SRC: + opCmp = op; + break; + case FMA_TWO_SRC: + opCmp = op & ~0x7; + break; + case FMA_FCMP: + case FMA_FCMP16: + opCmp = op & ~0x1fff; + break; + case FMA_THREE_SRC: + case FMA_SHIFT_ADD64: + opCmp = op & ~0x3f; + break; + case FMA_FADD: + case FMA_FMINMAX: + case FMA_FADD16: + case FMA_FMINMAX16: + opCmp = op & ~0x3fff; + break; + case FMA_FMA: + case FMA_FMA16: + opCmp = op & ~0x3ffff; + break; + case FMA_FOUR_SRC: + opCmp = op & ~0x1ff; + break; + case FMA_FMA_MSCALE: + opCmp = op & ~0x7fff; + break; + default: + opCmp = ~0; + break; + } + if (FMAOpInfos[i].op == opCmp) + return FMAOpInfos[i]; + } + + struct fma_op_info info; + snprintf(info.name, sizeof(info.name), "op%04x", op); + info.op = op; + info.src_type = FMA_THREE_SRC; + return info; +} + +static void dump_fcmp(unsigned op) +{ + switch (op) { + case 0: + printf(".OEQ"); + break; + case 1: + printf(".OGT"); + break; + case 2: + printf(".OGE"); + break; + case 3: + printf(".UNE"); + break; + case 4: + printf(".OLT"); + break; + case 5: + printf(".OLE"); + break; + default: + printf(".unk%d", op); + break; + } +} + +static void dump_16swizzle(unsigned swiz) +{ + if (swiz == 2) + return; + printf(".%c%c", "xy"[swiz & 1], "xy"[(swiz >> 1) & 1]); +} + +static void dump_fma_expand_src0(unsigned ctrl) +{ + switch (ctrl) { + case 3: + case 4: + case 6: + printf(".x"); + break; + case 5: + case 7: + printf(".y"); + break; + case 0: + case 1: + case 2: + break; + default: + printf(".unk"); + break; + } +} + +static void dump_fma_expand_src1(unsigned ctrl) +{ + switch (ctrl) { + case 1: + case 3: + printf(".x"); + break; + case 2: + case 4: + case 5: + printf(".y"); + break; + case 0: + case 6: + case 7: + break; + default: + printf(".unk"); + break; + } +} + +static void dump_fma(uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, bool verbose) +{ + if (verbose) { + printf("# FMA: %016" PRIx64 "\n", word); + } + struct bifrost_fma_inst FMA; + memcpy((char *) &FMA, (char *) &word, sizeof(struct bifrost_fma_inst)); + struct fma_op_info info = find_fma_op_info(FMA.op); + + printf("%s", info.name); + if (info.src_type == FMA_FADD || + info.src_type == FMA_FMINMAX || + info.src_type == FMA_FMA || + info.src_type == FMA_FADD16 || + info.src_type == FMA_FMINMAX16 || + info.src_type == FMA_FMA16) { + dump_output_mod(bits(FMA.op, 12, 14)); + switch (info.src_type) { + case FMA_FADD: + case FMA_FMA: + case FMA_FADD16: + case FMA_FMA16: + dump_round_mode(bits(FMA.op, 10, 12)); + break; + case FMA_FMINMAX: + case FMA_FMINMAX16: + dump_minmax_mode(bits(FMA.op, 10, 12)); + break; + default: + assert(0); + } + } else if (info.src_type == FMA_FCMP || info.src_type == FMA_FCMP16) { + dump_fcmp(bits(FMA.op, 10, 13)); + if (info.src_type == FMA_FCMP) + printf(".f32"); + else + printf(".v2f16"); + } else if (info.src_type == FMA_FMA_MSCALE) { + if (FMA.op & (1 << 11)) { + switch ((FMA.op >> 9) & 0x3) { + case 0: + /* 
This mode seems to do a few things: + * - Makes 0 * infinity (and incidentally 0 * nan) return 0, + * since generating a nan would poison the result of + * 1/infinity and 1/0. + * - Fiddles with which nan is returned in nan * nan, + * presumably to make sure that the same exact nan is + * returned for 1/nan. + */ + printf(".rcp_mode"); + break; + case 3: + /* Similar to the above, but src0 always wins when multiplying + * 0 by infinity. + */ + printf(".sqrt_mode"); + break; + default: + printf(".unk%d_mode", (int) (FMA.op >> 9) & 0x3); + } + } else { + dump_output_mod(bits(FMA.op, 9, 11)); + } + } + + printf(" "); + + struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(next_regs); + if (next_ctrl.fma_write_unit != REG_WRITE_NONE) { + printf("{R%d, T0}, ", GetRegToWrite(next_ctrl.fma_write_unit, next_regs)); + } else { + printf("T0, "); + } + + switch (info.src_type) { + case FMA_ONE_SRC: + dump_src(FMA.src0, regs, consts, true); + break; + case FMA_TWO_SRC: + dump_src(FMA.src0, regs, consts, true); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + break; + case FMA_FADD: + case FMA_FMINMAX: + if (FMA.op & 0x10) + printf("-"); + if (FMA.op & 0x200) + printf("abs("); + dump_src(FMA.src0, regs, consts, true); + dump_fma_expand_src0((FMA.op >> 6) & 0x7); + if (FMA.op & 0x200) + printf(")"); + printf(", "); + if (FMA.op & 0x20) + printf("-"); + if (FMA.op & 0x8) + printf("abs("); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_fma_expand_src1((FMA.op >> 6) & 0x7); + if (FMA.op & 0x8) + printf(")"); + break; + case FMA_FADD16: + case FMA_FMINMAX16: { + bool abs1 = FMA.op & 0x8; + bool abs2 = (FMA.op & 0x7) < FMA.src0; + if (FMA.op & 0x10) + printf("-"); + if (abs1 || abs2) + printf("abs("); + dump_src(FMA.src0, regs, consts, true); + dump_16swizzle((FMA.op >> 6) & 0x3); + if (abs1 || abs2) + printf(")"); + printf(", "); + if (FMA.op & 0x20) + printf("-"); + if (abs1 && abs2) + printf("abs("); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_16swizzle((FMA.op >> 8) & 0x3); + if (abs1 && abs2) + printf(")"); + break; + } + case FMA_FCMP: + if (FMA.op & 0x200) + printf("abs("); + dump_src(FMA.src0, regs, consts, true); + dump_fma_expand_src0((FMA.op >> 6) & 0x7); + if (FMA.op & 0x200) + printf(")"); + printf(", "); + if (FMA.op & 0x20) + printf("-"); + if (FMA.op & 0x8) + printf("abs("); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_fma_expand_src1((FMA.op >> 6) & 0x7); + if (FMA.op & 0x8) + printf(")"); + break; + case FMA_FCMP16: + dump_src(FMA.src0, regs, consts, true); + // Note: this is kinda a guess, I haven't seen the blob set this to + // anything other than the identity, but it matches FMA_TWO_SRCFmod16 + dump_16swizzle((FMA.op >> 6) & 0x3); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_16swizzle((FMA.op >> 8) & 0x3); + break; + case FMA_SHIFT_ADD64: + dump_src(FMA.src0, regs, consts, true); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + printf(", "); + printf("shift:%u", (FMA.op >> 3) & 0x7); + break; + case FMA_THREE_SRC: + dump_src(FMA.src0, regs, consts, true); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + printf(", "); + dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + break; + case FMA_FMA: + if (FMA.op & (1 << 14)) + printf("-"); + if (FMA.op & (1 << 9)) + printf("abs("); + dump_src(FMA.src0, regs, consts, true); + dump_fma_expand_src0((FMA.op >> 6) & 0x7); + if (FMA.op & (1 << 9)) + printf(")"); + printf(", "); + if (FMA.op & (1 << 16)) + printf("abs("); + dump_src(FMA.op & 0x7, regs, consts, 
true); + dump_fma_expand_src1((FMA.op >> 6) & 0x7); + if (FMA.op & (1 << 16)) + printf(")"); + printf(", "); + if (FMA.op & (1 << 15)) + printf("-"); + if (FMA.op & (1 << 17)) + printf("abs("); + dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + if (FMA.op & (1 << 17)) + printf(")"); + break; + case FMA_FMA16: + if (FMA.op & (1 << 14)) + printf("-"); + dump_src(FMA.src0, regs, consts, true); + dump_16swizzle((FMA.op >> 6) & 0x3); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_16swizzle((FMA.op >> 8) & 0x3); + printf(", "); + if (FMA.op & (1 << 15)) + printf("-"); + dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + dump_16swizzle((FMA.op >> 16) & 0x3); + break; + case FMA_FOUR_SRC: + dump_src(FMA.src0, regs, consts, true); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + printf(", "); + dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + printf(", "); + dump_src((FMA.op >> 6) & 0x7, regs, consts, true); + break; + case FMA_FMA_MSCALE: + if (FMA.op & (1 << 12)) + printf("abs("); + dump_src(FMA.src0, regs, consts, true); + if (FMA.op & (1 << 12)) + printf(")"); + printf(", "); + if (FMA.op & (1 << 13)) + printf("-"); + dump_src(FMA.op & 0x7, regs, consts, true); + printf(", "); + if (FMA.op & (1 << 14)) + printf("-"); + dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + printf(", "); + dump_src((FMA.op >> 6) & 0x7, regs, consts, true); + break; + } + printf("\n"); +} + +static const struct add_op_info add_op_infos[] = { + { 0x00000, "MAX.f32", ADD_FMINMAX }, + { 0x02000, "MIN.f32", ADD_FMINMAX }, + { 0x04000, "ADD.f32", ADD_FADD }, + { 0x06000, "FCMP.GL", ADD_FCMP }, + { 0x07000, "FCMP.D3D", ADD_FCMP }, + { 0x07856, "F16_TO_I16", ADD_ONE_SRC }, + { 0x07857, "F16_TO_U16", ADD_ONE_SRC }, + { 0x078c0, "I16_TO_F16.XX", ADD_ONE_SRC }, + { 0x078c1, "U16_TO_F16.XX", ADD_ONE_SRC }, + { 0x078c8, "I16_TO_F16.YX", ADD_ONE_SRC }, + { 0x078c9, "U16_TO_F16.YX", ADD_ONE_SRC }, + { 0x078d0, "I16_TO_F16.XY", ADD_ONE_SRC }, + { 0x078d1, "U16_TO_F16.XY", ADD_ONE_SRC }, + { 0x078d8, "I16_TO_F16.YY", ADD_ONE_SRC }, + { 0x078d9, "U16_TO_F16.YY", ADD_ONE_SRC }, + { 0x07936, "F32_TO_I32", ADD_ONE_SRC }, + { 0x07937, "F32_TO_U32", ADD_ONE_SRC }, + { 0x07978, "I32_TO_F32", ADD_ONE_SRC }, + { 0x07979, "U32_TO_F32", ADD_ONE_SRC }, + { 0x07998, "I16_TO_I32.X", ADD_ONE_SRC }, + { 0x07999, "U16_TO_U32.X", ADD_ONE_SRC }, + { 0x0799a, "I16_TO_I32.Y", ADD_ONE_SRC }, + { 0x0799b, "U16_TO_U32.Y", ADD_ONE_SRC }, + { 0x0799c, "I16_TO_F32.X", ADD_ONE_SRC }, + { 0x0799d, "U16_TO_F32.X", ADD_ONE_SRC }, + { 0x0799e, "I16_TO_F32.Y", ADD_ONE_SRC }, + { 0x0799f, "U16_TO_F32.Y", ADD_ONE_SRC }, + // take the low 16 bits, and expand it to a 32-bit float + { 0x079a2, "F16_TO_F32.X", ADD_ONE_SRC }, + // take the high 16 bits, ... + { 0x079a3, "F16_TO_F32.Y", ADD_ONE_SRC }, + { 0x07b2b, "SWZ.YX.v2i16", ADD_ONE_SRC }, + { 0x07b2c, "NOP", ADD_ONE_SRC }, + { 0x07b29, "SWZ.XX.v2i16", ADD_ONE_SRC }, + // Logically, this should be SWZ.XY, but that's equivalent to a move, and + // this seems to be the canonical way the blob generates a MOV. + { 0x07b2d, "MOV", ADD_ONE_SRC }, + { 0x07b2f, "SWZ.YY.v2i16", ADD_ONE_SRC }, + // Given a floating point number m * 2^e, returns m ^ 2^{-1}. 
+ { 0x07b65, "FRCP_FREXPM", ADD_ONE_SRC }, + { 0x07b75, "FSQRT_FREXPM", ADD_ONE_SRC }, + { 0x07b8d, "FRCP_FREXPE", ADD_ONE_SRC }, + { 0x07ba5, "FSQRT_FREXPE", ADD_ONE_SRC }, + { 0x07bad, "FRSQ_FREXPE", ADD_ONE_SRC }, + // From the ARM patent US20160364209A1: + // "Decompose v (the input) into numbers x1 and s such that v = x1 * 2^s, + // and x1 is a floating point value in a predetermined range where the + // value 1 is within the range and not at one extremity of the range (e.g. + // choose a range where 1 is towards middle of range)." + // + // This computes s. + { 0x07bc5, "FLOG_FREXPE", ADD_ONE_SRC }, + { 0x07d45, "CEIL", ADD_ONE_SRC }, + { 0x07d85, "FLOOR", ADD_ONE_SRC }, + { 0x07f18, "LSHIFT_ADD_HIGH32.i32", ADD_TWO_SRC }, + { 0x08000, "LD_ATTR.f16", ADD_LOAD_ATTR, true }, + { 0x08100, "LD_ATTR.v2f16", ADD_LOAD_ATTR, true }, + { 0x08200, "LD_ATTR.v3f16", ADD_LOAD_ATTR, true }, + { 0x08300, "LD_ATTR.v4f16", ADD_LOAD_ATTR, true }, + { 0x08400, "LD_ATTR.f32", ADD_LOAD_ATTR, true }, + { 0x08500, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true }, + { 0x08600, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true }, + { 0x08700, "LD_ATTR.v4f32", ADD_LOAD_ATTR, true }, + { 0x08800, "LD_ATTR.i32", ADD_LOAD_ATTR, true }, + { 0x08900, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true }, + { 0x08a00, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true }, + { 0x08b00, "LD_ATTR.v4i32", ADD_LOAD_ATTR, true }, + { 0x08c00, "LD_ATTR.u32", ADD_LOAD_ATTR, true }, + { 0x08d00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true }, + { 0x08e00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true }, + { 0x08f00, "LD_ATTR.v4u32", ADD_LOAD_ATTR, true }, + { 0x0a000, "LD_VAR.32", ADD_VARYING_INTERP, true }, + { 0x0b000, "TEX", ADD_TEX_COMPACT, true }, + { 0x0c188, "LOAD.i32", ADD_TWO_SRC, true }, + { 0x0c1a0, "LD_UBO.i32", ADD_TWO_SRC, true }, + { 0x0c1b8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true }, + { 0x0c1c8, "LOAD.v2i32", ADD_TWO_SRC, true }, + { 0x0c1e0, "LD_UBO.v2i32", ADD_TWO_SRC, true }, + { 0x0c1f8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true }, + { 0x0c208, "LOAD.v4i32", ADD_TWO_SRC, true }, + // src0 = offset, src1 = binding + { 0x0c220, "LD_UBO.v4i32", ADD_TWO_SRC, true }, + { 0x0c238, "LD_SCRATCH.v4i32", ADD_TWO_SRC, true }, + { 0x0c248, "STORE.v4i32", ADD_TWO_SRC, true }, + { 0x0c278, "ST_SCRATCH.v4i32", ADD_TWO_SRC, true }, + { 0x0c588, "STORE.i32", ADD_TWO_SRC, true }, + { 0x0c5b8, "ST_SCRATCH.i32", ADD_TWO_SRC, true }, + { 0x0c5c8, "STORE.v2i32", ADD_TWO_SRC, true }, + { 0x0c5f8, "ST_SCRATCH.v2i32", ADD_TWO_SRC, true }, + { 0x0c648, "LOAD.u16", ADD_TWO_SRC, true }, // zero-extends + { 0x0ca88, "LOAD.v3i32", ADD_TWO_SRC, true }, + { 0x0caa0, "LD_UBO.v3i32", ADD_TWO_SRC, true }, + { 0x0cab8, "LD_SCRATCH.v3i32", ADD_TWO_SRC, true }, + { 0x0cb88, "STORE.v3i32", ADD_TWO_SRC, true }, + { 0x0cbb8, "ST_SCRATCH.v3i32", ADD_TWO_SRC, true }, + // *_FAST does not exist on G71 (added to G51, G72, and everything after) + { 0x0cc00, "FRCP_FAST.f32", ADD_ONE_SRC }, + { 0x0cc20, "FRSQ_FAST.f32", ADD_ONE_SRC }, + // Given a floating point number m * 2^e, produces a table-based + // approximation of 2/m using the top 17 bits. Includes special cases for + // infinity, NaN, and zero, and copies the sign bit. + { 0x0ce00, "FRCP_TABLE", ADD_ONE_SRC }, + // Exists on G71 + { 0x0ce10, "FRCP_FAST.f16.X", ADD_ONE_SRC }, + // A similar table for inverse square root, using the high 17 bits of the + // mantissa as well as the low bit of the exponent. 
+ { 0x0ce20, "FRSQ_TABLE", ADD_ONE_SRC }, + { 0x0ce30, "FRCP_FAST.f16.Y", ADD_ONE_SRC }, + { 0x0ce50, "FRSQ_FAST.f16.X", ADD_ONE_SRC }, + // Used in the argument reduction for log. Given a floating-point number + // m * 2^e, uses the top 4 bits of m to produce an approximation to 1/m + // with the exponent forced to 0 and only the top 5 bits are nonzero. 0, + // infinity, and NaN all return 1.0. + // See the ARM patent for more information. + { 0x0ce60, "FRCP_APPROX", ADD_ONE_SRC }, + { 0x0ce70, "FRSQ_FAST.f16.Y", ADD_ONE_SRC }, + { 0x0cf40, "ATAN_ASSIST", ADD_TWO_SRC }, + { 0x0cf48, "ATAN_TABLE", ADD_TWO_SRC }, + { 0x0cf50, "SIN_TABLE", ADD_ONE_SRC }, + { 0x0cf51, "COS_TABLE", ADD_ONE_SRC }, + { 0x0cf58, "EXP_TABLE", ADD_ONE_SRC }, + { 0x0cf60, "FLOG2_TABLE", ADD_ONE_SRC }, + { 0x0cf64, "FLOGE_TABLE", ADD_ONE_SRC }, + { 0x0d000, "BRANCH", ADD_BRANCH }, + // For each bit i, return src2[i] ? src0[i] : src1[i]. In other words, this + // is the same as (src2 & src0) | (~src2 & src1). + { 0x0e8c0, "MUX", ADD_THREE_SRC }, + { 0x0e9b0, "ATAN_LDEXP.Y.f32", ADD_TWO_SRC }, + { 0x0e9b8, "ATAN_LDEXP.X.f32", ADD_TWO_SRC }, + { 0x0ea60, "SEL.XX.i16", ADD_TWO_SRC }, + { 0x0ea70, "SEL.XY.i16", ADD_TWO_SRC }, + { 0x0ea68, "SEL.YX.i16", ADD_TWO_SRC }, + { 0x0ea78, "SEL.YY.i16", ADD_TWO_SRC }, + { 0x0ec00, "F32_TO_F16", ADD_TWO_SRC }, + { 0x0f640, "ICMP.GL.GT", ADD_TWO_SRC }, // src0 > src1 ? 1 : 0 + { 0x0f648, "ICMP.GL.GE", ADD_TWO_SRC }, + { 0x0f650, "UCMP.GL.GT", ADD_TWO_SRC }, + { 0x0f658, "UCMP.GL.GE", ADD_TWO_SRC }, + { 0x0f660, "ICMP.GL.EQ", ADD_TWO_SRC }, + { 0x0f6c0, "ICMP.D3D.GT", ADD_TWO_SRC }, // src0 > src1 ? ~0 : 0 + { 0x0f6c8, "ICMP.D3D.GE", ADD_TWO_SRC }, + { 0x0f6d0, "UCMP.D3D.GT", ADD_TWO_SRC }, + { 0x0f6d8, "UCMP.D3D.GE", ADD_TWO_SRC }, + { 0x0f6e0, "ICMP.D3D.EQ", ADD_TWO_SRC }, + { 0x10000, "MAX.v2f16", ADD_FMINMAX16 }, + { 0x11000, "ADD_MSCALE.f32", ADD_FADDMscale }, + { 0x12000, "MIN.v2f16", ADD_FMINMAX16 }, + { 0x14000, "ADD.v2f16", ADD_FADD16 }, + { 0x17000, "FCMP.D3D", ADD_FCMP16 }, + { 0x178c0, "ADD.i32", ADD_TWO_SRC }, + { 0x17900, "ADD.v2i16", ADD_TWO_SRC }, + { 0x17ac0, "SUB.i32", ADD_TWO_SRC }, + { 0x17c10, "ADDC.i32", ADD_TWO_SRC }, // adds src0 to the bottom bit of src1 + { 0x17d80, "ADD.i32.i16.X", ADD_TWO_SRC }, + { 0x17d90, "ADD.i32.u16.X", ADD_TWO_SRC }, + { 0x17dc0, "ADD.i32.i16.Y", ADD_TWO_SRC }, + { 0x17dd0, "ADD.i32.u16.Y", ADD_TWO_SRC }, + // Compute varying address and datatype (for storing in the vertex shader), + // and store the vec3 result in the data register. The result is passed as + // the 3 normal arguments to ST_VAR. + { 0x18000, "LD_VAR_ADDR.f16", ADD_VARYING_ADDRESS, true }, + { 0x18100, "LD_VAR_ADDR.f32", ADD_VARYING_ADDRESS, true }, + { 0x18200, "LD_VAR_ADDR.i32", ADD_VARYING_ADDRESS, true }, + { 0x18300, "LD_VAR_ADDR.u32", ADD_VARYING_ADDRESS, true }, + // Implements alpha-to-coverage, as well as possibly the late depth and + // stencil tests. The first source is the existing sample mask in R60 + // (possibly modified by gl_SampleMask), and the second source is the alpha + // value. The sample mask is written right away based on the + // alpha-to-coverage result using the normal register write mechanism, + // since that doesn't need to read from any memory, and then written again + // later based on the result of the stencil and depth tests using the + // special register. 
+ { 0x191e8, "ATEST.f32", ADD_TWO_SRC, true }, + { 0x191f0, "ATEST.X.f16", ADD_TWO_SRC, true }, + { 0x191f8, "ATEST.Y.f16", ADD_TWO_SRC, true }, + // store a varying given the address and datatype from LD_VAR_ADDR + { 0x19300, "ST_VAR.v1", ADD_THREE_SRC, true }, + { 0x19340, "ST_VAR.v2", ADD_THREE_SRC, true }, + { 0x19380, "ST_VAR.v3", ADD_THREE_SRC, true }, + { 0x193c0, "ST_VAR.v4", ADD_THREE_SRC, true }, + // This takes the sample coverage mask (computed by ATEST above) as a + // regular argument, in addition to the vec4 color in the special register. + { 0x1952c, "BLEND", ADD_BLENDING, true }, + { 0x1a000, "LD_VAR.16", ADD_VARYING_INTERP, true }, + { 0x1ae60, "TEX", ADD_TEX, true }, + { 0x1c000, "RSHIFT_NAND.i32", ADD_THREE_SRC }, + { 0x1c300, "RSHIFT_OR.i32", ADD_THREE_SRC }, + { 0x1c400, "RSHIFT_AND.i32", ADD_THREE_SRC }, + { 0x1c700, "RSHIFT_NOR.i32", ADD_THREE_SRC }, + { 0x1c800, "LSHIFT_NAND.i32", ADD_THREE_SRC }, + { 0x1cb00, "LSHIFT_OR.i32", ADD_THREE_SRC }, + { 0x1cc00, "LSHIFT_AND.i32", ADD_THREE_SRC }, + { 0x1cf00, "LSHIFT_NOR.i32", ADD_THREE_SRC }, + { 0x1d000, "RSHIFT_XOR.i32", ADD_THREE_SRC }, + { 0x1d100, "RSHIFT_XNOR.i32", ADD_THREE_SRC }, + { 0x1d200, "LSHIFT_XOR.i32", ADD_THREE_SRC }, + { 0x1d300, "LSHIFT_XNOR.i32", ADD_THREE_SRC }, + { 0x1d400, "LSHIFT_ADD.i32", ADD_THREE_SRC }, + { 0x1d500, "LSHIFT_SUB.i32", ADD_THREE_SRC }, + { 0x1d500, "LSHIFT_RSUB.i32", ADD_THREE_SRC }, + { 0x1d700, "RSHIFT_ADD.i32", ADD_THREE_SRC }, + { 0x1d800, "RSHIFT_SUB.i32", ADD_THREE_SRC }, + { 0x1d900, "RSHIFT_RSUB.i32", ADD_THREE_SRC }, + { 0x1da00, "ARSHIFT_ADD.i32", ADD_THREE_SRC }, + { 0x1db00, "ARSHIFT_SUB.i32", ADD_THREE_SRC }, + { 0x1dc00, "ARSHIFT_RSUB.i32", ADD_THREE_SRC }, + { 0x1dd18, "OR.i32", ADD_TWO_SRC }, + { 0x1dd20, "AND.i32", ADD_TWO_SRC }, + { 0x1dd60, "LSHIFT.i32", ADD_TWO_SRC }, + { 0x1dd50, "XOR.i32", ADD_TWO_SRC }, + { 0x1dd80, "RSHIFT.i32", ADD_TWO_SRC }, + { 0x1dda0, "ARSHIFT.i32", ADD_TWO_SRC }, +}; + +static struct add_op_info find_add_op_info(unsigned op) +{ + for (unsigned i = 0; i < ARRAY_SIZE(add_op_infos); i++) { + unsigned opCmp = ~0; + switch (add_op_infos[i].src_type) { + case ADD_ONE_SRC: + case ADD_BLENDING: + opCmp = op; + break; + case ADD_TWO_SRC: + opCmp = op & ~0x7; + break; + case ADD_THREE_SRC: + opCmp = op & ~0x3f; + break; + case ADD_TEX: + opCmp = op & ~0xf; + break; + case ADD_FADD: + case ADD_FMINMAX: + case ADD_FADD16: + opCmp = op & ~0x1fff; + break; + case ADD_FMINMAX16: + case ADD_FADDMscale: + opCmp = op & ~0xfff; + break; + case ADD_FCMP: + case ADD_FCMP16: + opCmp = op & ~0x7ff; + break; + case ADD_TEX_COMPACT: + opCmp = op & ~0x3ff; + break; + case ADD_VARYING_INTERP: + opCmp = op & ~0x7ff; + break; + case ADD_VARYING_ADDRESS: + opCmp = op & ~0xff; + break; + case ADD_LOAD_ATTR: + opCmp = op & ~0x7f; + break; + case ADD_BRANCH: + opCmp = op & ~0xfff; + break; + default: + opCmp = ~0; + break; + } + if (add_op_infos[i].op == opCmp) + return add_op_infos[i]; + } + + struct add_op_info info; + snprintf(info.name, sizeof(info.name), "op%04x", op); + info.op = op; + info.src_type = ADD_TWO_SRC; + info.has_data_reg = true; + return info; +} + +static void dump_add(uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, + unsigned data_reg, unsigned offset, bool verbose) +{ + if (verbose) { + printf("# ADD: %016" PRIx64 "\n", word); + } + struct bifrost_add_inst ADD; + memcpy((char *) &ADD, (char *) &word, sizeof(ADD)); + struct add_op_info info = find_add_op_info(ADD.op); + + printf("%s", info.name); + + // 
float16 seems like it doesn't support output modifiers + if (info.src_type == ADD_FADD || info.src_type == ADD_FMINMAX) { + // output modifiers + dump_output_mod(bits(ADD.op, 8, 10)); + if (info.src_type == ADD_FADD) + dump_round_mode(bits(ADD.op, 10, 12)); + else + dump_minmax_mode(bits(ADD.op, 10, 12)); + } else if (info.src_type == ADD_FCMP || info.src_type == ADD_FCMP16) { + dump_fcmp(bits(ADD.op, 3, 6)); + if (info.src_type == ADD_FCMP) + printf(".f32"); + else + printf(".v2f16"); + } else if (info.src_type == ADD_FADDMscale) { + switch ((ADD.op >> 6) & 0x7) { + case 0: break; + // causes GPU hangs on G71 + case 1: printf(".invalid"); break; + // Same as usual outmod value. + case 2: printf(".clamp_0_1"); break; + // If src0 is infinite or NaN, flush it to zero so that the other + // source is passed through unmodified. + case 3: printf(".flush_src0_inf_nan"); break; + // Vice versa. + case 4: printf(".flush_src1_inf_nan"); break; + // Every other case seems to behave the same as the above? + default: printf(".unk%d", (ADD.op >> 6) & 0x7); break; + } + } else if (info.src_type == ADD_VARYING_INTERP) { + if (ADD.op & 0x200) + printf(".reuse"); + if (ADD.op & 0x400) + printf(".flat"); + switch ((ADD.op >> 7) & 0x3) { + case 0: printf(".per_frag"); break; + case 1: printf(".centroid"); break; + case 2: break; + case 3: printf(".explicit"); break; + } + printf(".v%d", ((ADD.op >> 5) & 0x3) + 1); + } else if (info.src_type == ADD_BRANCH) { + enum branch_code branchCode = (enum branch_code) ((ADD.op >> 6) & 0x3f); + if (branchCode == BR_ALWAYS) { + // unconditional branch + } else { + enum branch_cond cond = (enum branch_cond) ((ADD.op >> 6) & 0x7); + enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7); + bool portSwapped = (ADD.op & 0x7) < ADD.src0; + // See the comment in branch_bit_size + if (size == BR_SIZE_16YX0) + portSwapped = true; + if (size == BR_SIZE_16YX1) + portSwapped = false; + // These sizes are only for floating point comparisons, so the + // non-floating-point comparisons are reused to encode the flipped + // versions. + if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y) + portSwapped = false; + // There's only one argument, so we reuse the extra argument to + // encode this. 
+ if (size == BR_SIZE_ZERO) + portSwapped = !(ADD.op & 1); + + switch (cond) { + case BR_COND_LT: + if (portSwapped) + printf(".LT.u"); + else + printf(".LT.i"); + break; + case BR_COND_LE: + if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y) { + printf(".UNE.f"); + } else { + if (portSwapped) + printf(".LE.u"); + else + printf(".LE.i"); + } + break; + case BR_COND_GT: + if (portSwapped) + printf(".GT.u"); + else + printf(".GT.i"); + break; + case BR_COND_GE: + if (portSwapped) + printf(".GE.u"); + else + printf(".GE.i"); + break; + case BR_COND_EQ: + if (portSwapped) + printf(".NE.i"); + else + printf(".EQ.i"); + break; + case BR_COND_OEQ: + if (portSwapped) + printf(".UNE.f"); + else + printf(".OEQ.f"); + break; + case BR_COND_OGT: + if (portSwapped) + printf(".OGT.unk.f"); + else + printf(".OGT.f"); + break; + case BR_COND_OLT: + if (portSwapped) + printf(".OLT.unk.f"); + else + printf(".OLT.f"); + break; + } + switch (size) { + case BR_SIZE_32: + case BR_SIZE_32_AND_16X: + case BR_SIZE_32_AND_16Y: + printf("32"); + break; + case BR_SIZE_16XX: + case BR_SIZE_16YY: + case BR_SIZE_16YX0: + case BR_SIZE_16YX1: + printf("16"); + break; + case BR_SIZE_ZERO: { + unsigned ctrl = (ADD.op >> 1) & 0x3; + if (ctrl == 0) + printf("32.Z"); + else + printf("16.Z"); + break; + } + } + } + } + printf(" "); + + struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(next_regs); + if (next_ctrl.add_write_unit != REG_WRITE_NONE) { + printf("{R%d, T1}, ", GetRegToWrite(next_ctrl.add_write_unit, next_regs)); + } else { + printf("T1, "); + } + + switch (info.src_type) { + case ADD_BLENDING: + // Note: in this case, regs.uniform_const == location | 0x8 + // This probably means we can't load uniforms or immediates in the + // same instruction. This re-uses the encoding that normally means + // "disabled", where the low 4 bits are ignored. Perhaps the extra + // 0x8 or'd in indicates this is happening. + printf("location:%d, ", regs.uniform_const & 0x7); + // fallthrough + case ADD_ONE_SRC: + dump_src(ADD.src0, regs, consts, false); + break; + case ADD_TEX: + case ADD_TEX_COMPACT: { + int tex_index; + int sampler_index; + bool dualTex = false; + if (info.src_type == ADD_TEX_COMPACT) { + tex_index = (ADD.op >> 3) & 0x7; + sampler_index = (ADD.op >> 7) & 0x7; + bool unknown = (ADD.op & 0x40); + // TODO: figure out if the unknown bit is ever 0 + if (!unknown) + printf("unknown "); + } else { + uint64_t constVal = get_const(consts, regs); + uint32_t controlBits = (ADD.op & 0x8) ? 
(constVal >> 32) : constVal; + struct bifrost_tex_ctrl ctrl; + memcpy((char *) &ctrl, (char *) &controlBits, sizeof(ctrl)); + + // TODO: figure out what actually triggers dual-tex + if (ctrl.result_type == 9) { + struct bifrost_dual_tex_ctrl dualCtrl; + memcpy((char *) &dualCtrl, (char *) &controlBits, sizeof(ctrl)); + printf("(dualtex) tex0:%d samp0:%d tex1:%d samp1:%d ", + dualCtrl.tex_index0, dualCtrl.sampler_index0, + dualCtrl.tex_index1, dualCtrl.sampler_index1); + if (dualCtrl.unk0 != 3) + printf("unk:%d ", dualCtrl.unk0); + dualTex = true; + } else { + if (ctrl.no_merge_index) { + tex_index = ctrl.tex_index; + sampler_index = ctrl.sampler_index; + } else { + tex_index = sampler_index = ctrl.tex_index; + unsigned unk = ctrl.sampler_index >> 2; + if (unk != 3) + printf("unk:%d ", unk); + if (ctrl.sampler_index & 1) + tex_index = -1; + if (ctrl.sampler_index & 2) + sampler_index = -1; + } + + if (ctrl.unk0 != 3) + printf("unk0:%d ", ctrl.unk0); + if (ctrl.unk1) + printf("unk1 "); + if (ctrl.unk2 != 0xf) + printf("unk2:%x ", ctrl.unk2); + + switch (ctrl.result_type) { + case 0x4: + printf("f32 "); break; + case 0xe: + printf("i32 "); break; + case 0xf: + printf("u32 "); break; + default: + printf("unktype(%x) ", ctrl.result_type); + } + + switch (ctrl.tex_type) { + case 0: + printf("cube "); break; + case 1: + printf("buffer "); break; + case 2: + printf("2D "); break; + case 3: + printf("3D "); break; + } + + if (ctrl.is_shadow) + printf("shadow "); + if (ctrl.is_array) + printf("array "); + + if (!ctrl.filter) { + if (ctrl.calc_gradients) { + int comp = (controlBits >> 20) & 0x3; + printf("txg comp:%d ", comp); + } else { + printf("txf "); + } + } else { + if (!ctrl.not_supply_lod) { + if (ctrl.compute_lod) + printf("lod_bias "); + else + printf("lod "); + } + + if (!ctrl.calc_gradients) + printf("grad "); + } + + if (ctrl.texel_offset) + printf("offset "); + } + } + + if (!dualTex) { + if (tex_index == -1) + printf("tex:indirect "); + else + printf("tex:%d ", tex_index); + + if (sampler_index == -1) + printf("samp:indirect "); + else + printf("samp:%d ", sampler_index); + } + break; + } + case ADD_VARYING_INTERP: { + unsigned addr = ADD.op & 0x1f; + if (addr < 0b10100) { + // direct addr + printf("%d", addr); + } else if (addr < 0b11000) { + if (addr == 22) + printf("fragw"); + else if (addr == 23) + printf("fragz"); + else + printf("unk%d", addr); + } else { + dump_src(ADD.op & 0x7, regs, consts, false); + } + printf(", "); + dump_src(ADD.src0, regs, consts, false); + break; + } + case ADD_VARYING_ADDRESS: { + dump_src(ADD.src0, regs, consts, false); + printf(", "); + dump_src(ADD.op & 0x7, regs, consts, false); + printf(", "); + unsigned location = (ADD.op >> 3) & 0x1f; + if (location < 16) { + printf("location:%d", location); + } else if (location == 20) { + printf("location:%u", (uint32_t) get_const(consts, regs)); + } else if (location == 21) { + printf("location:%u", (uint32_t) (get_const(consts, regs) >> 32)); + } else { + printf("location:%d(unk)", location); + } + break; + } + case ADD_LOAD_ATTR: + printf("location:%d, ", (ADD.op >> 3) & 0xf); + case ADD_TWO_SRC: + dump_src(ADD.src0, regs, consts, false); + printf(", "); + dump_src(ADD.op & 0x7, regs, consts, false); + break; + case ADD_THREE_SRC: + dump_src(ADD.src0, regs, consts, false); + printf(", "); + dump_src(ADD.op & 0x7, regs, consts, false); + printf(", "); + dump_src((ADD.op >> 3) & 0x7, regs, consts, false); + break; + case ADD_FADD: + case ADD_FMINMAX: + if (ADD.op & 0x10) + printf("-"); + if (ADD.op & 0x1000) + 
printf("abs("); + dump_src(ADD.src0, regs, consts, false); + switch ((ADD.op >> 6) & 0x3) { + case 3: + printf(".x"); + break; + default: + break; + } + if (ADD.op & 0x1000) + printf(")"); + printf(", "); + if (ADD.op & 0x20) + printf("-"); + if (ADD.op & 0x8) + printf("abs("); + dump_src(ADD.op & 0x7, regs, consts, false); + switch ((ADD.op >> 6) & 0x3) { + case 1: + case 3: + printf(".x"); + break; + case 2: + printf(".y"); + break; + case 0: + break; + default: + printf(".unk"); + break; + } + if (ADD.op & 0x8) + printf(")"); + break; + case ADD_FADD16: + if (ADD.op & 0x10) + printf("-"); + if (ADD.op & 0x1000) + printf("abs("); + dump_src(ADD.src0, regs, consts, false); + if (ADD.op & 0x1000) + printf(")"); + dump_16swizzle((ADD.op >> 6) & 0x3); + printf(", "); + if (ADD.op & 0x20) + printf("-"); + if (ADD.op & 0x8) + printf("abs("); + dump_src(ADD.op & 0x7, regs, consts, false); + dump_16swizzle((ADD.op >> 8) & 0x3); + if (ADD.op & 0x8) + printf(")"); + break; + case ADD_FMINMAX16: { + bool abs1 = ADD.op & 0x8; + bool abs2 = (ADD.op & 0x7) < ADD.src0; + if (ADD.op & 0x10) + printf("-"); + if (abs1 || abs2) + printf("abs("); + dump_src(ADD.src0, regs, consts, false); + dump_16swizzle((ADD.op >> 6) & 0x3); + if (abs1 || abs2) + printf(")"); + printf(", "); + if (ADD.op & 0x20) + printf("-"); + if (abs1 && abs2) + printf("abs("); + dump_src(ADD.op & 0x7, regs, consts, false); + dump_16swizzle((ADD.op >> 8) & 0x3); + if (abs1 && abs2) + printf(")"); + break; + } + case ADD_FADDMscale: { + if (ADD.op & 0x400) + printf("-"); + if (ADD.op & 0x200) + printf("abs("); + dump_src(ADD.src0, regs, consts, false); + if (ADD.op & 0x200) + printf(")"); + + printf(", "); + + if (ADD.op & 0x800) + printf("-"); + dump_src(ADD.op & 0x7, regs, consts, false); + + printf(", "); + + dump_src((ADD.op >> 3) & 0x7, regs, consts, false); + break; + } + case ADD_FCMP: + if (ADD.op & 0x400) { + printf("-"); + } + if (ADD.op & 0x100) { + printf("abs("); + } + dump_src(ADD.src0, regs, consts, false); + switch ((ADD.op >> 6) & 0x3) { + case 3: + printf(".x"); + break; + default: + break; + } + if (ADD.op & 0x100) { + printf(")"); + } + printf(", "); + if (ADD.op & 0x200) { + printf("abs("); + } + dump_src(ADD.op & 0x7, regs, consts, false); + switch ((ADD.op >> 6) & 0x3) { + case 1: + case 3: + printf(".x"); + break; + case 2: + printf(".y"); + break; + case 0: + break; + default: + printf(".unk"); + break; + } + if (ADD.op & 0x200) { + printf(")"); + } + break; + case ADD_FCMP16: + dump_src(ADD.src0, regs, consts, false); + dump_16swizzle((ADD.op >> 6) & 0x3); + printf(", "); + dump_src(ADD.op & 0x7, regs, consts, false); + dump_16swizzle((ADD.op >> 8) & 0x3); + break; + case ADD_BRANCH: { + enum branch_code code = (enum branch_code) ((ADD.op >> 6) & 0x3f); + enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7); + if (code != BR_ALWAYS) { + dump_src(ADD.src0, regs, consts, false); + switch (size) { + case BR_SIZE_16XX: + printf(".x"); + break; + case BR_SIZE_16YY: + case BR_SIZE_16YX0: + case BR_SIZE_16YX1: + printf(".y"); + break; + case BR_SIZE_ZERO: { + unsigned ctrl = (ADD.op >> 1) & 0x3; + switch (ctrl) { + case 1: + printf(".y"); + break; + case 2: + printf(".x"); + break; + default: + break; + } + } + default: + break; + } + printf(", "); + } + if (code != BR_ALWAYS && size != BR_SIZE_ZERO) { + dump_src(ADD.op & 0x7, regs, consts, false); + switch (size) { + case BR_SIZE_16XX: + case BR_SIZE_16YX0: + case BR_SIZE_16YX1: + case BR_SIZE_32_AND_16X: + printf(".x"); + break; + case 
BR_SIZE_16YY: + case BR_SIZE_32_AND_16Y: + printf(".y"); + break; + default: + break; + } + printf(", "); + } + // I haven't had the chance to test if this actually specifies the + // branch offset, since I couldn't get it to produce values other + // than 5 (uniform/const high), but these three bits are always + // consistent across branch instructions, so it makes sense... + int offsetSrc = (ADD.op >> 3) & 0x7; + if (offsetSrc == 4 || offsetSrc == 5) { + // If the offset is known/constant, we can decode it + uint32_t raw_offset; + if (offsetSrc == 4) + raw_offset = get_const(consts, regs); + else + raw_offset = get_const(consts, regs) >> 32; + // The high 4 bits are flags, while the rest is the + // twos-complement offset in bytes (here we convert to + // clauses). + int32_t branch_offset = ((int32_t) raw_offset << 4) >> 8; + + // If high4 is the high 4 bits of the last 64-bit constant, + // this is calculated as (high4 + 4) & 0xf, or 0 if the branch + // offset itself is the last constant. Not sure if this is + // actually used, or just garbage in unused bits, but in any + // case, we can just ignore it here since it's redundant. Note + // that if there is any padding, this will be 4 since the + // padding counts as the last constant. + unsigned flags = raw_offset >> 28; + (void) flags; + + // Note: the offset is in bytes, relative to the beginning of the + // current clause, so a zero offset would be a loop back to the + // same clause (annoyingly different from Midgard). + printf("clause_%d", offset + branch_offset); + } else { + dump_src(offsetSrc, regs, consts, false); + } + } + } + if (info.has_data_reg) { + printf(", R%d", data_reg); + } + printf("\n"); +} + +void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_regs, uint64_t *consts, + unsigned data_reg, unsigned offset, bool verbose) +{ + struct bifrost_regs regs; + memcpy((char *) ®s, (char *) &instr->reg_bits, sizeof(regs)); + + if (verbose) { + printf("# regs: %016" PRIx64 "\n", instr->reg_bits); + dump_regs(regs); + } + dump_fma(instr->fma_bits, regs, next_regs, consts, verbose); + dump_add(instr->add_bits, regs, next_regs, consts, data_reg, offset, verbose); +} + +bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose) { + // State for a decoded clause + struct bifrost_alu_inst instrs[8] = {}; + uint64_t consts[6] = {}; + unsigned num_instrs = 0; + unsigned num_consts = 0; + uint64_t header_bits = 0; + bool stopbit = false; + + unsigned i; + for (i = 0; ; i++, words += 4) { + if (verbose) { + printf("# "); + for (int j = 0; j < 4; j++) + printf("%08x ", words[3 - j]); // low bit on the right + printf("\n"); + } + unsigned tag = bits(words[0], 0, 8); + + // speculatively decode some things that are common between many formats, so we can share some code + struct bifrost_alu_inst main_instr = {}; + // 20 bits + main_instr.add_bits = bits(words[2], 2, 32 - 13); + // 23 bits + main_instr.fma_bits = bits(words[1], 11, 32) | bits(words[2], 0, 2) << (32 - 11); + // 35 bits + main_instr.reg_bits = ((uint64_t) bits(words[1], 0, 11)) << 24 | (uint64_t) bits(words[0], 8, 32); + + uint64_t const0 = bits(words[0], 8, 32) << 4 | (uint64_t) words[1] << 28 | bits(words[2], 0, 4) << 60; + uint64_t const1 = bits(words[2], 4, 32) << 4 | (uint64_t) words[3] << 32; + + bool stop = tag & 0x40; + + if (verbose) { + printf("# tag: 0x%02x\n", tag); + } + if (tag & 0x80) { + unsigned idx = stop ? 
5 : 2; + main_instr.add_bits |= ((tag >> 3) & 0x7) << 17; + instrs[idx + 1] = main_instr; + instrs[idx].add_bits = bits(words[3], 0, 17) | ((tag & 0x7) << 17); + instrs[idx].fma_bits |= bits(words[2], 19, 32) << 10; + consts[0] = bits(words[3], 17, 32) << 4; + } else { + bool done = false; + switch ((tag >> 3) & 0x7) { + case 0x0: + switch (tag & 0x7) { + case 0x3: + main_instr.add_bits |= bits(words[3], 29, 32) << 17; + instrs[1] = main_instr; + num_instrs = 2; + done = stop; + break; + case 0x4: + instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; + instrs[2].fma_bits |= bits(words[2], 19, 32) << 10; + consts[0] = const0; + num_instrs = 3; + num_consts = 1; + done = stop; + break; + case 0x1: + case 0x5: + instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; + instrs[2].fma_bits |= bits(words[2], 19, 32) << 10; + main_instr.add_bits |= bits(words[3], 26, 29) << 17; + instrs[3] = main_instr; + if ((tag & 0x7) == 0x5) { + num_instrs = 4; + done = stop; + } + break; + case 0x6: + instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; + instrs[5].fma_bits |= bits(words[2], 19, 32) << 10; + consts[0] = const0; + num_instrs = 6; + num_consts = 1; + done = stop; + break; + case 0x7: + instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; + instrs[5].fma_bits |= bits(words[2], 19, 32) << 10; + main_instr.add_bits |= bits(words[3], 26, 29) << 17; + instrs[6] = main_instr; + num_instrs = 7; + done = stop; + break; + default: + printf("unknown tag bits 0x%02x\n", tag); + } + break; + case 0x2: + case 0x3: { + unsigned idx = ((tag >> 3) & 0x7) == 2 ? 4 : 7; + main_instr.add_bits |= (tag & 0x7) << 17; + instrs[idx] = main_instr; + consts[0] |= (bits(words[2], 19, 32) | ((uint64_t) words[3] << 13)) << 19; + num_consts = 1; + num_instrs = idx + 1; + done = stop; + break; + } + case 0x4: { + unsigned idx = stop ? 4 : 1; + main_instr.add_bits |= (tag & 0x7) << 17; + instrs[idx] = main_instr; + instrs[idx + 1].fma_bits |= bits(words[3], 22, 32); + instrs[idx + 1].reg_bits = bits(words[2], 19, 32) | (bits(words[3], 0, 22) << (32 - 19)); + break; + } + case 0x1: + // only constants can come after this + num_instrs = 1; + done = stop; + case 0x5: + header_bits = bits(words[2], 19, 32) | ((uint64_t) words[3] << (32 - 19)); + main_instr.add_bits |= (tag & 0x7) << 17; + instrs[0] = main_instr; + break; + case 0x6: + case 0x7: { + unsigned pos = tag & 0xf; + // note that `pos' encodes both the total number of + // instructions and the position in the constant stream, + // presumably because decoded constants and instructions + // share a buffer in the decoder, but we only care about + // the position in the constant stream; the total number of + // instructions is redundant. 
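// As a worked example of the mapping below: a tag with pos = 0x5
// selects const_idx = 2, so this quadword contributes consts[2] and
// consts[3] (from const0/const1) and bumps num_consts to at least 4.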
+ unsigned const_idx = 7; + switch (pos) { + case 0: + case 1: + case 2: + case 6: + const_idx = 0; + break; + case 3: + case 4: + case 7: + case 9: + const_idx = 1; + break; + case 5: + case 0xa: + const_idx = 2; + break; + case 8: + case 0xb: + case 0xc: + const_idx = 3; + break; + case 0xd: + const_idx = 4; + break; + default: + printf("# unknown pos 0x%x\n", pos); + } + if (num_consts < const_idx + 2) + num_consts = const_idx + 2; + consts[const_idx] = const0; + consts[const_idx + 1] = const1; + done = stop; + break; + } + default: + break; + } + + if (done) + break; + } + } + + *size = i + 1; + + if (verbose) { + printf("# header: %012" PRIx64 "\n", header_bits); + } + + struct bifrost_header header; + memcpy((char *) &header, (char *) &header_bits, sizeof(struct bifrost_header)); + dump_header(header, verbose); + if (!header.no_end_of_shader) + stopbit = true; + + printf("{\n"); + for (i = 0; i < num_instrs; i++) { + struct bifrost_regs next_regs; + if (i + 1 == num_instrs) { + memcpy((char *) &next_regs, (char *) &instrs[0].reg_bits, + sizeof(next_regs)); + } else { + memcpy((char *) &next_regs, (char *) &instrs[i + 1].reg_bits, + sizeof(next_regs)); + } + + dump_instr(&instrs[i], next_regs, consts, header.datareg, offset, verbose); + } + printf("}\n"); + + if (verbose) { + for (unsigned i = 0; i < num_consts; i++) { + printf("# const%d: %08" PRIx64 "\n", 2 * i, consts[i] & 0xffffffff); + printf("# const%d: %08" PRIx64 "\n", 2 * i + 1, consts[i] >> 32); + } + } + return stopbit; +} + +void disassemble_bifrost(uint8_t *code, size_t size, bool verbose) +{ + uint32_t *words = (uint32_t *) code; + uint32_t *words_end = words + (size / 4); + // used for displaying branch targets + unsigned offset = 0; + while (words != words_end) + { + // we don't know what the program-end bit is quite yet, so for now just + // assume that an all-0 quadword is padding + uint32_t zero[4] = {}; + if (memcmp(words, zero, 4 * sizeof(uint32_t)) == 0) + break; + printf("clause_%d:\n", offset); + unsigned size; + if (dump_clause(words, &size, offset, verbose) == true) { + break; + } + words += size * 4; + offset += size; + } +} + diff --git a/src/panfrost/bifrost/disassemble.h b/src/panfrost/bifrost/disassemble.h new file mode 100644 index 00000000000..7c22d43ac77 --- /dev/null +++ b/src/panfrost/bifrost/disassemble.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2019 Connor Abbott <[email protected]> + * Copyright (C) 2019 Lyude Paul <[email protected]> + * Copyright (C) 2019 Ryan Houdek <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdbool.h> +#include <stddef.h> +void disassemble_bifrost(uint8_t *code, size_t size, bool verbose); diff --git a/src/panfrost/bifrost/meson.build b/src/panfrost/bifrost/meson.build new file mode 100644 index 00000000000..1258cd04caf --- /dev/null +++ b/src/panfrost/bifrost/meson.build @@ -0,0 +1,33 @@ +# Copyright © 2018 Rob Clark +# Copyright © 2019 Collabora + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +libpanfrost_bifrost_files = files( + 'disassemble.c', +) + +libpanfrost_bifrost = static_library( + 'panfrost_bifrost', + [libpanfrost_bifrost_files], + include_directories : [inc_common], + c_args : [c_vis_args, no_override_init_args], + cpp_args : [cpp_vis_args], + build_by_default : false, +) diff --git a/src/panfrost/include/panfrost-job.h b/src/panfrost/include/panfrost-job.h new file mode 100644 index 00000000000..0c559309946 --- /dev/null +++ b/src/panfrost/include/panfrost-job.h @@ -0,0 +1,1602 @@ +/* + * © Copyright 2017-2018 Alyssa Rosenzweig + * © Copyright 2017-2018 Connor Abbott + * © Copyright 2017-2018 Lyude Paul + * © Copyright2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef __PANFROST_JOB_H__ +#define __PANFROST_JOB_H__ + +#include <stdint.h> +#include <panfrost-misc.h> + +#define MALI_SHORT_PTR_BITS (sizeof(uintptr_t)*8) + +#define MALI_FBD_HIERARCHY_WEIGHTS 8 + +#define MALI_PAYLOAD_SIZE 256 + +typedef u32 mali_jd_core_req; + +enum mali_job_type { + JOB_NOT_STARTED = 0, + JOB_TYPE_NULL = 1, + JOB_TYPE_SET_VALUE = 2, + JOB_TYPE_CACHE_FLUSH = 3, + JOB_TYPE_COMPUTE = 4, + JOB_TYPE_VERTEX = 5, + JOB_TYPE_GEOMETRY = 6, + JOB_TYPE_TILER = 7, + JOB_TYPE_FUSED = 8, + JOB_TYPE_FRAGMENT = 9, +}; + +enum mali_draw_mode { + MALI_DRAW_NONE = 0x0, + MALI_POINTS = 0x1, + MALI_LINES = 0x2, + MALI_LINE_STRIP = 0x4, + MALI_LINE_LOOP = 0x6, + MALI_TRIANGLES = 0x8, + MALI_TRIANGLE_STRIP = 0xA, + MALI_TRIANGLE_FAN = 0xC, + MALI_POLYGON = 0xD, + MALI_QUADS = 0xE, + MALI_QUAD_STRIP = 0xF, + + /* All other modes invalid */ +}; + +/* Applies to tiler_gl_enables */ + + +#define MALI_OCCLUSION_QUERY (1 << 3) +#define MALI_OCCLUSION_PRECISE (1 << 4) + +/* Set for a glFrontFace(GL_CCW) in a Y=0=TOP coordinate system (like Gallium). + * In OpenGL, this would corresponds to glFrontFace(GL_CW). Mesa and the blob + * disagree about how to do viewport flipping, so the blob actually sets this + * for GL_CW but then has a negative viewport stride */ +#define MALI_FRONT_CCW_TOP (1 << 5) + +#define MALI_CULL_FACE_FRONT (1 << 6) +#define MALI_CULL_FACE_BACK (1 << 7) + +/* TODO: Might this actually be a finer bitfield? */ +#define MALI_DEPTH_STENCIL_ENABLE 0x6400 + +#define DS_ENABLE(field) \ + (field == MALI_DEPTH_STENCIL_ENABLE) \ + ? "MALI_DEPTH_STENCIL_ENABLE" \ + : (field == 0) ? "0" \ + : "0 /* XXX: Unknown, check hexdump */" + +/* Used in stencil and depth tests */ + +enum mali_func { + MALI_FUNC_NEVER = 0, + MALI_FUNC_LESS = 1, + MALI_FUNC_EQUAL = 2, + MALI_FUNC_LEQUAL = 3, + MALI_FUNC_GREATER = 4, + MALI_FUNC_NOTEQUAL = 5, + MALI_FUNC_GEQUAL = 6, + MALI_FUNC_ALWAYS = 7 +}; + +/* Same OpenGL, but mixed up. Why? Because forget me, that's why! */ + +enum mali_alt_func { + MALI_ALT_FUNC_NEVER = 0, + MALI_ALT_FUNC_GREATER = 1, + MALI_ALT_FUNC_EQUAL = 2, + MALI_ALT_FUNC_GEQUAL = 3, + MALI_ALT_FUNC_LESS = 4, + MALI_ALT_FUNC_NOTEQUAL = 5, + MALI_ALT_FUNC_LEQUAL = 6, + MALI_ALT_FUNC_ALWAYS = 7 +}; + +/* Flags apply to unknown2_3? */ + +#define MALI_HAS_MSAA (1 << 0) +#define MALI_CAN_DISCARD (1 << 5) + +/* Applies on SFBD systems, specifying that programmable blending is in use */ +#define MALI_HAS_BLEND_SHADER (1 << 6) + +/* func is mali_func */ +#define MALI_DEPTH_FUNC(func) (func << 8) +#define MALI_GET_DEPTH_FUNC(flags) ((flags >> 8) & 0x7) +#define MALI_DEPTH_FUNC_MASK MALI_DEPTH_FUNC(0x7) + +#define MALI_DEPTH_TEST (1 << 11) + +/* Next flags to unknown2_4 */ +#define MALI_STENCIL_TEST (1 << 0) + +/* What?! */ +#define MALI_SAMPLE_ALPHA_TO_COVERAGE_NO_BLEND_SHADER (1 << 1) + +#define MALI_NO_DITHER (1 << 9) +#define MALI_DEPTH_RANGE_A (1 << 12) +#define MALI_DEPTH_RANGE_B (1 << 13) +#define MALI_NO_MSAA (1 << 14) + +/* Stencil test state is all encoded in a single u32, just with a lot of + * enums... 
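 *
 * As a hedged example of how a GL state vector presumably maps onto the
 * mali_stencil_test struct below: glStencilFunc(GL_EQUAL, 0x10, 0xFF)
 * combined with glStencilOp(GL_KEEP, GL_KEEP, GL_REPLACE) would give
 * ref = 0x10, mask = 0xFF, func = MALI_FUNC_EQUAL,
 * sfail = MALI_STENCIL_KEEP, dpfail = MALI_STENCIL_KEEP,
 * dppass = MALI_STENCIL_REPLACE.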
*/ + +enum mali_stencil_op { + MALI_STENCIL_KEEP = 0, + MALI_STENCIL_REPLACE = 1, + MALI_STENCIL_ZERO = 2, + MALI_STENCIL_INVERT = 3, + MALI_STENCIL_INCR_WRAP = 4, + MALI_STENCIL_DECR_WRAP = 5, + MALI_STENCIL_INCR = 6, + MALI_STENCIL_DECR = 7 +}; + +struct mali_stencil_test { + unsigned ref : 8; + unsigned mask : 8; + enum mali_func func : 3; + enum mali_stencil_op sfail : 3; + enum mali_stencil_op dpfail : 3; + enum mali_stencil_op dppass : 3; + unsigned zero : 4; +} __attribute__((packed)); + +#define MALI_MASK_R (1 << 0) +#define MALI_MASK_G (1 << 1) +#define MALI_MASK_B (1 << 2) +#define MALI_MASK_A (1 << 3) + +enum mali_nondominant_mode { + MALI_BLEND_NON_MIRROR = 0, + MALI_BLEND_NON_ZERO = 1 +}; + +enum mali_dominant_blend { + MALI_BLEND_DOM_SOURCE = 0, + MALI_BLEND_DOM_DESTINATION = 1 +}; + +enum mali_dominant_factor { + MALI_DOMINANT_UNK0 = 0, + MALI_DOMINANT_ZERO = 1, + MALI_DOMINANT_SRC_COLOR = 2, + MALI_DOMINANT_DST_COLOR = 3, + MALI_DOMINANT_UNK4 = 4, + MALI_DOMINANT_SRC_ALPHA = 5, + MALI_DOMINANT_DST_ALPHA = 6, + MALI_DOMINANT_CONSTANT = 7, +}; + +enum mali_blend_modifier { + MALI_BLEND_MOD_UNK0 = 0, + MALI_BLEND_MOD_NORMAL = 1, + MALI_BLEND_MOD_SOURCE_ONE = 2, + MALI_BLEND_MOD_DEST_ONE = 3, +}; + +struct mali_blend_mode { + enum mali_blend_modifier clip_modifier : 2; + unsigned unused_0 : 1; + unsigned negate_source : 1; + + enum mali_dominant_blend dominant : 1; + + enum mali_nondominant_mode nondominant_mode : 1; + + unsigned unused_1 : 1; + + unsigned negate_dest : 1; + + enum mali_dominant_factor dominant_factor : 3; + unsigned complement_dominant : 1; +} __attribute__((packed)); + +struct mali_blend_equation { + /* Of type mali_blend_mode */ + unsigned rgb_mode : 12; + unsigned alpha_mode : 12; + + unsigned zero1 : 4; + + /* Corresponds to MALI_MASK_* above and glColorMask arguments */ + + unsigned color_mask : 4; +} __attribute__((packed)); + +/* Used with channel swizzling */ +enum mali_channel { + MALI_CHANNEL_RED = 0, + MALI_CHANNEL_GREEN = 1, + MALI_CHANNEL_BLUE = 2, + MALI_CHANNEL_ALPHA = 3, + MALI_CHANNEL_ZERO = 4, + MALI_CHANNEL_ONE = 5, + MALI_CHANNEL_RESERVED_0 = 6, + MALI_CHANNEL_RESERVED_1 = 7, +}; + +struct mali_channel_swizzle { + enum mali_channel r : 3; + enum mali_channel g : 3; + enum mali_channel b : 3; + enum mali_channel a : 3; +} __attribute__((packed)); + +/* Compressed per-pixel formats. Each of these formats expands to one to four + * floating-point or integer numbers, as defined by the OpenGL specification. + * There are various places in OpenGL where the user can specify a compressed + * format in memory, which all use the same 8-bit enum in the various + * descriptors, although different hardware units support different formats. + */ + +/* The top 3 bits specify how the bits of each component are interpreted. */ + +/* e.g. R11F_G11F_B10F */ +#define MALI_FORMAT_SPECIAL (2 << 5) + +/* signed normalized, e.g. RGBA8_SNORM */ +#define MALI_FORMAT_SNORM (3 << 5) + +/* e.g. RGBA8UI */ +#define MALI_FORMAT_UINT (4 << 5) + +/* e.g. RGBA8 and RGBA32F */ +#define MALI_FORMAT_UNORM (5 << 5) + +/* e.g. RGBA8I and RGBA16F */ +#define MALI_FORMAT_SINT (6 << 5) + +/* These formats seem to largely duplicate the others. They're used at least + * for Bifrost framebuffer output. + */ +#define MALI_FORMAT_SPECIAL2 (7 << 5) + +/* If the high 3 bits are 3 to 6 these two bits say how many components + * there are. 
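 *
 * For example, RGBA8UI is encoded below as
 * MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
 * i.e. unsigned integer, four channels, 8 bits per channel.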
+ */ +#define MALI_NR_CHANNELS(n) ((n - 1) << 3) + +/* If the high 3 bits are 3 to 6, then the low 3 bits say how big each + * component is, except the special MALI_CHANNEL_FLOAT which overrides what the + * bits mean. + */ + +#define MALI_CHANNEL_4 2 + +#define MALI_CHANNEL_8 3 + +#define MALI_CHANNEL_16 4 + +#define MALI_CHANNEL_32 5 + +/* For MALI_FORMAT_SINT it means a half-float (e.g. RG16F). For + * MALI_FORMAT_UNORM, it means a 32-bit float. + */ +#define MALI_CHANNEL_FLOAT 7 + +enum mali_format { + MALI_RGB565 = MALI_FORMAT_SPECIAL | 0x0, + MALI_RGB5_A1_UNORM = MALI_FORMAT_SPECIAL | 0x2, + MALI_RGB10_A2_UNORM = MALI_FORMAT_SPECIAL | 0x3, + MALI_RGB10_A2_SNORM = MALI_FORMAT_SPECIAL | 0x5, + MALI_RGB10_A2UI = MALI_FORMAT_SPECIAL | 0x7, + MALI_RGB10_A2I = MALI_FORMAT_SPECIAL | 0x9, + + /* YUV formats */ + MALI_NV12 = MALI_FORMAT_SPECIAL | 0xc, + + MALI_Z32_UNORM = MALI_FORMAT_SPECIAL | 0xD, + MALI_R32_FIXED = MALI_FORMAT_SPECIAL | 0x11, + MALI_RG32_FIXED = MALI_FORMAT_SPECIAL | 0x12, + MALI_RGB32_FIXED = MALI_FORMAT_SPECIAL | 0x13, + MALI_RGBA32_FIXED = MALI_FORMAT_SPECIAL | 0x14, + MALI_R11F_G11F_B10F = MALI_FORMAT_SPECIAL | 0x19, + MALI_R9F_G9F_B9F_E5F = MALI_FORMAT_SPECIAL | 0x1b, + /* Only used for varyings, to indicate the transformed gl_Position */ + MALI_VARYING_POS = MALI_FORMAT_SPECIAL | 0x1e, + /* Only used for varyings, to indicate that the write should be + * discarded. + */ + MALI_VARYING_DISCARD = MALI_FORMAT_SPECIAL | 0x1f, + + MALI_R8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8, + MALI_R16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16, + MALI_R32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32, + MALI_RG8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8, + MALI_RG16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16, + MALI_RG32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32, + MALI_RGB8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8, + MALI_RGB16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16, + MALI_RGB32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32, + MALI_RGBA8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8, + MALI_RGBA16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16, + MALI_RGBA32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32, + + MALI_R8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8, + MALI_R16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16, + MALI_R32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32, + MALI_RG8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8, + MALI_RG16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16, + MALI_RG32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32, + MALI_RGB8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8, + MALI_RGB16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16, + MALI_RGB32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32, + MALI_RGBA8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8, + MALI_RGBA16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16, + MALI_RGBA32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32, + + MALI_R8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8, + MALI_R16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16, + MALI_R32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32, + MALI_R32F = 
MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_FLOAT, + MALI_RG8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8, + MALI_RG16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16, + MALI_RG32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32, + MALI_RG32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_FLOAT, + MALI_RGB8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8, + MALI_RGB16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16, + MALI_RGB32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32, + MALI_RGB32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_FLOAT, + MALI_RGBA4_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_4, + MALI_RGBA8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8, + MALI_RGBA16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16, + MALI_RGBA32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32, + MALI_RGBA32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_FLOAT, + + MALI_R8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8, + MALI_R16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16, + MALI_R32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32, + MALI_R16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_FLOAT, + MALI_RG8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8, + MALI_RG16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16, + MALI_RG32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32, + MALI_RG16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_FLOAT, + MALI_RGB8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8, + MALI_RGB16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16, + MALI_RGB32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32, + MALI_RGB16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_FLOAT, + MALI_RGBA8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8, + MALI_RGBA16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16, + MALI_RGBA32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32, + MALI_RGBA16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_FLOAT, + + MALI_RGBA4 = MALI_FORMAT_SPECIAL2 | 0x8, + MALI_RGBA8_2 = MALI_FORMAT_SPECIAL2 | 0xd, + MALI_RGB10_A2_2 = MALI_FORMAT_SPECIAL2 | 0xe, +}; + + +/* Alpha coverage is encoded as 4-bits (from a clampf), with inversion + * literally performing a bitwise invert. This function produces slightly wrong + * results and I'm not sure why; some rounding issue I suppose... */ + +#define MALI_ALPHA_COVERAGE(clampf) ((uint16_t) (int) (clampf * 15.0f)) +#define MALI_GET_ALPHA_COVERAGE(nibble) ((float) nibble / 15.0f) + +/* Applies to midgard1.flags */ + +/* Should the hardware perform early-Z testing? Normally should be set + * for performance reasons. Clear if you use: discard, + * alpha-to-coverage... * It's also possible this disables + * forward-pixel kill; we're not quite sure which bit is which yet. + * TODO: How does this interact with blending?*/ + +#define MALI_EARLY_Z (1 << 6) + +/* Should the hardware calculate derivatives (via helper invocations)? Set in a + * fragment shader that uses texturing or derivative functions */ + +#define MALI_HELPER_INVOCATIONS (1 << 7) + +/* Flags denoting the fragment shader's use of tilebuffer readback. If the + * shader might read any part of the tilebuffer, set MALI_READS_TILEBUFFER. 
If + * it might read depth/stencil in particular, also set MALI_READS_ZS */ + +#define MALI_READS_ZS (1 << 8) +#define MALI_READS_TILEBUFFER (1 << 12) + +/* The raw Midgard blend payload can either be an equation or a shader + * address, depending on the context */ + +union midgard_blend { + mali_ptr shader; + + struct { + struct mali_blend_equation equation; + float constant; + }; +}; + +/* On MRT Midgard systems (using an MFBD), each render target gets its own + * blend descriptor */ + +#define MALI_BLEND_SRGB (0x400) + +struct midgard_blend_rt { + /* Flags base value of 0x200 to enable the render target. + * OR with 0x1 for blending (anything other than REPLACE). + * OR with 0x2 for programmable blending with 0-2 registers + * OR with 0x3 for programmable blending with 2+ registers + * OR with MALI_BLEND_SRGB for implicit sRGB + */ + + u64 flags; + union midgard_blend blend; +} __attribute__((packed)); + +/* On Bifrost systems (all MRT), each render target gets one of these + * descriptors */ + +struct bifrost_blend_rt { + /* This is likely an analogue of the flags on + * midgard_blend_rt */ + + u16 flags; // = 0x200 + + /* Single-channel blend constants are encoded in a sort of + * fixed-point. Basically, the float is mapped to a byte, becoming + * a high byte, and then the lower-byte is added for precision. + * For the original float f: + * + * f = (constant_hi / 255) + (constant_lo / 65535) + * + * constant_hi = int(f / 255) + * constant_lo = 65535*f - (65535/255) * constant_hi + */ + + u16 constant; + + struct mali_blend_equation equation; + /* + * - 0x19 normally + * - 0x3 when this slot is unused (everything else is 0 except the index) + * - 0x11 when this is the fourth slot (and it's used) ++ * - 0 when there is a blend shader + */ + u16 unk2; + /* increments from 0 to 3 */ + u16 index; + + union { + struct { + /* So far, I've only seen: + * - R001 for 1-component formats + * - RG01 for 2-component formats + * - RGB1 for 3-component formats + * - RGBA for 4-component formats + */ + u32 swizzle : 12; + enum mali_format format : 8; + + /* Type of the shader output variable. Note, this can + * be different from the format. + * + * 0: f16 (mediump float) + * 1: f32 (highp float) + * 2: i32 (highp int) + * 3: u32 (highp uint) + * 4: i16 (mediump int) + * 5: u16 (mediump uint) + */ + u32 shader_type : 3; + u32 zero : 9; + }; + + /* Only the low 32 bits of the blend shader are stored, the + * high 32 bits are implicitly the same as the original shader. + * According to the kernel driver, the program counter for + * shaders is actually only 24 bits, so shaders cannot cross + * the 2^24-byte boundary, and neither can the blend shader. + * The blob handles this by allocating a 2^24 byte pool for + * shaders, and making sure that any blend shaders are stored + * in the same pool as the original shader. The kernel will + * make sure this allocation is aligned to 2^24 bytes. + */ + u32 shader; + }; +} __attribute__((packed)); + +/* Descriptor for the shader. Following this is at least one, up to four blend + * descriptors for each active render target */ + +struct mali_shader_meta { + mali_ptr shader; + u16 texture_count; + u16 sampler_count; + u16 attribute_count; + u16 varying_count; + + union { + struct { + u32 uniform_buffer_count : 4; + u32 unk1 : 28; // = 0x800000 for vertex, 0x958020 for tiler + } bifrost1; + struct { + unsigned uniform_buffer_count : 4; + unsigned flags : 12; + + /* Whole number of uniform registers used, times two; + * whole number of work registers used (no scale). 
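 * As a hedged example applying the rule above: a shader occupying 4
 * uniform registers and 8 work registers would presumably be encoded
 * as uniform_count = 8 and work_count = 8.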
+ */ + unsigned work_count : 5; + unsigned uniform_count : 5; + unsigned unknown2 : 6; + } midgard1; + }; + + /* On bifrost: Exactly the same as glPolygonOffset() for both. + * On midgard: Depth factor is exactly as passed to glPolygonOffset. + * Depth units is equal to the value passed to glDeptOhffset + 1.0f + * (use MALI_NEGATIVE) + */ + float depth_units; + float depth_factor; + + u32 unknown2_2; + + u16 alpha_coverage; + u16 unknown2_3; + + u8 stencil_mask_front; + u8 stencil_mask_back; + u16 unknown2_4; + + struct mali_stencil_test stencil_front; + struct mali_stencil_test stencil_back; + + union { + struct { + u32 unk3 : 7; + /* On Bifrost, some system values are preloaded in + * registers R55-R62 by the thread dispatcher prior to + * the start of shader execution. This is a bitfield + * with one entry for each register saying which + * registers need to be preloaded. Right now, the known + * values are: + * + * Vertex/compute: + * - R55 : gl_LocalInvocationID.xy + * - R56 : gl_LocalInvocationID.z + unknown in high 16 bits + * - R57 : gl_WorkGroupID.x + * - R58 : gl_WorkGroupID.y + * - R59 : gl_WorkGroupID.z + * - R60 : gl_GlobalInvocationID.x + * - R61 : gl_GlobalInvocationID.y/gl_VertexID (without base) + * - R62 : gl_GlobalInvocationID.z/gl_InstanceID (without base) + * + * Fragment: + * - R55 : unknown, never seen (but the bit for this is + * always set?) + * - R56 : unknown (bit always unset) + * - R57 : gl_PrimitiveID + * - R58 : gl_FrontFacing in low bit, potentially other stuff + * - R59 : u16 fragment coordinates (used to compute + * gl_FragCoord.xy, together with sample positions) + * - R60 : gl_SampleMask (used in epilog, so pretty + * much always used, but the bit is always 0 -- is + * this just always pushed?) + * - R61 : gl_SampleMaskIn and gl_SampleID, used by + * varying interpolation. + * - R62 : unknown (bit always unset). + */ + u32 preload_regs : 8; + /* In units of 8 bytes or 64 bits, since the + * uniform/const port loads 64 bits at a time. + */ + u32 uniform_count : 7; + u32 unk4 : 10; // = 2 + } bifrost2; + struct { + u32 unknown2_7; + } midgard2; + }; + + /* zero on bifrost */ + u32 unknown2_8; + + /* Blending information for the older non-MRT Midgard HW. Check for + * MALI_HAS_BLEND_SHADER to decide how to interpret. + */ + + union midgard_blend blend; +} __attribute__((packed)); + +/* This only concerns hardware jobs */ + +/* Possible values for job_descriptor_size */ + +#define MALI_JOB_32 0 +#define MALI_JOB_64 1 + +struct mali_job_descriptor_header { + u32 exception_status; + u32 first_incomplete_task; + u64 fault_pointer; + u8 job_descriptor_size : 1; + enum mali_job_type job_type : 7; + u8 job_barrier : 1; + u8 unknown_flags : 7; + u16 job_index; + u16 job_dependency_index_1; + u16 job_dependency_index_2; + + union { + u64 next_job_64; + u32 next_job_32; + }; +} __attribute__((packed)); + +struct mali_payload_set_value { + u64 out; + u64 unknown; +} __attribute__((packed)); + +/* Special attributes have a fixed index */ +#define MALI_SPECIAL_ATTRIBUTE_BASE 16 +#define MALI_VERTEX_ID (MALI_SPECIAL_ATTRIBUTE_BASE + 0) +#define MALI_INSTANCE_ID (MALI_SPECIAL_ATTRIBUTE_BASE + 1) + +/* + * Mali Attributes + * + * This structure lets the attribute unit compute the address of an attribute + * given the vertex and instance ID. Unfortunately, the way this works is + * rather complicated when instancing is enabled. + * + * To explain this, first we need to explain how compute and vertex threads are + * dispatched. This is a guess (although a pretty firm guess!) 
since the + * details are mostly hidden from the driver, except for attribute instancing. + * When a quad is dispatched, it receives a single, linear index. However, we + * need to translate that index into a (vertex id, instance id) pair, or a + * (local id x, local id y, local id z) triple for compute shaders (although + * vertex shaders and compute shaders are handled almost identically). + * Focusing on vertex shaders, one option would be to do: + * + * vertex_id = linear_id % num_vertices + * instance_id = linear_id / num_vertices + * + * but this involves a costly division and modulus by an arbitrary number. + * Instead, we could pad num_vertices. We dispatch padded_num_vertices * + * num_instances threads instead of num_vertices * num_instances, which results + * in some "extra" threads with vertex_id >= num_vertices, which we have to + * discard. The more we pad num_vertices, the more "wasted" threads we + * dispatch, but the division is potentially easier. + * + * One straightforward choice is to pad num_vertices to the next power of two, + * which means that the division and modulus are just simple bit shifts and + * masking. But the actual algorithm is a bit more complicated. The thread + * dispatcher has special support for dividing by 3, 5, 7, and 9, in addition + * to dividing by a power of two. This is possibly using the technique + * described in patent US20170010862A1. As a result, padded_num_vertices can be + * 1, 3, 5, 7, or 9 times a power of two. This results in less wasted threads, + * since we need less padding. + * + * padded_num_vertices is picked by the hardware. The driver just specifies the + * actual number of vertices. At least for Mali G71, the first few cases are + * given by: + * + * num_vertices | padded_num_vertices + * 3 | 4 + * 4-7 | 8 + * 8-11 | 12 (3 * 4) + * 12-15 | 16 + * 16-19 | 20 (5 * 4) + * + * Note that padded_num_vertices is a multiple of four (presumably because + * threads are dispatched in groups of 4). Also, padded_num_vertices is always + * at least one more than num_vertices, which seems like a quirk of the + * hardware. For larger num_vertices, the hardware uses the following + * algorithm: using the binary representation of num_vertices, we look at the + * most significant set bit as well as the following 3 bits. Let n be the + * number of bits after those 4 bits. Then we set padded_num_vertices according + * to the following table: + * + * high bits | padded_num_vertices + * 1000 | 9 * 2^n + * 1001 | 5 * 2^(n+1) + * 101x | 3 * 2^(n+2) + * 110x | 7 * 2^(n+1) + * 111x | 2^(n+4) + * + * For example, if num_vertices = 70 is passed to glDraw(), its binary + * representation is 1000110, so n = 3 and the high bits are 1000, and + * therefore padded_num_vertices = 9 * 2^3 = 72. + * + * The attribute unit works in terms of the original linear_id. if + * num_instances = 1, then they are the same, and everything is simple. + * However, with instancing things get more complicated. There are four + * possible modes, two of them we can group together: + * + * 1. Use the linear_id directly. Only used when there is no instancing. + * + * 2. Use the linear_id modulo a constant. This is used for per-vertex + * attributes with instancing enabled by making the constant equal + * padded_num_vertices. Because the modulus is always padded_num_vertices, this + * mode only supports a modulus that is a power of 2 times 1, 3, 5, 7, or 9. + * The shift field specifies the power of two, while the extra_flags field + * specifies the odd number. 
If shift = n and extra_flags = m, then the modulus + * is (2m + 1) * 2^n. As an example, if num_vertices = 70, then as computed + * above, padded_num_vertices = 9 * 2^3, so we should set extra_flags = 4 and + * shift = 3. Note that we must exactly follow the hardware algorithm used to + * get padded_num_vertices in order to correctly implement per-vertex + * attributes. + * + * 3. Divide the linear_id by a constant. In order to correctly implement + * instance divisors, we have to divide linear_id by padded_num_vertices times + * to user-specified divisor. So first we compute padded_num_vertices, again + * following the exact same algorithm that the hardware uses, then multiply it + * by the GL-level divisor to get the hardware-level divisor. This case is + * further divided into two more cases. If the hardware-level divisor is a + * power of two, then we just need to shift. The shift amount is specified by + * the shift field, so that the hardware-level divisor is just 2^shift. + * + * If it isn't a power of two, then we have to divide by an arbitrary integer. + * For that, we use the well-known technique of multiplying by an approximation + * of the inverse. The driver must compute the magic multiplier and shift + * amount, and then the hardware does the multiplication and shift. The + * hardware and driver also use the "round-down" optimization as described in + * http://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf. + * The hardware further assumes the multiplier is between 2^31 and 2^32, so the + * high bit is implicitly set to 1 even though it is set to 0 by the driver -- + * presumably this simplifies the hardware multiplier a little. The hardware + * first multiplies linear_id by the multiplier and takes the high 32 bits, + * then applies the round-down correction if extra_flags = 1, then finally + * shifts right by the shift field. + * + * There are some differences between ridiculousfish's algorithm and the Mali + * hardware algorithm, which means that the reference code from ridiculousfish + * doesn't always produce the right constants. Mali does not use the pre-shift + * optimization, since that would make a hardware implementation slower (it + * would have to always do the pre-shift, multiply, and post-shift operations). + * It also forces the multplier to be at least 2^31, which means that the + * exponent is entirely fixed, so there is no trial-and-error. Altogether, + * given the divisor d, the algorithm the driver must follow is: + * + * 1. Set shift = floor(log2(d)). + * 2. Compute m = ceil(2^(shift + 32) / d) and e = 2^(shift + 32) % d. + * 3. If e <= 2^shift, then we need to use the round-down algorithm. Set + * magic_divisor = m - 1 and extra_flags = 1. + * 4. Otherwise, set magic_divisor = m and extra_flags = 0. + */ + +enum mali_attr_mode { + MALI_ATTR_UNUSED = 0, + MALI_ATTR_LINEAR = 1, + MALI_ATTR_POT_DIVIDE = 2, + MALI_ATTR_MODULO = 3, + MALI_ATTR_NPOT_DIVIDE = 4, +}; + +/* This magic "pseudo-address" is used as `elements` to implement + * gl_PointCoord. When read from a fragment shader, it generates a point + * coordinate per the OpenGL ES 2.0 specification. Flipped coordinate spaces + * require an affine transformation in the shader. */ + +#define MALI_VARYING_POINT_COORD (0x60) + +union mali_attr { + /* This is used for actual attributes. 
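 * The bottom three bits of `elements' select a mali_attr_mode; for
 * MALI_ATTR_NPOT_DIVIDE, the shift and extra_flags fields below, together
 * with magic_divisor in the follow-on entry, encode the result of the
 * four-step algorithm above. A minimal sketch of that computation, using
 * illustrative names rather than anything taken from the driver:
 *
 *    unsigned shift = 31 - __builtin_clz(d);  // floor(log2(d)), step 1
 *    uint64_t t = 1ull << (shift + 32);
 *    uint64_t m = (t + d - 1) / d;            // ceil(2^(shift + 32) / d)
 *    uint64_t e = t % d;                      // step 2
 *    if (e <= (1ull << shift)) {
 *            magic_divisor = m - 1;           // step 3: round-down path
 *            extra_flags = 1;
 *    } else {
 *            magic_divisor = m;               // step 4
 *            extra_flags = 0;
 *    }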
*/ + struct { + /* The bottom 3 bits are the mode */ + mali_ptr elements : 64 - 8; + u32 shift : 5; + u32 extra_flags : 3; + u32 stride; + u32 size; + }; + /* The entry after an NPOT_DIVIDE entry has this format. It stores + * extra information that wouldn't fit in a normal entry. + */ + struct { + u32 unk; /* = 0x20 */ + u32 magic_divisor; + u32 zero; + /* This is the original, GL-level divisor. */ + u32 divisor; + }; +} __attribute__((packed)); + +struct mali_attr_meta { + /* Vertex buffer index */ + u8 index; + + unsigned unknown1 : 2; + unsigned swizzle : 12; + enum mali_format format : 8; + + /* Always observed to be zero at the moment */ + unsigned unknown3 : 2; + + /* When packing multiple attributes in a buffer, offset addresses by + * this value. Obscurely, this is signed. */ + int32_t src_offset; +} __attribute__((packed)); + +enum mali_fbd_type { + MALI_SFBD = 0, + MALI_MFBD = 1, +}; + +#define FBD_TYPE (1) +#define FBD_MASK (~0x3f) + +struct mali_uniform_buffer_meta { + /* This is actually the size minus 1 (MALI_POSITIVE), in units of 16 + * bytes. This gives a maximum of 2^14 bytes, which just so happens to + * be the GL minimum-maximum for GL_MAX_UNIFORM_BLOCK_SIZE. + */ + u64 size : 10; + + /* This is missing the bottom 2 bits and top 8 bits. The top 8 bits + * should be 0 for userspace pointers, according to + * https://lwn.net/Articles/718895/. By reusing these bits, we can make + * each entry in the table only 64 bits. + */ + mali_ptr ptr : 64 - 10; +}; + +/* On Bifrost, these fields are the same between the vertex and tiler payloads. + * They also seem to be the same between Bifrost and Midgard. They're shared in + * fused payloads. + */ + +/* Applies to unknown_draw */ + +#define MALI_DRAW_INDEXED_UINT8 (0x10) +#define MALI_DRAW_INDEXED_UINT16 (0x20) +#define MALI_DRAW_INDEXED_UINT32 (0x30) +#define MALI_DRAW_VARYING_SIZE (0x100) +#define MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX (0x10000) + +struct mali_vertex_tiler_prefix { + /* This is a dynamic bitfield containing the following things in this order: + * + * - gl_WorkGroupSize.x + * - gl_WorkGroupSize.y + * - gl_WorkGroupSize.z + * - gl_NumWorkGroups.x + * - gl_NumWorkGroups.y + * - gl_NumWorkGroups.z + * + * The number of bits allocated for each number is based on the *_shift + * fields below. For example, workgroups_y_shift gives the bit that + * gl_NumWorkGroups.y starts at, and workgroups_z_shift gives the bit + * that gl_NumWorkGroups.z starts at (and therefore one after the bit + * that gl_NumWorkGroups.y ends at). The actual value for each gl_* + * value is one more than the stored value, since if any of the values + * are zero, then there would be no invocations (and hence no job). If + * there were 0 bits allocated to a given field, then it must be zero, + * and hence the real value is one. + * + * Vertex jobs reuse the same job dispatch mechanism as compute jobs, + * effectively doing glDispatchCompute(1, vertex_count, instance_count) + * where vertex count is the number of vertices. + */ + u32 invocation_count; + + u32 size_y_shift : 5; + u32 size_z_shift : 5; + u32 workgroups_x_shift : 6; + u32 workgroups_y_shift : 6; + u32 workgroups_z_shift : 6; + /* This is max(workgroups_x_shift, 2) in all the cases I've seen. */ + u32 workgroups_x_shift_2 : 4; + + u32 draw_mode : 4; + u32 unknown_draw : 22; + + /* This is the the same as workgroups_x_shift_2 in compute shaders, but + * always 5 for vertex jobs and 6 for tiler jobs. 
I suspect this has + * something to do with how many quads get put in the same execution + * engine, which is a balance (you don't want to starve the engine, but + * you also want to distribute work evenly). + */ + u32 workgroups_x_shift_3 : 6; + + + /* Negative of draw_start for TILER jobs from what I've seen */ + int32_t negative_start; + u32 zero1; + + /* Like many other strictly nonzero quantities, index_count is + * subtracted by one. For an indexed cube, this is equal to 35 = 6 + * faces * 2 triangles/per face * 3 vertices/per triangle - 1. That is, + * for an indexed draw, index_count is the number of actual vertices + * rendered whereas invocation_count is the number of unique vertices + * rendered (the number of times the vertex shader must be invoked). + * For non-indexed draws, this is just equal to invocation_count. */ + + u32 index_count; + + /* No hidden structure; literally just a pointer to an array of uint + * indices (width depends on flags). Thanks, guys, for not making my + * life insane for once! NULL for non-indexed draws. */ + + uintptr_t indices; +} __attribute__((packed)); + +/* Point size / line width can either be specified as a 32-bit float (for + * constant size) or as a [machine word size]-bit GPU pointer (for varying size). If a pointer + * is selected, by setting the appropriate MALI_DRAW_VARYING_SIZE bit in the tiler + * payload, the contents of varying_pointer will be intepreted as an array of + * fp16 sizes, one for each vertex. gl_PointSize is therefore implemented by + * creating a special MALI_R16F varying writing to varying_pointer. */ + +union midgard_primitive_size { + float constant; + uintptr_t pointer; +}; + +struct bifrost_vertex_only { + u32 unk2; /* =0x2 */ + + u32 zero0; + + u64 zero1; +} __attribute__((packed)); + +struct bifrost_tiler_heap_meta { + u32 zero; + u32 heap_size; + /* note: these are just guesses! */ + mali_ptr tiler_heap_start; + mali_ptr tiler_heap_free; + mali_ptr tiler_heap_end; + + /* hierarchy weights? but they're still 0 after the job has run... */ + u32 zeros[12]; +} __attribute__((packed)); + +struct bifrost_tiler_meta { + u64 zero0; + u16 hierarchy_mask; + u16 flags; + u16 width; + u16 height; + u64 zero1; + mali_ptr tiler_heap_meta; + /* TODO what is this used for? */ + u64 zeros[20]; +} __attribute__((packed)); + +struct bifrost_tiler_only { + /* 0x20 */ + union midgard_primitive_size primitive_size; + + mali_ptr tiler_meta; + + u64 zero1, zero2, zero3, zero4, zero5, zero6; + + u32 gl_enables; + u32 zero7; + u64 zero8; +} __attribute__((packed)); + +struct bifrost_scratchpad { + u32 zero; + u32 flags; // = 0x1f + /* This is a pointer to a CPU-inaccessible buffer, 16 pages, allocated + * during startup. It seems to serve the same purpose as the + * gpu_scratchpad in the SFBD for Midgard, although it's slightly + * larger. + */ + mali_ptr gpu_scratchpad; +} __attribute__((packed)); + +struct mali_vertex_tiler_postfix { + /* Zero for vertex jobs. Pointer to the position (gl_Position) varying + * output from the vertex shader for tiler jobs. + */ + + uintptr_t position_varying; + + /* An array of mali_uniform_buffer_meta's. The size is given by the + * shader_meta. + */ + uintptr_t uniform_buffers; + + /* This is a pointer to an array of pointers to the texture + * descriptors, number of pointers bounded by number of textures. 
The + * indirection is needed to accomodate varying numbers and sizes of + * texture descriptors */ + uintptr_t texture_trampoline; + + /* For OpenGL, from what I've seen, this is intimately connected to + * texture_meta. cwabbott says this is not the case under Vulkan, hence + * why this field is seperate (Midgard is Vulkan capable). Pointer to + * array of sampler descriptors (which are uniform in size) */ + uintptr_t sampler_descriptor; + + uintptr_t uniforms; + u8 flags : 4; + uintptr_t _shader_upper : MALI_SHORT_PTR_BITS - 4; /* struct shader_meta */ + uintptr_t attributes; /* struct attribute_buffer[] */ + uintptr_t attribute_meta; /* attribute_meta[] */ + uintptr_t varyings; /* struct attr */ + uintptr_t varying_meta; /* pointer */ + uintptr_t viewport; + uintptr_t occlusion_counter; /* A single bit as far as I can tell */ + + /* Note: on Bifrost, this isn't actually the FBD. It points to + * bifrost_scratchpad instead. However, it does point to the same thing + * in vertex and tiler jobs. + */ + mali_ptr framebuffer; +} __attribute__((packed)); + +struct midgard_payload_vertex_tiler { +#ifndef __LP64__ + union midgard_primitive_size primitive_size; +#endif + + struct mali_vertex_tiler_prefix prefix; + +#ifndef __LP64__ + u32 zero3; +#endif + + u16 gl_enables; // 0x5 + + /* Both zero for non-instanced draws. For instanced draws, a + * decomposition of padded_num_vertices. See the comments about the + * corresponding fields in mali_attr for context. */ + + unsigned instance_shift : 5; + unsigned instance_odd : 3; + + u8 zero4; + + /* Offset for first vertex in buffer */ + u32 draw_start; + + uintptr_t zero5; + + struct mali_vertex_tiler_postfix postfix; + +#ifdef __LP64__ + union midgard_primitive_size primitive_size; +#endif +} __attribute__((packed)); + +struct bifrost_payload_vertex { + struct mali_vertex_tiler_prefix prefix; + struct bifrost_vertex_only vertex; + struct mali_vertex_tiler_postfix postfix; +} __attribute__((packed)); + +struct bifrost_payload_tiler { + struct mali_vertex_tiler_prefix prefix; + struct bifrost_tiler_only tiler; + struct mali_vertex_tiler_postfix postfix; +} __attribute__((packed)); + +struct bifrost_payload_fused { + struct mali_vertex_tiler_prefix prefix; + struct bifrost_tiler_only tiler; + struct mali_vertex_tiler_postfix tiler_postfix; + u64 padding; /* zero */ + struct bifrost_vertex_only vertex; + struct mali_vertex_tiler_postfix vertex_postfix; +} __attribute__((packed)); + +/* Purposeful off-by-one in width, height fields. For example, a (64, 64) + * texture is stored as (63, 63) in these fields. This adjusts for that. + * There's an identical pattern in the framebuffer descriptor. Even vertex + * count fields work this way, hence the generic name -- integral fields that + * are strictly positive generally need this adjustment. */ + +#define MALI_POSITIVE(dim) (dim - 1) + +/* Opposite of MALI_POSITIVE, found in the depth_units field */ + +#define MALI_NEGATIVE(dim) (dim + 1) + +/* Used with wrapping. Incomplete (this is a 4-bit field...) 
*/ + +enum mali_wrap_mode { + MALI_WRAP_REPEAT = 0x8, + MALI_WRAP_CLAMP_TO_EDGE = 0x9, + MALI_WRAP_CLAMP_TO_BORDER = 0xB, + MALI_WRAP_MIRRORED_REPEAT = 0xC +}; + +/* Shared across both command stream and Midgard, and even with Bifrost */ + +enum mali_texture_type { + MALI_TEX_CUBE = 0x0, + MALI_TEX_1D = 0x1, + MALI_TEX_2D = 0x2, + MALI_TEX_3D = 0x3 +}; + +/* 8192x8192 */ +#define MAX_MIP_LEVELS (13) + +/* Cubemap bloats everything up */ +#define MAX_CUBE_FACES (6) + +/* For each pointer, there is an address and optionally also a stride */ +#define MAX_ELEMENTS (2) + +/* Corresponds to the type passed to glTexImage2D and so forth */ + +/* Flags for usage2 */ +#define MALI_TEX_MANUAL_STRIDE (0x20) + +struct mali_texture_format { + unsigned swizzle : 12; + enum mali_format format : 8; + + unsigned srgb : 1; + unsigned unknown1 : 1; + + enum mali_texture_type type : 2; + + unsigned usage2 : 8; +} __attribute__((packed)); + +struct mali_texture_descriptor { + uint16_t width; + uint16_t height; + uint16_t depth; + uint16_t array_size; + + struct mali_texture_format format; + + uint16_t unknown3; + + /* One for non-mipmapped, zero for mipmapped */ + uint8_t unknown3A; + + /* Zero for non-mipmapped, (number of levels - 1) for mipmapped */ + uint8_t nr_mipmap_levels; + + /* Swizzling is a single 32-bit word, broken up here for convenience. + * Here, swizzling refers to the ES 3.0 texture parameters for channel + * level swizzling, not the internal pixel-level swizzling which is + * below OpenGL's reach */ + + unsigned swizzle : 12; + unsigned swizzle_zero : 20; + + uint32_t unknown5; + uint32_t unknown6; + uint32_t unknown7; + + mali_ptr payload[MAX_MIP_LEVELS * MAX_CUBE_FACES * MAX_ELEMENTS]; +} __attribute__((packed)); + +/* Used as part of filter_mode */ + +#define MALI_LINEAR 0 +#define MALI_NEAREST 1 +#define MALI_MIP_LINEAR (0x18) + +/* Used to construct low bits of filter_mode */ + +#define MALI_TEX_MAG(mode) (((mode) & 1) << 0) +#define MALI_TEX_MIN(mode) (((mode) & 1) << 1) + +#define MALI_TEX_MAG_MASK (1) +#define MALI_TEX_MIN_MASK (2) + +#define MALI_FILTER_NAME(filter) (filter ? "MALI_NEAREST" : "MALI_LINEAR") + +/* Used for lod encoding. Thanks @urjaman for pointing out these routines can + * be cleaned up a lot. */ + +#define DECODE_FIXED_16(x) ((float) (x / 256.0)) + +static inline uint16_t +FIXED_16(float x) +{ + /* Clamp inputs, accounting for float error */ + float max_lod = (32.0 - (1.0 / 512.0)); + + x = ((x > max_lod) ? max_lod : ((x < 0.0) ? 0.0 : x)); + + return (int) (x * 256.0); +} + +struct mali_sampler_descriptor { + uint32_t filter_mode; + + /* Fixed point. Upper 8-bits is before the decimal point, although it + * caps [0-31]. Lower 8-bits is after the decimal point: int(round(x * + * 256)) */ + + uint16_t min_lod; + uint16_t max_lod; + + /* All one word in reality, but packed a bit */ + + enum mali_wrap_mode wrap_s : 4; + enum mali_wrap_mode wrap_t : 4; + enum mali_wrap_mode wrap_r : 4; + enum mali_alt_func compare_func : 3; + + /* No effect on 2D textures. For cubemaps, set for ES3 and clear for + * ES2, controlling seamless cubemapping */ + unsigned seamless_cube_map : 1; + + unsigned zero : 16; + + uint32_t zero2; + float border_color[4]; +} __attribute__((packed)); + +/* viewport0/viewport1 form the arguments to glViewport. viewport1 is + * modified by MALI_POSITIVE; viewport0 is as-is. 
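 * As a hedged example: glViewport(0, 0, 1280, 720) would presumably be
 * stored as viewport0 = { 0, 0 } and
 * viewport1 = { MALI_POSITIVE(1280), MALI_POSITIVE(720) } = { 1279, 719 }.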
+ */ + +struct mali_viewport { + /* XY clipping planes */ + float clip_minx; + float clip_miny; + float clip_maxx; + float clip_maxy; + + /* Depth clipping planes */ + float clip_minz; + float clip_maxz; + + u16 viewport0[2]; + u16 viewport1[2]; +} __attribute__((packed)); + +/* From presentations, 16x16 tiles externally. Use shift for fast computation + * of tile numbers. */ + +#define MALI_TILE_SHIFT 4 +#define MALI_TILE_LENGTH (1 << MALI_TILE_SHIFT) + +/* Tile coordinates are stored as a compact u32, as only 12 bits are needed to + * each component. Notice that this provides a theoretical upper bound of (1 << + * 12) = 4096 tiles in each direction, addressing a maximum framebuffer of size + * 65536x65536. Multiplying that together, times another four given that Mali + * framebuffers are 32-bit ARGB8888, means that this upper bound would take 16 + * gigabytes of RAM just to store the uncompressed framebuffer itself, let + * alone rendering in real-time to such a buffer. + * + * Nice job, guys.*/ + +/* From mali_kbase_10969_workaround.c */ +#define MALI_X_COORD_MASK 0x00000FFF +#define MALI_Y_COORD_MASK 0x0FFF0000 + +/* Extract parts of a tile coordinate */ + +#define MALI_TILE_COORD_X(coord) ((coord) & MALI_X_COORD_MASK) +#define MALI_TILE_COORD_Y(coord) (((coord) & MALI_Y_COORD_MASK) >> 16) +#define MALI_TILE_COORD_FLAGS(coord) ((coord) & ~(MALI_X_COORD_MASK | MALI_Y_COORD_MASK)) + +/* No known flags yet, but just in case...? */ + +#define MALI_TILE_NO_FLAG (0) + +/* Helpers to generate tile coordinates based on the boundary coordinates in + * screen space. So, with the bounds (0, 0) to (128, 128) for the screen, these + * functions would convert it to the bounding tiles (0, 0) to (7, 7). + * Intentional "off-by-one"; finding the tile number is a form of fencepost + * problem. */ + +#define MALI_MAKE_TILE_COORDS(X, Y) ((X) | ((Y) << 16)) +#define MALI_BOUND_TO_TILE(B, bias) ((B - bias) >> MALI_TILE_SHIFT) +#define MALI_COORDINATE_TO_TILE(W, H, bias) MALI_MAKE_TILE_COORDS(MALI_BOUND_TO_TILE(W, bias), MALI_BOUND_TO_TILE(H, bias)) +#define MALI_COORDINATE_TO_TILE_MIN(W, H) MALI_COORDINATE_TO_TILE(W, H, 0) +#define MALI_COORDINATE_TO_TILE_MAX(W, H) MALI_COORDINATE_TO_TILE(W, H, 1) + +struct mali_payload_fragment { + u32 min_tile_coord; + u32 max_tile_coord; + mali_ptr framebuffer; +} __attribute__((packed)); + +/* Single Framebuffer Descriptor */ + +/* Flags apply to format. With just MSAA_A and MSAA_B, the framebuffer is + * configured for 4x. With MSAA_8, it is configured for 8x. */ + +#define MALI_FRAMEBUFFER_MSAA_8 (1 << 3) +#define MALI_FRAMEBUFFER_MSAA_A (1 << 4) +#define MALI_FRAMEBUFFER_MSAA_B (1 << 23) + +/* Fast/slow based on whether all three buffers are cleared at once */ + +#define MALI_CLEAR_FAST (1 << 18) +#define MALI_CLEAR_SLOW (1 << 28) +#define MALI_CLEAR_SLOW_STENCIL (1 << 31) + +/* Configures hierarchical tiling on Midgard for both SFBD/MFBD (embedded + * within the larget framebuffer descriptor). Analogous to + * bifrost_tiler_heap_meta and bifrost_tiler_meta*/ + +struct midgard_tiler_descriptor { + /* Size of the entire polygon list; see pan_tiler.c for the + * computation. It's based on hierarchical tiling */ + + u32 polygon_list_size; + + /* Name known from the replay workaround in the kernel. What exactly is + * flagged here is less known. We do that (tiler_hierarchy_mask & 0x1ff) + * specifies a mask of hierarchy weights, which explains some of the + * performance mysteries around setting it. 
We also see the bottom bit + * of tiler_flags set in the kernel, but no comment why. */ + + u16 hierarchy_mask; + u16 flags; + + /* See mali_tiler.c for an explanation */ + mali_ptr polygon_list; + mali_ptr polygon_list_body; + + /* Names based on we see symmetry with replay jobs which name these + * explicitly */ + + mali_ptr heap_start; /* tiler heap_free_address */ + mali_ptr heap_end; + + /* Hierarchy weights. We know these are weights based on the kernel, + * but I've never seen them be anything other than zero */ + u32 weights[8]; +}; + +struct mali_single_framebuffer { + u32 unknown1; + u32 unknown2; + u64 unknown_address_0; + u64 zero1; + u64 zero0; + + /* Exact format is ironically not known, since EGL is finnicky with the + * blob. MSAA, colourspace, etc are configured here. */ + + u32 format; + + u32 clear_flags; + u32 zero2; + + /* Purposeful off-by-one in these fields should be accounted for by the + * MALI_DIMENSION macro */ + + u16 width; + u16 height; + + u32 zero3[8]; + + /* By default, the framebuffer is upside down from OpenGL's + * perspective. Set framebuffer to the end and negate the stride to + * flip in the Y direction */ + + mali_ptr framebuffer; + int32_t stride; + + u32 zero4; + + /* Depth and stencil buffers are interleaved, it appears, as they are + * set to the same address in captures. Both fields set to zero if the + * buffer is not being cleared. Depending on GL_ENABLE magic, you might + * get a zero enable despite the buffer being present; that still is + * disabled. */ + + mali_ptr depth_buffer; // not SAME_VA + u64 depth_buffer_enable; + + mali_ptr stencil_buffer; // not SAME_VA + u64 stencil_buffer_enable; + + u32 clear_color_1; // RGBA8888 from glClear, actually used by hardware + u32 clear_color_2; // always equal, but unclear function? + u32 clear_color_3; // always equal, but unclear function? + u32 clear_color_4; // always equal, but unclear function? + + /* Set to zero if not cleared */ + + float clear_depth_1; // float32, ditto + float clear_depth_2; // float32, ditto + float clear_depth_3; // float32, ditto + float clear_depth_4; // float32, ditto + + u32 clear_stencil; // Exactly as it appears in OpenGL + + u32 zero6[7]; + + struct midgard_tiler_descriptor tiler; + + /* More below this, maybe */ +} __attribute__((packed)); + +/* On Midgard, this "framebuffer descriptor" is used for the framebuffer field + * of compute jobs. Superficially resembles a single framebuffer descriptor */ + +struct mali_compute_fbd { + u32 unknown1[16]; +} __attribute__((packed)); + +/* Format bits for the render target flags */ + +#define MALI_MFBD_FORMAT_MSAA (1 << 1) +#define MALI_MFBD_FORMAT_SRGB (1 << 2) + +enum mali_mfbd_block_format { + MALI_MFBD_BLOCK_TILED = 0x0, + MALI_MFBD_BLOCK_UNKNOWN = 0x1, + MALI_MFBD_BLOCK_LINEAR = 0x2, + MALI_MFBD_BLOCK_AFBC = 0x3, +}; + +struct mali_rt_format { + unsigned unk1 : 32; + unsigned unk2 : 3; + + unsigned nr_channels : 2; /* MALI_POSITIVE */ + + unsigned unk3 : 5; + enum mali_mfbd_block_format block : 2; + unsigned flags : 4; + + unsigned swizzle : 12; + + unsigned unk4 : 4; +} __attribute__((packed)); + +struct bifrost_render_target { + struct mali_rt_format format; + + u64 zero1; + + union { + struct { + /* Stuff related to ARM Framebuffer Compression. When AFBC is enabled, + * there is an extra metadata buffer that contains 16 bytes per tile. + * The framebuffer needs to be the same size as before, since we don't + * know ahead of time how much space it will take up. 
The + * framebuffer_stride is set to 0, since the data isn't stored linearly + * anymore. + */ + + mali_ptr metadata; + u32 stride; // stride in units of tiles + u32 unk; // = 0x20000 + } afbc; + + struct { + /* Heck if I know */ + u64 unk; + mali_ptr pointer; + } chunknown; + }; + + mali_ptr framebuffer; + + u32 zero2 : 4; + u32 framebuffer_stride : 28; // in units of bytes + u32 zero3; + + u32 clear_color_1; // RGBA8888 from glClear, actually used by hardware + u32 clear_color_2; // always equal, but unclear function? + u32 clear_color_3; // always equal, but unclear function? + u32 clear_color_4; // always equal, but unclear function? +} __attribute__((packed)); + +/* An optional part of bifrost_framebuffer. It comes between the main structure + * and the array of render targets. It must be included if any of these are + * enabled: + * + * - Transaction Elimination + * - Depth/stencil + * - TODO: Anything else? + */ + +/* Flags field: note, these are guesses */ + +#define MALI_EXTRA_PRESENT (0x400) +#define MALI_EXTRA_AFBC (0x20) +#define MALI_EXTRA_AFBC_ZS (0x10) +#define MALI_EXTRA_ZS (0x4) + +struct bifrost_fb_extra { + mali_ptr checksum; + /* Each tile has an 8 byte checksum, so the stride is "width in tiles * 8" */ + u32 checksum_stride; + + u32 flags; + + union { + /* Note: AFBC is only allowed for 24/8 combined depth/stencil. */ + struct { + mali_ptr depth_stencil_afbc_metadata; + u32 depth_stencil_afbc_stride; // in units of tiles + u32 zero1; + + mali_ptr depth_stencil; + + u64 padding; + } ds_afbc; + + struct { + /* Depth becomes depth/stencil in case of combined D/S */ + mali_ptr depth; + u32 depth_stride_zero : 4; + u32 depth_stride : 28; + u32 zero1; + + mali_ptr stencil; + u32 stencil_stride_zero : 4; + u32 stencil_stride : 28; + u32 zero2; + } ds_linear; + }; + + + u64 zero3, zero4; +} __attribute__((packed)); + +/* Flags for mfbd_flags */ + +/* Enables writing depth results back to main memory (rather than keeping them + * on-chip in the tile buffer and then discarding) */ + +#define MALI_MFBD_DEPTH_WRITE (1 << 10) + +/* The MFBD contains the extra bifrost_fb_extra section */ + +#define MALI_MFBD_EXTRA (1 << 13) + +struct bifrost_framebuffer { + u32 unk0; // = 0x10 + + u32 unknown2; // = 0x1f, same as SFBD + mali_ptr scratchpad; + + /* 0x10 */ + mali_ptr sample_locations; + mali_ptr unknown1; + /* 0x20 */ + u16 width1, height1; + u32 zero3; + u16 width2, height2; + u32 unk1 : 19; // = 0x01000 + u32 rt_count_1 : 2; // off-by-one (use MALI_POSITIVE) + u32 unk2 : 3; // = 0 + u32 rt_count_2 : 3; // no off-by-one + u32 zero4 : 5; + /* 0x30 */ + u32 clear_stencil : 8; + u32 mfbd_flags : 24; // = 0x100 + float clear_depth; + + struct midgard_tiler_descriptor tiler; + + /* optional: struct bifrost_fb_extra extra */ + /* struct bifrost_render_target rts[] */ +} __attribute__((packed)); + +#endif /* __PANFROST_JOB_H__ */ diff --git a/src/panfrost/include/panfrost-misc.h b/src/panfrost/include/panfrost-misc.h new file mode 100644 index 00000000000..82363d8730b --- /dev/null +++ b/src/panfrost/include/panfrost-misc.h @@ -0,0 +1,47 @@ +/* + * © Copyright 2017-2018 The Panfrost Community + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is 
furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __PANFROST_MISC_H__ +#define __PANFROST_MISC_H__ + +#include <inttypes.h> + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +typedef uint64_t mali_ptr; + +#define MALI_PTR_FMT "0x%" PRIx64 + +/* FIXME: put this somewhere more fitting */ +#define MALI_MEM_MAP_TRACKING_HANDLE (3ull << 12) + +#endif diff --git a/src/panfrost/meson.build b/src/panfrost/meson.build index 6b167d04b9c..9c12ff8fd8e 100644 --- a/src/panfrost/meson.build +++ b/src/panfrost/meson.build @@ -19,6 +19,64 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -inc_panfrost = include_directories(['.', 'shared']) +inc_panfrost_hw = include_directories([ + 'include' +]) + +inc_panfrost = include_directories([ + '.', 'include', 'shared', 'midgard', 'bifrost' +]) subdir('shared') +subdir('midgard') +subdir('bifrost') +subdir('pandecode') + +files_pandecode = files( + 'pandecode/cmdline.c', + 'pandecode/common.c', + 'pandecode/decode.c', + 'pandecode/pan_pretty_print.c', + + 'midgard/disassemble.c', + 'midgard/midgard_ops.c', + 'bifrost/disassemble.c', +) + +pandecode = executable( + 'pandecoder', + files_pandecode, + include_directories : [inc_common, inc_include, inc_src, inc_panfrost], + dependencies : [ + dep_thread, + ], + link_with : [ + libmesa_util + ], + build_by_default : true +) + +files_bifrost = files( + 'bifrost/cmdline.c', +) + +bifrost_compiler = executable( + 'bifrost_compiler', + [files_bifrost], + include_directories : [ + inc_common, + inc_include, + inc_src, + inc_panfrost, + ], + dependencies : [ + dep_thread, + idep_nir + ], + link_with : [ + libglsl_standalone, + libmesa_util, + libpanfrost_bifrost + ], + build_by_default : true +) diff --git a/src/panfrost/midgard/compiler.h b/src/panfrost/midgard/compiler.h new file mode 100644 index 00000000000..79fe7dfc78a --- /dev/null +++ b/src/panfrost/midgard/compiler.h @@ -0,0 +1,456 @@ +/* + * Copyright (C) 2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
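Since mali_ptr is a raw 64-bit GPU virtual address even on 32-bit CPUs, it is printed with MALI_PTR_FMT rather than %p. A minimal hypothetical example:

#include <stdio.h>

static void
print_mali_ptr_example(const char *name, mali_ptr ptr)
{
        printf("%s = " MALI_PTR_FMT "\n", name, ptr);
}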
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MDG_COMPILER_H +#define _MDG_COMPILER_H + +#include "midgard.h" +#include "helpers.h" +#include "midgard_compile.h" + +#include "util/hash_table.h" +#include "util/u_dynarray.h" +#include "util/set.h" +#include "util/list.h" + +#include "main/mtypes.h" +#include "compiler/nir_types.h" +#include "compiler/nir/nir.h" + +/* Forward declare */ +struct midgard_block; + +/* Target types. Defaults to TARGET_GOTO (the type corresponding directly to + * the hardware), hence why that must be zero. TARGET_DISCARD signals this + * instruction is actually a discard op. */ + +#define TARGET_GOTO 0 +#define TARGET_BREAK 1 +#define TARGET_CONTINUE 2 +#define TARGET_DISCARD 3 + +typedef struct midgard_branch { + /* If conditional, the condition is specified in r31.w */ + bool conditional; + + /* For conditionals, if this is true, we branch on FALSE. If false, we branch on TRUE. */ + bool invert_conditional; + + /* Branch targets: the start of a block, the start of a loop (continue), the end of a loop (break). Value is one of TARGET_ */ + unsigned target_type; + + /* The actual target */ + union { + int target_block; + int target_break; + int target_continue; + }; +} midgard_branch; + +/* Instruction arguments represented as block-local SSA indices, rather than + * registers. Negative values mean unused. */ + +typedef struct { + int src0; + int src1; + int dest; + + /* src1 is -not- SSA but instead a 16-bit inline constant to be smudged + * in. Only valid for ALU ops. */ + bool inline_constant; +} ssa_args; + +/* Generic in-memory data type repesenting a single logical instruction, rather + * than a single instruction group. This is the preferred form for code gen. + * Multiple midgard_insturctions will later be combined during scheduling, + * though this is not represented in this structure. Its format bridges + * the low-level binary representation with the higher level semantic meaning. + * + * Notably, it allows registers to be specified as block local SSA, for code + * emitted before the register allocation pass. + */ + +typedef struct midgard_instruction { + /* Must be first for casting */ + struct list_head link; + + unsigned type; /* ALU, load/store, texture */ + + /* If the register allocator has not run yet... */ + ssa_args ssa_args; + + /* Special fields for an ALU instruction */ + midgard_reg_info registers; + + /* I.e. (1 << alu_bit) */ + int unit; + + /* When emitting bundle, should this instruction have a break forced + * before it? Used for r31 writes which are valid only within a single + * bundle and *need* to happen as early as possible... this is a hack, + * TODO remove when we have a scheduler */ + bool precede_break; + + bool has_constants; + float constants[4]; + uint16_t inline_constant; + bool has_blend_constant; + + bool compact_branch; + bool writeout; + bool prepacked_branch; + + /* Masks in a saneish format. One bit per channel, not packed fancy. 
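A hypothetical instance of the branch structure above: a conditional break out of a loop. Which loop target_break identifies is up to the rest of the compiler; the value here is arbitrary.

static const midgard_branch example_break = {
        .conditional = true,
        .invert_conditional = false,   /* take the branch when r31.w is true */
        .target_type = TARGET_BREAK,
        .target_break = 1,             /* arbitrary loop identifier */
};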
+ * Use this instead of the op specific ones, and switch over at emit + * time */ + uint16_t mask; + + union { + midgard_load_store_word load_store; + midgard_vector_alu alu; + midgard_texture_word texture; + midgard_branch_extended branch_extended; + uint16_t br_compact; + + /* General branch, rather than packed br_compact. Higher level + * than the other components */ + midgard_branch branch; + }; +} midgard_instruction; + +typedef struct midgard_block { + /* Link to next block. Must be first for mir_get_block */ + struct list_head link; + + /* List of midgard_instructions emitted for the current block */ + struct list_head instructions; + + bool is_scheduled; + + /* List of midgard_bundles emitted (after the scheduler has run) */ + struct util_dynarray bundles; + + /* Number of quadwords _actually_ emitted, as determined after scheduling */ + unsigned quadword_count; + + /* Successors: always one forward (the block after us), maybe + * one backwards (for a backward branch). No need for a second + * forward, since graph traversal would get there eventually + * anyway */ + struct midgard_block *successors[2]; + unsigned nr_successors; + + /* The successors pointer form a graph, and in the case of + * complex control flow, this graph has a cycles. To aid + * traversal during liveness analysis, we have a visited? + * boolean for passes to use as they see fit, provided they + * clean up later */ + bool visited; +} midgard_block; + +typedef struct midgard_bundle { + /* Tag for the overall bundle */ + int tag; + + /* Instructions contained by the bundle */ + int instruction_count; + midgard_instruction *instructions[5]; + + /* Bundle-wide ALU configuration */ + int padding; + int control; + bool has_embedded_constants; + float constants[4]; + bool has_blend_constant; +} midgard_bundle; + +typedef struct compiler_context { + nir_shader *nir; + gl_shader_stage stage; + + /* Is internally a blend shader? Depends on stage == FRAGMENT */ + bool is_blend; + + /* Tracking for blend constant patching */ + int blend_constant_offset; + + /* Current NIR function */ + nir_function *func; + + /* Unordered list of midgard_blocks */ + int block_count; + struct list_head blocks; + + midgard_block *initial_block; + midgard_block *previous_source_block; + midgard_block *final_block; + + /* List of midgard_instructions emitted for the current block */ + midgard_block *current_block; + + /* The current "depth" of the loop, for disambiguating breaks/continues + * when using nested loops */ + int current_loop_depth; + + /* Total number of loops for shader-db */ + unsigned loop_count; + + /* Constants which have been loaded, for later inlining */ + struct hash_table_u64 *ssa_constants; + + /* SSA values / registers which have been aliased. Naively, these + * demand a fmov output; instead, we alias them in a later pass to + * avoid the wasted op. + * + * A note on encoding: to avoid dynamic memory management here, rather + * than ampping to a pointer, we map to the source index; the key + * itself is just the destination index. */ + + struct hash_table_u64 *ssa_to_alias; + struct set *leftover_ssa_to_alias; + + /* Actual SSA-to-register for RA */ + struct hash_table_u64 *ssa_to_register; + + /* Mapping of hashes computed from NIR indices to the sequential temp indices ultimately used in MIR */ + struct hash_table_u64 *hash_to_temp; + int temp_count; + int max_hash; + + /* Just the count of the max register used. Higher count => higher + * register pressure */ + int work_registers; + + /* Used for cont/last hinting. 
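The successors array and visited flag above are enough for a simple forward traversal. A sketch, assuming the caller clears visited on every block first; mir_dfs_example and its visit callback are hypothetical, not part of the compiler.

static void
mir_dfs_example(midgard_block *block, void (*visit)(midgard_block *))
{
        if (block->visited)
                return;

        block->visited = true;
        visit(block);

        /* At most two successors: fallthrough and an optional back edge */
        for (unsigned i = 0; i < block->nr_successors; ++i)
                mir_dfs_example(block->successors[i], visit);
}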
Increase when a tex op is added. + * Decrease when a tex op is removed. */ + int texture_op_count; + + /* Mapping of texture register -> SSA index for unaliasing */ + int texture_index[2]; + + /* If any path hits a discard instruction */ + bool can_discard; + + /* The number of uniforms allowable for the fast path */ + int uniform_cutoff; + + /* Count of instructions emitted from NIR overall, across all blocks */ + int instruction_count; + + /* Alpha ref value passed in */ + float alpha_ref; + + /* The index corresponding to the fragment output */ + unsigned fragment_output; + + /* The mapping of sysvals to uniforms, the count, and the off-by-one inverse */ + unsigned sysvals[MAX_SYSVAL_COUNT]; + unsigned sysval_count; + struct hash_table_u64 *sysval_to_id; +} compiler_context; + +/* Helpers for manipulating the above structures (forming the driver IR) */ + +/* Append instruction to end of current block */ + +static inline midgard_instruction * +mir_upload_ins(struct midgard_instruction ins) +{ + midgard_instruction *heap = malloc(sizeof(ins)); + memcpy(heap, &ins, sizeof(ins)); + return heap; +} + +static inline void +emit_mir_instruction(struct compiler_context *ctx, struct midgard_instruction ins) +{ + list_addtail(&(mir_upload_ins(ins))->link, &ctx->current_block->instructions); +} + +static inline void +mir_insert_instruction_before(struct midgard_instruction *tag, struct midgard_instruction ins) +{ + list_addtail(&(mir_upload_ins(ins))->link, &tag->link); +} + +static inline void +mir_remove_instruction(struct midgard_instruction *ins) +{ + list_del(&ins->link); +} + +static inline midgard_instruction* +mir_prev_op(struct midgard_instruction *ins) +{ + return list_last_entry(&(ins->link), midgard_instruction, link); +} + +static inline midgard_instruction* +mir_next_op(struct midgard_instruction *ins) +{ + return list_first_entry(&(ins->link), midgard_instruction, link); +} + +#define mir_foreach_block(ctx, v) \ + list_for_each_entry(struct midgard_block, v, &ctx->blocks, link) + +#define mir_foreach_block_from(ctx, from, v) \ + list_for_each_entry_from(struct midgard_block, v, from, &ctx->blocks, link) + +#define mir_foreach_instr(ctx, v) \ + list_for_each_entry(struct midgard_instruction, v, &ctx->current_block->instructions, link) + +#define mir_foreach_instr_safe(ctx, v) \ + list_for_each_entry_safe(struct midgard_instruction, v, &ctx->current_block->instructions, link) + +#define mir_foreach_instr_in_block(block, v) \ + list_for_each_entry(struct midgard_instruction, v, &block->instructions, link) + +#define mir_foreach_instr_in_block_safe(block, v) \ + list_for_each_entry_safe(struct midgard_instruction, v, &block->instructions, link) + +#define mir_foreach_instr_in_block_safe_rev(block, v) \ + list_for_each_entry_safe_rev(struct midgard_instruction, v, &block->instructions, link) + +#define mir_foreach_instr_in_block_from(block, v, from) \ + list_for_each_entry_from(struct midgard_instruction, v, from, &block->instructions, link) + +#define mir_foreach_instr_in_block_from_rev(block, v, from) \ + list_for_each_entry_from_rev(struct midgard_instruction, v, from, &block->instructions, link) + +#define mir_foreach_bundle_in_block(block, v) \ + util_dynarray_foreach(&block->bundles, midgard_bundle, v) + +#define mir_foreach_instr_global(ctx, v) \ + mir_foreach_block(ctx, v_block) \ + mir_foreach_instr_in_block(v_block, v) + + +static inline midgard_instruction * +mir_last_in_block(struct midgard_block *block) +{ + return list_last_entry(&block->instructions, struct 
midgard_instruction, link); +} + +static inline midgard_block * +mir_get_block(compiler_context *ctx, int idx) +{ + struct list_head *lst = &ctx->blocks; + + while ((idx--) + 1) + lst = lst->next; + + return (struct midgard_block *) lst; +} + +static inline bool +mir_is_alu_bundle(midgard_bundle *bundle) +{ + return IS_ALU(bundle->tag); +} + +/* MIR manipulation */ + +void mir_rewrite_index(compiler_context *ctx, unsigned old, unsigned new); +void mir_rewrite_index_src(compiler_context *ctx, unsigned old, unsigned new); +void mir_rewrite_index_dst(compiler_context *ctx, unsigned old, unsigned new); + +/* MIR printing */ + +void mir_print_instruction(midgard_instruction *ins); +void mir_print_bundle(midgard_bundle *ctx); +void mir_print_block(midgard_block *block); +void mir_print_shader(compiler_context *ctx); + +/* MIR goodies */ + +static const midgard_vector_alu_src blank_alu_src = { + .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), +}; + +static const midgard_vector_alu_src blank_alu_src_xxxx = { + .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X), +}; + +static const midgard_scalar_alu_src blank_scalar_alu_src = { + .full = true +}; + +/* Used for encoding the unused source of 1-op instructions */ +static const midgard_vector_alu_src zero_alu_src = { 0 }; + +/* 'Intrinsic' move for aliasing */ + +static inline midgard_instruction +v_mov(unsigned src, midgard_vector_alu_src mod, unsigned dest) +{ + midgard_instruction ins = { + .type = TAG_ALU_4, + .mask = 0xF, + .ssa_args = { + .src0 = SSA_UNUSED_1, + .src1 = src, + .dest = dest, + }, + .alu = { + .op = midgard_alu_op_imov, + .reg_mode = midgard_reg_mode_32, + .dest_override = midgard_dest_override_none, + .outmod = midgard_outmod_int_wrap, + .src1 = vector_alu_srco_unsigned(zero_alu_src), + .src2 = vector_alu_srco_unsigned(mod) + }, + }; + + return ins; +} + +/* Scheduling */ + +void schedule_program(compiler_context *ctx); + +/* Register allocation */ + +struct ra_graph; + +struct ra_graph* allocate_registers(compiler_context *ctx); +void install_registers(compiler_context *ctx, struct ra_graph *g); +bool mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src); +bool mir_has_multiple_writes(compiler_context *ctx, int src); + +void mir_create_pipeline_registers(compiler_context *ctx); + +/* Final emission */ + +void emit_binary_bundle( + compiler_context *ctx, + midgard_bundle *bundle, + struct util_dynarray *emission, + int next_tag); + +/* NIR stuff */ + +bool +nir_undef_to_zero(nir_shader *shader); + +#endif diff --git a/src/panfrost/midgard/cppwrap.cpp b/src/panfrost/midgard/cppwrap.cpp new file mode 100644 index 00000000000..cf2ca3b7a11 --- /dev/null +++ b/src/panfrost/midgard/cppwrap.cpp @@ -0,0 +1,9 @@ +struct exec_list; + +bool do_mat_op_to_vec(struct exec_list *instructions); + +extern "C" { + bool c_do_mat_op_to_vec(struct exec_list *instructions) { + return do_mat_op_to_vec(instructions); + } +}; diff --git a/src/panfrost/midgard/disassemble.c b/src/panfrost/midgard/disassemble.c new file mode 100644 index 00000000000..bed803162f3 --- /dev/null +++ b/src/panfrost/midgard/disassemble.c @@ -0,0 +1,1317 @@ +/* Author(s): + * Connor Abbott + * Alyssa Rosenzweig + * + * Copyright (c) 2013 Connor Abbott ([email protected]) + * Copyright (c) 2018 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to 
deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <stdio.h> +#include <stdint.h> +#include <assert.h> +#include <inttypes.h> +#include <ctype.h> +#include <string.h> +#include "midgard.h" +#include "midgard-parse.h" +#include "midgard_ops.h" +#include "disassemble.h" +#include "helpers.h" +#include "util/half_float.h" +#include "util/u_math.h" + +#define DEFINE_CASE(define, str) case define: { printf(str); break; } + +static bool is_instruction_int = false; + +/* Prints a short form of the tag for branching, the minimum needed to be + * legible and unambiguous */ + +static void +print_tag_short(unsigned tag) +{ + switch (midgard_word_types[tag]) { + case midgard_word_type_texture: + printf("tex/%X", tag); + break; + + case midgard_word_type_load_store: + printf("ldst"); + break; + + case midgard_word_type_alu: + printf("alu%d/%X", midgard_word_size[tag], tag); + break; + + default: + printf("%s%X", (tag > 0) ? "" : "unk", tag); + break; + } +} + +static void +print_alu_opcode(midgard_alu_op op) +{ + bool int_op = false; + + if (alu_opcode_props[op].name) { + printf("%s", alu_opcode_props[op].name); + + int_op = midgard_is_integer_op(op); + } else + printf("alu_op_%02X", op); + + /* For constant analysis */ + is_instruction_int = int_op; +} + +static void +print_ld_st_opcode(midgard_load_store_op op) +{ + if (load_store_opcode_names[op]) + printf("%s", load_store_opcode_names[op]); + else + printf("ldst_op_%02X", op); +} + +static bool is_embedded_constant_half = false; +static bool is_embedded_constant_int = false; + +static char +prefix_for_bits(unsigned bits) +{ + switch (bits) { + case 8: + return 'q'; + case 16: + return 'h'; + case 64: + return 'd'; + default: + return 0; + } +} + +static void +print_reg(unsigned reg, unsigned bits) +{ + /* Perform basic static analysis for expanding constants correctly */ + + if (reg == 26) { + is_embedded_constant_int = is_instruction_int; + is_embedded_constant_half = (bits < 32); + } + + char prefix = prefix_for_bits(bits); + + if (prefix) + putchar(prefix); + + printf("r%u", reg); +} + +static char *outmod_names_float[4] = { + "", + ".pos", + ".unk2", + ".sat" +}; + +static char *outmod_names_int[4] = { + ".isat", + ".usat", + "", + ".hi" +}; + +static char *srcmod_names_int[4] = { + "sext(", + "zext(", + "", + "(" +}; + +static void +print_outmod(unsigned outmod, bool is_int) +{ + printf("%s", is_int ? outmod_names_int[outmod] : + outmod_names_float[outmod]); +} + +static void +print_quad_word(uint32_t *words, unsigned tabs) +{ + unsigned i; + + for (i = 0; i < 4; i++) + printf("0x%08X%s ", words[i], i == 3 ? 
"" : ","); + + printf("\n"); +} + +static const char components[16] = "xyzwefghijklmnop"; + +/* Helper to print 4 chars of a swizzle */ +static void +print_swizzle_helper(unsigned swizzle, bool upper) +{ + for (unsigned i = 0; i < 4; ++i) { + unsigned c = (swizzle >> (i * 2)) & 3; + c += upper*4; + printf("%c", components[c]); + } +} + +/* Helper to print 8 chars of a swizzle, duplicating over */ +static void +print_swizzle_helper_8(unsigned swizzle, bool upper) +{ + for (unsigned i = 0; i < 4; ++i) { + unsigned c = (swizzle >> (i * 2)) & 3; + c *= 2; + c += upper*8; + printf("%c%c", components[c], components[c+1]); + } +} + +static void +print_swizzle_vec16(unsigned swizzle, bool rep_high, bool rep_low, + midgard_dest_override override) +{ + printf("."); + + if (override == midgard_dest_override_upper) { + if (rep_high) + printf(" /* rep_high */ "); + if (rep_low) + printf(" /* rep_low */ "); + + if (!rep_high && rep_low) + print_swizzle_helper_8(swizzle, true); + else + print_swizzle_helper_8(swizzle, false); + } else { + print_swizzle_helper_8(swizzle, rep_high & 1); + print_swizzle_helper_8(swizzle, !rep_low & 1); + } +} + +static void +print_swizzle_vec8(unsigned swizzle, bool rep_high, bool rep_low) +{ + printf("."); + + print_swizzle_helper(swizzle, rep_high & 1); + print_swizzle_helper(swizzle, !rep_low & 1); +} + +static void +print_swizzle_vec4(unsigned swizzle, bool rep_high, bool rep_low) +{ + if (rep_high) + printf(" /* rep_high */ "); + if (rep_low) + printf(" /* rep_low */ "); + + if (swizzle == 0xE4) return; /* xyzw */ + + printf("."); + print_swizzle_helper(swizzle, 0); +} +static void +print_swizzle_vec2(unsigned swizzle, bool rep_high, bool rep_low) +{ + if (rep_high) + printf(" /* rep_high */ "); + if (rep_low) + printf(" /* rep_low */ "); + + if (swizzle == 0xE4) return; /* XY */ + + printf("."); + + for (unsigned i = 0; i < 4; i += 2) { + unsigned a = (swizzle >> (i * 2)) & 3; + unsigned b = (swizzle >> ((i+1) * 2)) & 3; + + /* Normally we're adjacent, but if there's an issue, don't make + * it ambiguous */ + + if (a & 0x1) + printf("[%c%c]", components[a], components[b]); + else if (a == b) + printf("%c", components[a >> 1]); + else if (b == (a + 1)) + printf("%c", "XY"[a >> 1]); + else + printf("[%c%c]", components[a], components[b]); + } +} + +static int +bits_for_mode(midgard_reg_mode mode) +{ + switch (mode) { + case midgard_reg_mode_8: + return 8; + case midgard_reg_mode_16: + return 16; + case midgard_reg_mode_32: + return 32; + case midgard_reg_mode_64: + return 64; + default: + return 0; + } +} + +static int +bits_for_mode_halved(midgard_reg_mode mode, bool half) +{ + unsigned bits = bits_for_mode(mode); + + if (half) + bits >>= 1; + + return bits; +} + +static void +print_vector_src(unsigned src_binary, + midgard_reg_mode mode, unsigned reg, + midgard_dest_override override, bool is_int) +{ + midgard_vector_alu_src *src = (midgard_vector_alu_src *)&src_binary; + + /* Modifiers change meaning depending on the op's context */ + + midgard_int_mod int_mod = src->mod; + + if (is_int) { + printf("%s", srcmod_names_int[int_mod]); + } else { + if (src->mod & MIDGARD_FLOAT_MOD_NEG) + printf("-"); + + if (src->mod & MIDGARD_FLOAT_MOD_ABS) + printf("abs("); + } + + //register + unsigned bits = bits_for_mode_halved(mode, src->half); + print_reg(reg, bits); + + //swizzle + if (bits == 16) + print_swizzle_vec8(src->swizzle, src->rep_high, src->rep_low); + else if (bits == 8) + print_swizzle_vec16(src->swizzle, src->rep_high, src->rep_low, override); + else if (bits == 32) 
+ print_swizzle_vec4(src->swizzle, src->rep_high, src->rep_low); + else if (bits == 64) + print_swizzle_vec2(src->swizzle, src->rep_high, src->rep_low); + + /* Since we wrapped with a function-looking thing */ + + if (is_int && int_mod == midgard_int_shift) + printf(") << %d", bits); + else if ((is_int && (int_mod != midgard_int_normal)) + || (!is_int && src->mod & MIDGARD_FLOAT_MOD_ABS)) + printf(")"); +} + +static uint16_t +decode_vector_imm(unsigned src2_reg, unsigned imm) +{ + uint16_t ret; + ret = src2_reg << 11; + ret |= (imm & 0x7) << 8; + ret |= (imm >> 3) & 0xFF; + return ret; +} + +static void +print_immediate(uint16_t imm) +{ + if (is_instruction_int) + printf("#%d", imm); + else + printf("#%g", _mesa_half_to_float(imm)); +} + +static unsigned +print_dest(unsigned reg, midgard_reg_mode mode, midgard_dest_override override) +{ + /* Depending on the mode and override, we determine the type of + * destination addressed. Absent an override, we address just the + * type of the operation itself */ + + unsigned bits = bits_for_mode(mode); + + if (override != midgard_dest_override_none) + bits /= 2; + + print_reg(reg, bits); + + return bits; +} + +static void +print_mask_vec16(uint8_t mask, midgard_dest_override override) +{ + printf("."); + + if (override == midgard_dest_override_none) { + for (unsigned i = 0; i < 8; i++) { + if (mask & (1 << i)) + printf("%c%c", + components[i*2 + 0], + components[i*2 + 1]); + } + } else { + bool upper = (override == midgard_dest_override_upper); + + for (unsigned i = 0; i < 8; i++) { + if (mask & (1 << i)) + printf("%c", components[i + (upper ? 8 : 0)]); + } + } +} + +/* For 16-bit+ masks, we read off from the 8-bit mask field. For 16-bit (vec8), + * it's just one bit per channel, easy peasy. For 32-bit (vec4), it's one bit + * per channel with one duplicate bit in the middle. For 64-bit (vec2), it's + * one-bit per channel with _3_ duplicate bits in the middle. Basically, just + * subdividing the 128-bit word in 16-bit increments. 
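The same rule stated as code: each 16-bit lane of the 128-bit vector gets one mask bit, so a channel is enabled when the bit of its first lane is set. mask_channel_enabled_example is a hypothetical helper mirroring the skip logic used by print_mask further down; 8-bit vec16 masks are handled separately.

static inline bool
mask_channel_enabled_example(uint8_t mask, unsigned bits, unsigned channel)
{
        unsigned lanes_per_channel = bits / 16;   /* 1, 2 or 4 for 16/32/64-bit */

        return mask & (1 << (channel * lanes_per_channel));
}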
For 64-bit, we uppercase + * the mask to make it obvious what happened */ + +static void +print_mask(uint8_t mask, unsigned bits, midgard_dest_override override) +{ + if (bits == 8) { + print_mask_vec16(mask, override); + return; + } + + /* Skip 'complete' masks */ + + if (bits >= 32 && mask == 0xFF) return; + + if (bits == 16) { + if (mask == 0x0F) + return; + else if (mask == 0xF0) { + printf("'"); + return; + } + } + + printf("."); + + unsigned skip = (bits / 16); + bool uppercase = bits > 32; + bool tripped = false; + + for (unsigned i = 0; i < 8; i += skip) { + bool a = (mask & (1 << i)) != 0; + + for (unsigned j = 1; j < skip; ++j) { + bool dupe = (mask & (1 << (i + j))) != 0; + tripped |= (dupe != a); + } + + if (a) { + char c = components[i / skip]; + + if (uppercase) + c = toupper(c); + + printf("%c", c); + } + } + + if (tripped) + printf(" /* %X */", mask); +} + +/* Prints the 4-bit masks found in texture and load/store ops, as opposed to + * the 8-bit masks found in (vector) ALU ops */ + +static void +print_mask_4(unsigned mask) +{ + if (mask == 0xF) return; + + printf("."); + + for (unsigned i = 0; i < 4; ++i) { + bool a = (mask & (1 << i)) != 0; + if (a) + printf("%c", components[i]); + } +} + +static void +print_vector_field(const char *name, uint16_t *words, uint16_t reg_word, + unsigned tabs) +{ + midgard_reg_info *reg_info = (midgard_reg_info *)®_word; + midgard_vector_alu *alu_field = (midgard_vector_alu *) words; + midgard_reg_mode mode = alu_field->reg_mode; + unsigned override = alu_field->dest_override; + + /* For now, prefix instruction names with their unit, until we + * understand how this works on a deeper level */ + printf("%s.", name); + + print_alu_opcode(alu_field->op); + + /* Postfix with the size to disambiguate if necessary */ + char postfix = prefix_for_bits(bits_for_mode(mode)); + bool size_ambiguous = override != midgard_dest_override_none; + + if (size_ambiguous) + printf("%c", postfix ? 
postfix : 'r'); + + /* Print the outmod, if there is one */ + print_outmod(alu_field->outmod, + midgard_is_integer_out_op(alu_field->op)); + + printf(" "); + + /* Mask denoting status of 8-lanes */ + uint8_t mask = alu_field->mask; + + /* First, print the destination */ + unsigned dest_size = + print_dest(reg_info->out_reg, mode, alu_field->dest_override); + + /* Apply the destination override to the mask */ + + if (mode == midgard_reg_mode_32 || mode == midgard_reg_mode_64) { + if (override == midgard_dest_override_lower) + mask &= 0x0F; + else if (override == midgard_dest_override_upper) + mask &= 0xF0; + } else if (mode == midgard_reg_mode_16 + && override == midgard_dest_override_lower) { + /* stub */ + } + + if (override != midgard_dest_override_none) { + bool modeable = (mode != midgard_reg_mode_8); + bool known = override != 0x3; /* Unused value */ + + if (!(modeable && known)) + printf("/* do%d */ ", override); + } + + print_mask(mask, dest_size, override); + + printf(", "); + + bool is_int = midgard_is_integer_op(alu_field->op); + print_vector_src(alu_field->src1, mode, reg_info->src1_reg, override, is_int); + + printf(", "); + + if (reg_info->src2_imm) { + uint16_t imm = decode_vector_imm(reg_info->src2_reg, alu_field->src2 >> 2); + print_immediate(imm); + } else { + print_vector_src(alu_field->src2, mode, + reg_info->src2_reg, override, is_int); + } + + printf("\n"); +} + +static void +print_scalar_src(unsigned src_binary, unsigned reg) +{ + midgard_scalar_alu_src *src = (midgard_scalar_alu_src *)&src_binary; + + if (src->negate) + printf("-"); + + if (src->abs) + printf("abs("); + + print_reg(reg, src->full ? 32 : 16); + + unsigned c = src->component; + + if (src->full) { + assert((c & 1) == 0); + c >>= 1; + } + + printf(".%c", components[c]); + + if (src->abs) + printf(")"); + +} + +static uint16_t +decode_scalar_imm(unsigned src2_reg, unsigned imm) +{ + uint16_t ret; + ret = src2_reg << 11; + ret |= (imm & 3) << 9; + ret |= (imm & 4) << 6; + ret |= (imm & 0x38) << 2; + ret |= imm >> 6; + return ret; +} + +static void +print_scalar_field(const char *name, uint16_t *words, uint16_t reg_word, + unsigned tabs) +{ + midgard_reg_info *reg_info = (midgard_reg_info *)®_word; + midgard_scalar_alu *alu_field = (midgard_scalar_alu *) words; + + if (alu_field->unknown) + printf("scalar ALU unknown bit set\n"); + + printf("%s.", name); + print_alu_opcode(alu_field->op); + print_outmod(alu_field->outmod, + midgard_is_integer_out_op(alu_field->op)); + printf(" "); + + bool full = alu_field->output_full; + print_reg(reg_info->out_reg, full ? 
32 : 16); + unsigned c = alu_field->output_component; + + if (full) { + assert((c & 1) == 0); + c >>= 1; + } + + printf(".%c, ", components[c]); + + print_scalar_src(alu_field->src1, reg_info->src1_reg); + + printf(", "); + + if (reg_info->src2_imm) { + uint16_t imm = decode_scalar_imm(reg_info->src2_reg, + alu_field->src2); + print_immediate(imm); + } else + print_scalar_src(alu_field->src2, reg_info->src2_reg); + + printf("\n"); +} + +static void +print_branch_op(int op) +{ + switch (op) { + case midgard_jmp_writeout_op_branch_uncond: + printf("uncond."); + break; + + case midgard_jmp_writeout_op_branch_cond: + printf("cond."); + break; + + case midgard_jmp_writeout_op_writeout: + printf("write."); + break; + + case midgard_jmp_writeout_op_tilebuffer_pending: + printf("tilebuffer."); + break; + + case midgard_jmp_writeout_op_discard: + printf("discard."); + break; + + default: + printf("unk%d.", op); + break; + } +} + +static void +print_branch_cond(int cond) +{ + switch (cond) { + case midgard_condition_write0: + printf("write0"); + break; + + case midgard_condition_false: + printf("false"); + break; + + case midgard_condition_true: + printf("true"); + break; + + case midgard_condition_always: + printf("always"); + break; + + default: + printf("unk%X", cond); + break; + } +} + +static void +print_compact_branch_writeout_field(uint16_t word) +{ + midgard_jmp_writeout_op op = word & 0x7; + + switch (op) { + case midgard_jmp_writeout_op_branch_uncond: { + midgard_branch_uncond br_uncond; + memcpy((char *) &br_uncond, (char *) &word, sizeof(br_uncond)); + printf("br.uncond "); + + if (br_uncond.unknown != 1) + printf("unknown:%d, ", br_uncond.unknown); + + if (br_uncond.offset >= 0) + printf("+"); + + printf("%d -> ", br_uncond.offset); + print_tag_short(br_uncond.dest_tag); + printf("\n"); + + break; + } + + case midgard_jmp_writeout_op_branch_cond: + case midgard_jmp_writeout_op_writeout: + case midgard_jmp_writeout_op_discard: + default: { + midgard_branch_cond br_cond; + memcpy((char *) &br_cond, (char *) &word, sizeof(br_cond)); + + printf("br."); + + print_branch_op(br_cond.op); + print_branch_cond(br_cond.cond); + + printf(" "); + + if (br_cond.offset >= 0) + printf("+"); + + printf("%d -> ", br_cond.offset); + print_tag_short(br_cond.dest_tag); + printf("\n"); + + break; + } + } +} + +static void +print_extended_branch_writeout_field(uint8_t *words) +{ + midgard_branch_extended br; + memcpy((char *) &br, (char *) words, sizeof(br)); + + printf("brx."); + + print_branch_op(br.op); + + /* Condition repeated 8 times in all known cases. Check this. 
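Conversely, when packing an extended branch, the 2-bit condition would be replicated into all eight slots. replicate_branch_cond_example is hypothetical; it is simply the packing counterpart of the assertion loop that follows.

static inline uint16_t
replicate_branch_cond_example(unsigned cond)
{
        uint16_t field = 0;

        for (unsigned i = 0; i < 16; i += 2)
                field |= (cond & 0x3) << i;

        return field;   /* equivalently, (cond & 0x3) * 0x5555 */
}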
*/ + + unsigned cond = br.cond & 0x3; + + for (unsigned i = 0; i < 16; i += 2) { + assert(((br.cond >> i) & 0x3) == cond); + } + + print_branch_cond(cond); + + if (br.unknown) + printf(".unknown%d", br.unknown); + + printf(" "); + + if (br.offset >= 0) + printf("+"); + + printf("%d -> ", br.offset); + print_tag_short(br.dest_tag); + printf("\n"); +} + +static unsigned +num_alu_fields_enabled(uint32_t control_word) +{ + unsigned ret = 0; + + if ((control_word >> 17) & 1) + ret++; + + if ((control_word >> 19) & 1) + ret++; + + if ((control_word >> 21) & 1) + ret++; + + if ((control_word >> 23) & 1) + ret++; + + if ((control_word >> 25) & 1) + ret++; + + return ret; +} + +static float +float_bitcast(uint32_t integer) +{ + union { + uint32_t i; + float f; + } v; + + v.i = integer; + return v.f; +} + +static void +print_alu_word(uint32_t *words, unsigned num_quad_words, + unsigned tabs) +{ + uint32_t control_word = words[0]; + uint16_t *beginning_ptr = (uint16_t *)(words + 1); + unsigned num_fields = num_alu_fields_enabled(control_word); + uint16_t *word_ptr = beginning_ptr + num_fields; + unsigned num_words = 2 + num_fields; + + if ((control_word >> 16) & 1) + printf("unknown bit 16 enabled\n"); + + if ((control_word >> 17) & 1) { + print_vector_field("vmul", word_ptr, *beginning_ptr, tabs); + beginning_ptr += 1; + word_ptr += 3; + num_words += 3; + } + + if ((control_word >> 18) & 1) + printf("unknown bit 18 enabled\n"); + + if ((control_word >> 19) & 1) { + print_scalar_field("sadd", word_ptr, *beginning_ptr, tabs); + beginning_ptr += 1; + word_ptr += 2; + num_words += 2; + } + + if ((control_word >> 20) & 1) + printf("unknown bit 20 enabled\n"); + + if ((control_word >> 21) & 1) { + print_vector_field("vadd", word_ptr, *beginning_ptr, tabs); + beginning_ptr += 1; + word_ptr += 3; + num_words += 3; + } + + if ((control_word >> 22) & 1) + printf("unknown bit 22 enabled\n"); + + if ((control_word >> 23) & 1) { + print_scalar_field("smul", word_ptr, *beginning_ptr, tabs); + beginning_ptr += 1; + word_ptr += 2; + num_words += 2; + } + + if ((control_word >> 24) & 1) + printf("unknown bit 24 enabled\n"); + + if ((control_word >> 25) & 1) { + print_vector_field("lut", word_ptr, *beginning_ptr, tabs); + beginning_ptr += 1; + word_ptr += 3; + num_words += 3; + } + + if ((control_word >> 26) & 1) { + print_compact_branch_writeout_field(*word_ptr); + word_ptr += 1; + num_words += 1; + } + + if ((control_word >> 27) & 1) { + print_extended_branch_writeout_field((uint8_t *) word_ptr); + word_ptr += 3; + num_words += 3; + } + + if (num_quad_words > (num_words + 7) / 8) { + assert(num_quad_words == (num_words + 15) / 8); + //Assume that the extra quadword is constants + void *consts = words + (4 * num_quad_words - 4); + + if (is_embedded_constant_int) { + if (is_embedded_constant_half) { + int16_t *sconsts = (int16_t *) consts; + printf("sconstants %d, %d, %d, %d\n", + sconsts[0], + sconsts[1], + sconsts[2], + sconsts[3]); + } else { + int32_t *iconsts = (int32_t *) consts; + printf("iconstants %d, %d, %d, %d\n", + iconsts[0], + iconsts[1], + iconsts[2], + iconsts[3]); + } + } else { + if (is_embedded_constant_half) { + uint16_t *hconsts = (uint16_t *) consts; + printf("hconstants %g, %g, %g, %g\n", + _mesa_half_to_float(hconsts[0]), + _mesa_half_to_float(hconsts[1]), + _mesa_half_to_float(hconsts[2]), + _mesa_half_to_float(hconsts[3])); + } else { + uint32_t *fconsts = (uint32_t *) consts; + printf("fconstants %g, %g, %g, %g\n", + float_bitcast(fconsts[0]), + float_bitcast(fconsts[1]), + 
float_bitcast(fconsts[2]), + float_bitcast(fconsts[3])); + } + + } + } +} + +static void +print_varying_parameters(midgard_load_store_word *word) +{ + midgard_varying_parameter param; + unsigned v = word->varying_parameters; + memcpy(¶m, &v, sizeof(param)); + + if (param.is_varying) { + /* If a varying, there are qualifiers */ + if (param.flat) + printf(".flat"); + + if (param.interpolation != midgard_interp_default) { + if (param.interpolation == midgard_interp_centroid) + printf(".centroid"); + else + printf(".interp%d", param.interpolation); + } + + if (param.modifier != midgard_varying_mod_none) { + if (param.modifier == midgard_varying_mod_perspective_w) + printf(".perspectivew"); + else if (param.modifier == midgard_varying_mod_perspective_z) + printf(".perspectivez"); + else + printf(".mod%d", param.modifier); + } + } else if (param.flat || param.interpolation || param.modifier) { + printf(" /* is_varying not set but varying metadata attached */"); + } + + if (param.zero0 || param.zero1 || param.zero2) + printf(" /* zero tripped, %d %d %d */ ", param.zero0, param.zero1, param.zero2); +} + +static bool +is_op_varying(unsigned op) +{ + switch (op) { + case midgard_op_st_vary_16: + case midgard_op_st_vary_32: + case midgard_op_ld_vary_16: + case midgard_op_ld_vary_32: + return true; + } + + return false; +} + +static void +print_load_store_instr(uint64_t data, + unsigned tabs) +{ + midgard_load_store_word *word = (midgard_load_store_word *) &data; + + print_ld_st_opcode(word->op); + + if (is_op_varying(word->op)) + print_varying_parameters(word); + + printf(" r%d", word->reg); + print_mask_4(word->mask); + + int address = word->address; + + if (word->op == midgard_op_ld_uniform_32) { + /* Uniforms use their own addressing scheme */ + + int lo = word->varying_parameters >> 7; + int hi = word->address; + + /* TODO: Combine fields logically */ + address = (hi << 3) | lo; + } + + printf(", %d", address); + + print_swizzle_vec4(word->swizzle, false, false); + + printf(", 0x%X /* %X */\n", word->unknown, word->varying_parameters); +} + +static void +print_load_store_word(uint32_t *word, unsigned tabs) +{ + midgard_load_store *load_store = (midgard_load_store *) word; + + if (load_store->word1 != 3) { + print_load_store_instr(load_store->word1, tabs); + } + + if (load_store->word2 != 3) { + print_load_store_instr(load_store->word2, tabs); + } +} + +static void +print_texture_reg(bool full, bool select, bool upper) +{ + if (full) + printf("r%d", REG_TEX_BASE + select); + else + printf("hr%d", (REG_TEX_BASE + select) * 2 + upper); + + if (full && upper) + printf("// error: out full / upper mutually exclusive\n"); + +} + +static void +print_texture_reg_triple(unsigned triple) +{ + bool full = triple & 1; + bool select = triple & 2; + bool upper = triple & 4; + + print_texture_reg(full, select, upper); +} + +static void +print_texture_format(int format) +{ + /* Act like a modifier */ + printf("."); + + switch (format) { + DEFINE_CASE(MALI_TEX_1D, "1d"); + DEFINE_CASE(MALI_TEX_2D, "2d"); + DEFINE_CASE(MALI_TEX_3D, "3d"); + DEFINE_CASE(MALI_TEX_CUBE, "cube"); + + default: + unreachable("Bad format"); + } +} + +static void +print_texture_op(unsigned op, bool gather) +{ + /* Act like a bare name, like ESSL functions */ + + if (gather) { + printf("textureGather"); + + unsigned component = op >> 4; + unsigned bottom = op & 0xF; + + if (bottom != 0x2) + printf("_unk%d", bottom); + + printf(".%c", components[component]); + return; + } + + switch (op) { + DEFINE_CASE(TEXTURE_OP_NORMAL, "texture"); + 
DEFINE_CASE(TEXTURE_OP_LOD, "textureLod"); + DEFINE_CASE(TEXTURE_OP_TEXEL_FETCH, "texelFetch"); + + default: + printf("tex_%d", op); + break; + } +} + +static bool +texture_op_takes_bias(unsigned op) +{ + return op == TEXTURE_OP_NORMAL; +} + +static char +sampler_type_name(enum mali_sampler_type t) +{ + switch (t) { + case MALI_SAMPLER_FLOAT: + return 'f'; + case MALI_SAMPLER_UNSIGNED: + return 'u'; + case MALI_SAMPLER_SIGNED: + return 'i'; + default: + return '?'; + } + +} + +#undef DEFINE_CASE + +static void +print_texture_word(uint32_t *word, unsigned tabs) +{ + midgard_texture_word *texture = (midgard_texture_word *) word; + + /* Broad category of texture operation in question */ + print_texture_op(texture->op, texture->is_gather); + + /* Specific format in question */ + print_texture_format(texture->format); + + assert(texture->zero == 0); + + /* Instruction "modifiers" parallel the ALU instructions. */ + + if (texture->shadow) + printf(".shadow"); + + if (texture->cont) + printf(".cont"); + + if (texture->last) + printf(".last"); + + printf(" "); + + print_texture_reg(texture->out_full, texture->out_reg_select, texture->out_upper); + print_mask_4(texture->mask); + printf(", "); + + printf("texture%d, ", texture->texture_handle); + + /* Print the type, GL style */ + printf("%c", sampler_type_name(texture->sampler_type)); + printf("sampler%d", texture->sampler_handle); + print_swizzle_vec4(texture->swizzle, false, false); + printf(", "); + + print_texture_reg(texture->in_reg_full, texture->in_reg_select, texture->in_reg_upper); + print_swizzle_vec4(texture->in_reg_swizzle, false, false); + + /* There is *always* an offset attached. Of + * course, that offset is just immediate #0 for a + * GLES call that doesn't take an offset. If there + * is a non-negative non-zero offset, this is + * specified in immediate offset mode, with the + * values in the offset_* fields as immediates. If + * this is a negative offset, we instead switch to + * a register offset mode, where the offset_* + * fields become register triplets */ + + if (texture->offset_register) { + printf(" + "); + print_texture_reg_triple(texture->offset_x); + + /* The less questions you ask, the better. */ + + unsigned swizzle_lo, swizzle_hi; + unsigned orig_y = texture->offset_y; + unsigned orig_z = texture->offset_z; + + memcpy(&swizzle_lo, &orig_y, sizeof(unsigned)); + memcpy(&swizzle_hi, &orig_z, sizeof(unsigned)); + + /* Duplicate hi swizzle over */ + assert(swizzle_hi < 4); + swizzle_hi = (swizzle_hi << 2) | swizzle_hi; + + unsigned swiz = (swizzle_lo << 4) | swizzle_hi; + unsigned reversed = util_bitreverse(swiz) >> 24; + print_swizzle_vec4(reversed, false, false); + + printf(", "); + } else if (texture->offset_x || texture->offset_y || texture->offset_z) { + /* Only select ops allow negative immediate offsets, verify */ + + bool neg_x = texture->offset_x < 0; + bool neg_y = texture->offset_y < 0; + bool neg_z = texture->offset_z < 0; + bool any_neg = neg_x || neg_y || neg_z; + + if (any_neg && texture->op != TEXTURE_OP_TEXEL_FETCH) + printf("/* invalid negative */ "); + + /* Regardless, just print the immediate offset */ + + printf(" + <%d, %d, %d>, ", + texture->offset_x, + texture->offset_y, + texture->offset_z); + } else { + printf(", "); + } + + char lod_operand = texture_op_takes_bias(texture->op) ? 
'+' : '='; + + if (texture->lod_register) { + midgard_tex_register_select sel; + uint8_t raw = texture->bias; + memcpy(&sel, &raw, sizeof(raw)); + + unsigned c = (sel.component_hi << 1) | sel.component_lo; + + printf("lod %c ", lod_operand); + print_texture_reg(sel.full, sel.select, sel.upper); + printf(".%c, ", components[c]); + + if (!sel.component_hi) + printf(" /* gradient? */"); + + if (texture->bias_int) + printf(" /* bias_int = 0x%X */", texture->bias_int); + + if (sel.zero) + printf(" /* sel.zero = 0x%X */", sel.zero); + } else if (texture->op == TEXTURE_OP_TEXEL_FETCH) { + /* For texel fetch, the int LOD is in the fractional place and + * there is no fraction / possibility of bias. We *always* have + * an explicit LOD, even if it's zero. */ + + if (texture->bias_int) + printf(" /* bias_int = 0x%X */ ", texture->bias_int); + + printf("lod = %d, ", texture->bias); + } else if (texture->bias || texture->bias_int) { + signed bias_int = texture->bias_int; + float bias_frac = texture->bias / 256.0f; + float bias = bias_int + bias_frac; + + bool is_bias = texture_op_takes_bias(texture->op); + char sign = (bias >= 0.0) ? '+' : '-'; + char operand = is_bias ? sign : '='; + + printf("lod %c %f, ", operand, fabsf(bias)); + } + + printf("\n"); + + /* While not zero in general, for these simple instructions the + * following unknowns are zero, so we don't include them */ + + if (texture->unknown2 || + texture->unknown4 || + texture->unknownA || + texture->unknown8) { + printf("// unknown2 = 0x%x\n", texture->unknown2); + printf("// unknown4 = 0x%x\n", texture->unknown4); + printf("// unknownA = 0x%x\n", texture->unknownA); + printf("// unknown8 = 0x%x\n", texture->unknown8); + } +} + +void +disassemble_midgard(uint8_t *code, size_t size) +{ + uint32_t *words = (uint32_t *) code; + unsigned num_words = size / 4; + int tabs = 0; + + bool prefetch_flag = false; + + unsigned i = 0; + + while (i < num_words) { + unsigned tag = words[i] & 0xF; + unsigned num_quad_words = midgard_word_size[tag]; + + switch (midgard_word_types[tag]) { + case midgard_word_type_texture: + print_texture_word(&words[i], tabs); + break; + + case midgard_word_type_load_store: + print_load_store_word(&words[i], tabs); + break; + + case midgard_word_type_alu: + print_alu_word(&words[i], num_quad_words, tabs); + + if (prefetch_flag) + return; + + /* Reset word static analysis state */ + is_embedded_constant_half = false; + is_embedded_constant_int = false; + + break; + + default: + printf("Unknown word type %u:\n", words[i] & 0xF); + num_quad_words = 1; + print_quad_word(&words[i], tabs); + printf("\n"); + break; + } + + printf("\n"); + + unsigned next = (words[i] & 0xF0) >> 4; + + i += 4 * num_quad_words; + + /* Break based on instruction prefetch flag */ + + if (i < num_words && next == 1) { + prefetch_flag = true; + + if (midgard_word_types[words[i] & 0xF] != midgard_word_type_alu) + return; + } + } + + return; +} diff --git a/src/panfrost/midgard/disassemble.h b/src/panfrost/midgard/disassemble.h new file mode 100644 index 00000000000..ab1837c201e --- /dev/null +++ b/src/panfrost/midgard/disassemble.h @@ -0,0 +1,2 @@ +#include <stddef.h> +void disassemble_midgard(uint8_t *code, size_t size); diff --git a/src/panfrost/midgard/helpers.h b/src/panfrost/midgard/helpers.h new file mode 100644 index 00000000000..ef854dc60c1 --- /dev/null +++ b/src/panfrost/midgard/helpers.h @@ -0,0 +1,282 @@ +/* Copyright (c) 2018-2019 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person 
obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef __MDG_HELPERS_H +#define __MDG_HELPERS_H + +#include "util/macros.h" +#include <string.h> + +#define OP_IS_STORE_VARY(op) (\ + op == midgard_op_st_vary_16 || \ + op == midgard_op_st_vary_32 \ + ) + +#define OP_IS_STORE(op) (\ + OP_IS_STORE_VARY(op) || \ + op == midgard_op_st_cubemap_coords \ + ) + +#define OP_IS_MOVE(op) ( \ + op == midgard_alu_op_fmov || \ + op == midgard_alu_op_imov \ + ) + +/* ALU control words are single bit fields with a lot of space */ + +#define ALU_ENAB_VEC_MUL (1 << 17) +#define ALU_ENAB_SCAL_ADD (1 << 19) +#define ALU_ENAB_VEC_ADD (1 << 21) +#define ALU_ENAB_SCAL_MUL (1 << 23) +#define ALU_ENAB_VEC_LUT (1 << 25) +#define ALU_ENAB_BR_COMPACT (1 << 26) +#define ALU_ENAB_BRANCH (1 << 27) + +/* Other opcode properties that don't conflict with the ALU_ENABs, non-ISA */ + +/* Denotes an opcode that takes a vector input with a fixed-number of + * channels, but outputs to only a single output channel, like dot products. + * For these, to determine the effective mask, this quirk can be set. We have + * an intentional off-by-one (a la MALI_POSITIVE), since 0-channel makes no + * sense but we need to fit 4 channels in 2-bits. Similarly, 1-channel doesn't + * make sense (since then why are we quirked?), so that corresponds to "no + * count set" */ + +#define OP_CHANNEL_COUNT(c) ((c - 1) << 0) +#define GET_CHANNEL_COUNT(c) ((c & (0x3 << 0)) ? ((c & (0x3 << 0)) + 1) : 0) + +/* For instructions that take a single argument, normally the first argument + * slot is used for the argument and the second slot is a dummy #0 constant. + * However, there are exceptions: instructions like fmov store their argument + * in the _second_ slot and store a dummy r24 in the first slot, designated by + * QUIRK_FLIPPED_R24 */ + +#define QUIRK_FLIPPED_R24 (1 << 2) + +/* Is the op commutative? */ +#define OP_COMMUTES (1 << 3) + +/* Does the op convert types between int- and float- space (i2f/f2u/etc) */ +#define OP_TYPE_CONVERT (1 << 4) + +/* Vector-independant shorthands for the above; these numbers are arbitrary and + * not from the ISA. 
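As a quick sanity check of the intentional off-by-one in OP_CHANNEL_COUNT / GET_CHANNEL_COUNT, the two macros can be exercised on their own; this is only an illustrative sketch, with the macros copied from the header so it compiles standalone.

#include <assert.h>

#define OP_CHANNEL_COUNT(c) ((c - 1) << 0)
#define GET_CHANNEL_COUNT(c) ((c & (0x3 << 0)) ? ((c & (0x3 << 0)) + 1) : 0)

int main(void)
{
        /* 2-, 3- and 4-channel ops survive the encode/decode round trip */
        for (unsigned c = 2; c <= 4; ++c)
                assert(GET_CHANNEL_COUNT(OP_CHANNEL_COUNT(c)) == c);

        /* A 1-channel count encodes as 0, i.e. "no count set" */
        assert(GET_CHANNEL_COUNT(OP_CHANNEL_COUNT(1)) == 0);

        return 0;
}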
Convert to the above with unit_enum_to_midgard */ + +#define UNIT_MUL 0 +#define UNIT_ADD 1 +#define UNIT_LUT 2 + +/* 4-bit type tags */ + +#define TAG_TEXTURE_4_VTX 0x2 +#define TAG_TEXTURE_4 0x3 +#define TAG_LOAD_STORE_4 0x5 +#define TAG_ALU_4 0x8 +#define TAG_ALU_8 0x9 +#define TAG_ALU_12 0xA +#define TAG_ALU_16 0xB + +static inline int +quadword_size(int tag) +{ + switch (tag) { + case TAG_ALU_4: + case TAG_LOAD_STORE_4: + case TAG_TEXTURE_4: + case TAG_TEXTURE_4_VTX: + return 1; + case TAG_ALU_8: + return 2; + case TAG_ALU_12: + return 3; + case TAG_ALU_16: + return 4; + default: + unreachable("Unknown tag"); + } +} + +#define IS_ALU(tag) (tag == TAG_ALU_4 || tag == TAG_ALU_8 || \ + tag == TAG_ALU_12 || tag == TAG_ALU_16) + +/* Special register aliases */ + +#define MAX_WORK_REGISTERS 16 + +/* Uniforms are begin at (REGISTER_UNIFORMS - uniform_count) */ +#define REGISTER_UNIFORMS 24 + +#define REGISTER_UNUSED 24 +#define REGISTER_CONSTANT 26 +#define REGISTER_VARYING_BASE 26 +#define REGISTER_OFFSET 27 +#define REGISTER_TEXTURE_BASE 28 +#define REGISTER_SELECT 31 + +/* SSA helper aliases to mimic the registers. UNUSED_0 encoded as an inline + * constant. UNUSED_1 encoded as REGISTER_UNUSED */ + +#define SSA_UNUSED_0 0 +#define SSA_UNUSED_1 -2 + +#define SSA_FIXED_SHIFT 24 +#define SSA_FIXED_REGISTER(reg) ((1 + reg) << SSA_FIXED_SHIFT) +#define SSA_REG_FROM_FIXED(reg) ((reg >> SSA_FIXED_SHIFT) - 1) +#define SSA_FIXED_MINIMUM SSA_FIXED_REGISTER(0) + +/* Swizzle support */ + +#define SWIZZLE(A, B, C, D) ((D << 6) | (C << 4) | (B << 2) | (A << 0)) +#define SWIZZLE_FROM_ARRAY(r) SWIZZLE(r[0], r[1], r[2], r[3]) +#define COMPONENT_X 0x0 +#define COMPONENT_Y 0x1 +#define COMPONENT_Z 0x2 +#define COMPONENT_W 0x3 + +#define SWIZZLE_XXXX SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X) +#define SWIZZLE_XYXX SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_X) +#define SWIZZLE_XYZX SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_X) +#define SWIZZLE_XYZW SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W) +#define SWIZZLE_XYXZ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_Z) +#define SWIZZLE_XYZZ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_Z) +#define SWIZZLE_WWWW SWIZZLE(COMPONENT_W, COMPONENT_W, COMPONENT_W, COMPONENT_W) + +static inline unsigned +swizzle_of(unsigned comp) +{ + switch (comp) { + case 1: + return SWIZZLE_XXXX; + case 2: + return SWIZZLE_XYXX; + case 3: + return SWIZZLE_XYZX; + case 4: + return SWIZZLE_XYZW; + default: + unreachable("Invalid component count"); + } +} + +static inline unsigned +mask_of(unsigned nr_comp) +{ + return (1 << nr_comp) - 1; +} + + +/* See ISA notes */ + +#define LDST_NOP (3) + +/* There are five ALU units: VMUL, VADD, SMUL, SADD, LUT. A given opcode is + * implemented on some subset of these units (or occassionally all of them). + * This table encodes a bit mask of valid units for each opcode, so the + * scheduler can figure where to plonk the instruction. 
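One way to see how the fixed-register aliases above behave is a standalone round-trip check; the macros are copied from this header so the sketch is self-contained.

#include <assert.h>

#define SSA_FIXED_SHIFT 24
#define SSA_FIXED_REGISTER(reg) ((1 + reg) << SSA_FIXED_SHIFT)
#define SSA_REG_FROM_FIXED(reg) ((reg >> SSA_FIXED_SHIFT) - 1)
#define SSA_FIXED_MINIMUM SSA_FIXED_REGISTER(0)

int main(void)
{
        /* Hardware registers r0..r31 round-trip through the encoding */
        for (int r = 0; r < 32; ++r)
                assert(SSA_REG_FROM_FIXED(SSA_FIXED_REGISTER(r)) == r);

        /* Anything at or above SSA_FIXED_MINIMUM denotes a fixed register */
        assert(SSA_FIXED_MINIMUM == (1 << 24));

        return 0;
}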
*/ + +/* Shorthands for each unit */ +#define UNIT_VMUL ALU_ENAB_VEC_MUL +#define UNIT_SADD ALU_ENAB_SCAL_ADD +#define UNIT_VADD ALU_ENAB_VEC_ADD +#define UNIT_SMUL ALU_ENAB_SCAL_MUL +#define UNIT_VLUT ALU_ENAB_VEC_LUT + +/* Shorthands for usual combinations of units */ + +#define UNITS_MUL (UNIT_VMUL | UNIT_SMUL) +#define UNITS_ADD (UNIT_VADD | UNIT_SADD) +#define UNITS_MOST (UNITS_MUL | UNITS_ADD) +#define UNITS_ALL (UNITS_MOST | UNIT_VLUT) +#define UNITS_SCALAR (UNIT_SADD | UNIT_SMUL) +#define UNITS_VECTOR (UNIT_VMUL | UNIT_VADD) +#define UNITS_ANY_VECTOR (UNITS_VECTOR | UNIT_VLUT) + +struct mir_op_props { + const char *name; + unsigned props; +}; + +/* This file is common, so don't define the tables themselves. #include + * midgard_op.h if you need that, or edit midgard_ops.c directly */ + +/* Duplicate bits to convert a 4-bit writemask to duplicated 8-bit format, + * which is used for 32-bit vector units */ + +static inline unsigned +expand_writemask_32(unsigned mask) +{ + unsigned o = 0; + + for (int i = 0; i < 4; ++i) + if (mask & (1 << i)) + o |= (3 << (2 * i)); + + return o; +} + +/* Coerce structs to integer */ + +static inline unsigned +vector_alu_srco_unsigned(midgard_vector_alu_src src) +{ + unsigned u; + memcpy(&u, &src, sizeof(src)); + return u; +} + +static inline midgard_vector_alu_src +vector_alu_from_unsigned(unsigned u) +{ + midgard_vector_alu_src s; + memcpy(&s, &u, sizeof(s)); + return s; +} + +/* Composes two swizzles */ +static inline unsigned +pan_compose_swizzle(unsigned left, unsigned right) +{ + unsigned out = 0; + + for (unsigned c = 0; c < 4; ++c) { + unsigned s = (left >> (2*c)) & 0x3; + unsigned q = (right >> (2*s)) & 0x3; + + out |= (q << (2*c)); + } + + return out; +} + +/* Applies a swizzle to an ALU source */ + +static inline unsigned +vector_alu_apply_swizzle(unsigned src, unsigned swizzle) +{ + midgard_vector_alu_src s = + vector_alu_from_unsigned(src); + + s.swizzle = pan_compose_swizzle(s.swizzle, swizzle); + + return vector_alu_srco_unsigned(s); +} + +#endif diff --git a/src/panfrost/midgard/meson.build b/src/panfrost/midgard/meson.build new file mode 100644 index 00000000000..cbe26004e2d --- /dev/null +++ b/src/panfrost/midgard/meson.build @@ -0,0 +1,63 @@ +# Copyright © 2018 Rob Clark +# Copyright © 2019 Collabora + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
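To make the swizzle composition above concrete: for each output channel, the left swizzle selects which component of the right swizzle to read. A minimal standalone sketch (the function body is copied from the header; main() is just an illustrative harness):

#include <assert.h>

#define SWIZZLE(A, B, C, D) ((D << 6) | (C << 4) | (B << 2) | (A << 0))

static inline unsigned
pan_compose_swizzle(unsigned left, unsigned right)
{
        unsigned out = 0;

        for (unsigned c = 0; c < 4; ++c) {
                unsigned s = (left >> (2*c)) & 0x3;
                unsigned q = (right >> (2*s)) & 0x3;

                out |= (q << (2*c));
        }

        return out;
}

int main(void)
{
        unsigned xyzw = SWIZZLE(0, 1, 2, 3); /* identity */
        unsigned yzwx = SWIZZLE(1, 2, 3, 0);
        unsigned xxxx = SWIZZLE(0, 0, 0, 0);

        /* Identity on the left leaves the right swizzle untouched */
        assert(pan_compose_swizzle(xyzw, yzwx) == yzwx);

        /* A broadcast on the left replicates right's .x (here, component Y) */
        assert(pan_compose_swizzle(xxxx, yzwx) == SWIZZLE(1, 1, 1, 1));

        return 0;
}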
+ +libpanfrost_midgard_files = files( + 'midgard_compile.c', + 'mir.c', + 'midgard_print.c', + 'midgard_schedule.c', + 'midgard_emit.c', + 'midgard_ra.c', + 'midgard_ra_pipeline.c', + 'midgard_liveness.c', + 'midgard_ops.c', + 'cppwrap.cpp', + 'disassemble.c', +) + +midgard_nir_algebraic_c = custom_target( + 'midgard_nir_algebraic.c', + input : 'midgard_nir_algebraic.py', + output : 'midgard_nir_algebraic.c', + command : [ + prog_python, '@INPUT@', + '-p', join_paths(meson.source_root(), 'src/compiler/nir/'), + ], + capture : true, + depend_files : nir_algebraic_py, +) + +libpanfrost_midgard = static_library( + 'panfrost_midgard', + [libpanfrost_midgard_files, midgard_nir_algebraic_c], + include_directories : [ + inc_common, + inc_include, + inc_src, + inc_panfrost_hw, + ], + dependencies: [ + idep_nir + ], + c_args : [c_vis_args, no_override_init_args], + cpp_args : [cpp_vis_args], + build_by_default : false, +) diff --git a/src/panfrost/midgard/midgard-parse.h b/src/panfrost/midgard/midgard-parse.h new file mode 100644 index 00000000000..5d134839406 --- /dev/null +++ b/src/panfrost/midgard/midgard-parse.h @@ -0,0 +1,70 @@ +/* Author(s): + * Connor Abbott + * Alyssa Rosenzweig + * + * Copyright (c) 2013 Connor Abbott ([email protected]) + * Copyright (c) 2018 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef __midgard_parse_h__ +#define __midgard_parse_h__ + +/* Additional metadata for parsing Midgard binaries, not needed for compilation */ + +static midgard_word_type midgard_word_types[16] = { + midgard_word_type_unknown, /* 0x0 */ + midgard_word_type_unknown, /* 0x1 */ + midgard_word_type_texture, /* 0x2 */ + midgard_word_type_texture, /* 0x3 */ + midgard_word_type_unknown, /* 0x4 */ + midgard_word_type_load_store, /* 0x5 */ + midgard_word_type_unknown, /* 0x6 */ + midgard_word_type_unknown, /* 0x7 */ + midgard_word_type_alu, /* 0x8 */ + midgard_word_type_alu, /* 0x9 */ + midgard_word_type_alu, /* 0xA */ + midgard_word_type_alu, /* 0xB */ + midgard_word_type_alu, /* 0xC */ + midgard_word_type_alu, /* 0xD */ + midgard_word_type_alu, /* 0xE */ + midgard_word_type_alu, /* 0xF */ +}; + +static unsigned midgard_word_size[16] = { + 0, /* 0x0 */ + 0, /* 0x1 */ + 1, /* 0x2 */ + 1, /* 0x3 */ + 0, /* 0x4 */ + 1, /* 0x5 */ + 0, /* 0x6 */ + 0, /* 0x7 */ + 1, /* 0x8 */ + 2, /* 0x9 */ + 3, /* 0xA */ + 4, /* 0xB */ + 1, /* 0xC */ + 2, /* 0xD */ + 3, /* 0xE */ + 4, /* 0xF */ +}; + +#endif diff --git a/src/panfrost/midgard/midgard.h b/src/panfrost/midgard/midgard.h new file mode 100644 index 00000000000..5953214c599 --- /dev/null +++ b/src/panfrost/midgard/midgard.h @@ -0,0 +1,646 @@ +/* Author(s): + * Connor Abbott + * Alyssa Rosenzweig + * + * Copyright (c) 2013 Connor Abbott ([email protected]) + * Copyright (c) 2018 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
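These two tables drive the outer loop of the disassembler shown earlier: the low nibble of a bundle's first 32-bit word selects its type and its length in 16-byte quadwords. The following standalone sketch (size table copied from above; the instruction words are synthetic placeholders, only the tag nibble is meaningful) just reports bundle boundaries:

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

static unsigned midgard_word_size[16] = {
        0, 0, 1, 1, 0, 1, 0, 0,
        1, 2, 3, 4, 1, 2, 3, 4,
};

static void
walk_bundles(const uint32_t *words, size_t num_words)
{
        size_t i = 0;

        while (i < num_words) {
                unsigned tag = words[i] & 0xF;
                unsigned quads = midgard_word_size[tag];

                /* Unknown tag: skip a single quadword, as the real
                 * disassembler does */
                if (!quads)
                        quads = 1;

                printf("bundle at word %zu: tag 0x%X, %u quadword(s)\n",
                       i, tag, quads);

                i += 4 * quads; /* a quadword is four 32-bit words */
        }
}

int main(void)
{
        /* Two synthetic bundles: a load/store word (tag 0x5) followed by a
         * one-quadword ALU word (tag 0x8) */
        uint32_t words[8] = { 0x5, 0, 0, 0, 0x8, 0, 0, 0 };

        walk_bundles(words, 8);
        return 0;
}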
+ */ + +#ifndef __midgard_h__ +#define __midgard_h__ + +#include <stdint.h> +#include <stdbool.h> +#include "panfrost-job.h" + +#define MIDGARD_DBG_MSGS 0x0001 +#define MIDGARD_DBG_SHADERS 0x0002 +#define MIDGARD_DBG_SHADERDB 0x0004 + +extern int midgard_debug; + +typedef enum { + midgard_word_type_alu, + midgard_word_type_load_store, + midgard_word_type_texture, + midgard_word_type_unknown +} midgard_word_type; + +typedef enum { + midgard_alu_vmul, + midgard_alu_sadd, + midgard_alu_smul, + midgard_alu_vadd, + midgard_alu_lut +} midgard_alu; + +/* + * ALU words + */ + +typedef enum { + midgard_alu_op_fadd = 0x10, + midgard_alu_op_fmul = 0x14, + + midgard_alu_op_fmin = 0x28, + midgard_alu_op_fmax = 0x2C, + + midgard_alu_op_fmov = 0x30, /* fmov_rte */ + midgard_alu_op_fmov_rtz = 0x31, + midgard_alu_op_fmov_rtn = 0x32, + midgard_alu_op_fmov_rtp = 0x33, + midgard_alu_op_froundeven = 0x34, + midgard_alu_op_ftrunc = 0x35, + midgard_alu_op_ffloor = 0x36, + midgard_alu_op_fceil = 0x37, + midgard_alu_op_ffma = 0x38, + midgard_alu_op_fdot3 = 0x3C, + midgard_alu_op_fdot3r = 0x3D, + midgard_alu_op_fdot4 = 0x3E, + midgard_alu_op_freduce = 0x3F, + + midgard_alu_op_iadd = 0x40, + midgard_alu_op_ishladd = 0x41, + midgard_alu_op_isub = 0x46, + midgard_alu_op_iaddsat = 0x48, + midgard_alu_op_uaddsat = 0x49, + midgard_alu_op_isubsat = 0x4E, + midgard_alu_op_usubsat = 0x4F, + + midgard_alu_op_imul = 0x58, + + midgard_alu_op_imin = 0x60, + midgard_alu_op_umin = 0x61, + midgard_alu_op_imax = 0x62, + midgard_alu_op_umax = 0x63, + midgard_alu_op_ihadd = 0x64, + midgard_alu_op_uhadd = 0x65, + midgard_alu_op_irhadd = 0x66, + midgard_alu_op_urhadd = 0x67, + midgard_alu_op_iasr = 0x68, + midgard_alu_op_ilsr = 0x69, + midgard_alu_op_ishl = 0x6E, + + midgard_alu_op_iand = 0x70, + midgard_alu_op_ior = 0x71, + midgard_alu_op_inand = 0x72, /* ~(a & b), for inot let a = b */ + midgard_alu_op_inor = 0x73, /* ~(a | b) */ + midgard_alu_op_iandnot = 0x74, /* (a & ~b), used for not/b2f */ + midgard_alu_op_iornot = 0x75, /* (a | ~b) */ + midgard_alu_op_ixor = 0x76, + midgard_alu_op_inxor = 0x77, /* ~(a & b) */ + midgard_alu_op_iclz = 0x78, /* Number of zeroes on left */ + midgard_alu_op_ibitcount8 = 0x7A, /* Counts bits in 8-bit increments */ + midgard_alu_op_imov = 0x7B, + midgard_alu_op_iabsdiff = 0x7C, + midgard_alu_op_uabsdiff = 0x7D, + midgard_alu_op_ichoose = 0x7E, /* vector, component number - dupe for shuffle() */ + + midgard_alu_op_feq = 0x80, + midgard_alu_op_fne = 0x81, + midgard_alu_op_flt = 0x82, + midgard_alu_op_fle = 0x83, + midgard_alu_op_fball_eq = 0x88, + midgard_alu_op_bball_eq = 0x89, + midgard_alu_op_fball_lt = 0x8A, /* all(lessThan(.., ..)) */ + midgard_alu_op_fball_lte = 0x8B, /* all(lessThanEqual(.., ..)) */ + + midgard_alu_op_bbany_neq = 0x90, /* used for bvec4(1) */ + midgard_alu_op_fbany_neq = 0x91, /* bvec4(0) also */ + midgard_alu_op_fbany_lt = 0x92, /* any(lessThan(.., ..)) */ + midgard_alu_op_fbany_lte = 0x93, /* any(lessThanEqual(.., ..)) */ + + midgard_alu_op_f2i_rte = 0x98, + midgard_alu_op_f2i_rtz = 0x99, + midgard_alu_op_f2i_rtn = 0x9A, + midgard_alu_op_f2i_rtp = 0x9B, + midgard_alu_op_f2u_rte = 0x9C, + midgard_alu_op_f2u_rtz = 0x9D, + midgard_alu_op_f2u_rtn = 0x9E, + midgard_alu_op_f2u_rtp = 0x9F, + + midgard_alu_op_ieq = 0xA0, + midgard_alu_op_ine = 0xA1, + midgard_alu_op_ult = 0xA2, + midgard_alu_op_ule = 0xA3, + midgard_alu_op_ilt = 0xA4, + midgard_alu_op_ile = 0xA5, + midgard_alu_op_iball_eq = 0xA8, + midgard_alu_op_iball_neq = 0xA9, + midgard_alu_op_uball_lt = 0xAA, + 
midgard_alu_op_uball_lte = 0xAB, + midgard_alu_op_iball_lt = 0xAC, + midgard_alu_op_iball_lte = 0xAD, + + midgard_alu_op_ibany_eq = 0xB0, + midgard_alu_op_ibany_neq = 0xB1, + midgard_alu_op_ubany_lt = 0xB2, + midgard_alu_op_ubany_lte = 0xB3, + midgard_alu_op_ibany_lt = 0xB4, /* any(lessThan(.., ..)) */ + midgard_alu_op_ibany_lte = 0xB5, /* any(lessThanEqual(.., ..)) */ + midgard_alu_op_i2f_rte = 0xB8, + midgard_alu_op_i2f_rtz = 0xB9, + midgard_alu_op_i2f_rtn = 0xBA, + midgard_alu_op_i2f_rtp = 0xBB, + midgard_alu_op_u2f_rte = 0xBC, + midgard_alu_op_u2f_rtz = 0xBD, + midgard_alu_op_u2f_rtn = 0xBE, + midgard_alu_op_u2f_rtp = 0xBF, + + midgard_alu_op_icsel_v = 0xC0, /* condition code r31 */ + midgard_alu_op_icsel = 0xC1, /* condition code r31.w */ + midgard_alu_op_fcsel_v = 0xC4, + midgard_alu_op_fcsel = 0xC5, + midgard_alu_op_fround = 0xC6, + + midgard_alu_op_fatan_pt2 = 0xE8, + midgard_alu_op_fpow_pt1 = 0xEC, + midgard_alu_op_fpown_pt1 = 0xED, + midgard_alu_op_fpowr_pt1 = 0xEE, + + midgard_alu_op_frcp = 0xF0, + midgard_alu_op_frsqrt = 0xF2, + midgard_alu_op_fsqrt = 0xF3, + midgard_alu_op_fexp2 = 0xF4, + midgard_alu_op_flog2 = 0xF5, + midgard_alu_op_fsin = 0xF6, + midgard_alu_op_fcos = 0xF7, + midgard_alu_op_fatan2_pt1 = 0xF9, +} midgard_alu_op; + +typedef enum { + midgard_outmod_none = 0, + midgard_outmod_pos = 1, + /* 0x2 unknown */ + midgard_outmod_sat = 3 +} midgard_outmod_float; + +typedef enum { + midgard_outmod_int_saturate = 0, + midgard_outmod_uint_saturate = 1, + midgard_outmod_int_wrap = 2, + midgard_outmod_int_high = 3, /* Overflowed portion */ +} midgard_outmod_int; + +typedef enum { + midgard_reg_mode_8 = 0, + midgard_reg_mode_16 = 1, + midgard_reg_mode_32 = 2, + midgard_reg_mode_64 = 3 +} midgard_reg_mode; + +typedef enum { + midgard_dest_override_lower = 0, + midgard_dest_override_upper = 1, + midgard_dest_override_none = 2 +} midgard_dest_override; + +typedef enum { + midgard_int_sign_extend = 0, + midgard_int_zero_extend = 1, + midgard_int_normal = 2, + midgard_int_shift = 3 +} midgard_int_mod; + +#define MIDGARD_FLOAT_MOD_ABS (1 << 0) +#define MIDGARD_FLOAT_MOD_NEG (1 << 1) + +typedef struct +__attribute__((__packed__)) +{ + /* Either midgard_int_mod or from midgard_float_mod_*, depending on the + * type of op */ + unsigned mod : 2; + + /* replicate lower half if dest = half, or low/high half selection if + * dest = full + */ + bool rep_low : 1; + bool rep_high : 1; /* unused if dest = full */ + bool half : 1; /* only matters if dest = full */ + unsigned swizzle : 8; +} +midgard_vector_alu_src; + +typedef struct +__attribute__((__packed__)) +{ + midgard_alu_op op : 8; + midgard_reg_mode reg_mode : 2; + unsigned src1 : 13; + unsigned src2 : 13; + midgard_dest_override dest_override : 2; + midgard_outmod_float outmod : 2; + unsigned mask : 8; +} +midgard_vector_alu; + +typedef struct +__attribute__((__packed__)) +{ + bool abs : 1; + bool negate : 1; + bool full : 1; /* 0 = half, 1 = full */ + unsigned component : 3; +} +midgard_scalar_alu_src; + +typedef struct +__attribute__((__packed__)) +{ + midgard_alu_op op : 8; + unsigned src1 : 6; + unsigned src2 : 11; + unsigned unknown : 1; + unsigned outmod : 2; + bool output_full : 1; + unsigned output_component : 3; +} +midgard_scalar_alu; + +typedef struct +__attribute__((__packed__)) +{ + unsigned src1_reg : 5; + unsigned src2_reg : 5; + unsigned out_reg : 5; + bool src2_imm : 1; +} +midgard_reg_info; + +/* In addition to conditional branches and jumps (unconditional branches), + * Midgard implements a bit of fixed function 
functionality used in fragment + * shaders via specially crafted branches. These have special branch opcodes, + * which perform a fixed-function operation and/or use the results of a + * fixed-function operation as the branch condition. */ + +typedef enum { + /* Regular branches */ + midgard_jmp_writeout_op_branch_uncond = 1, + midgard_jmp_writeout_op_branch_cond = 2, + + /* In a fragment shader, execute a discard_if instruction, with the + * corresponding condition code. Terminates the shader, so generally + * set the branch target to out of the shader */ + midgard_jmp_writeout_op_discard = 4, + + /* Branch if the tilebuffer is not yet ready. At the beginning of a + * fragment shader that reads from the tile buffer, for instance via + * ARM_shader_framebuffer_fetch or EXT_pixel_local_storage, this branch + * operation should be used as a loop. An instruction like + * "br.tilebuffer.always -1" does the trick, corresponding to + * "while(!is_tilebuffer_ready) */ + midgard_jmp_writeout_op_tilebuffer_pending = 6, + + /* In a fragment shader, try to write out the value pushed to r0 to the + * tilebuffer, subject to unknown state in r1.z and r1.w. If this + * succeeds, the shader terminates. If it fails, it branches to the + * specified branch target. Generally, this should be used in a loop to + * itself, acting as "do { write(r0); } while(!write_successful);" */ + midgard_jmp_writeout_op_writeout = 7, +} midgard_jmp_writeout_op; + +typedef enum { + midgard_condition_write0 = 0, + + /* These condition codes denote a conditional branch on FALSE and on + * TRUE respectively */ + midgard_condition_false = 1, + midgard_condition_true = 2, + + /* This condition code always branches. For a pure branch, the + * unconditional branch coding should be used instead, but for + * fixed-function branch opcodes, this is still useful */ + midgard_condition_always = 3, +} midgard_condition; + +typedef struct +__attribute__((__packed__)) +{ + midgard_jmp_writeout_op op : 3; /* == branch_uncond */ + unsigned dest_tag : 4; /* tag of branch destination */ + unsigned unknown : 2; + int offset : 7; +} +midgard_branch_uncond; + +typedef struct +__attribute__((__packed__)) +{ + midgard_jmp_writeout_op op : 3; /* == branch_cond */ + unsigned dest_tag : 4; /* tag of branch destination */ + int offset : 7; + midgard_condition cond : 2; +} +midgard_branch_cond; + +typedef struct +__attribute__((__packed__)) +{ + midgard_jmp_writeout_op op : 3; /* == branch_cond */ + unsigned dest_tag : 4; /* tag of branch destination */ + unsigned unknown : 2; + signed offset : 23; + unsigned cond : 16; +} +midgard_branch_extended; + +typedef struct +__attribute__((__packed__)) +{ + midgard_jmp_writeout_op op : 3; /* == writeout */ + unsigned unknown : 13; +} +midgard_writeout; + +/* + * Load/store words + */ + +typedef enum { + midgard_op_ld_st_noop = 0x03, + + /* Unclear why this is on the L/S unit, but (with an address of 0, + * appropriate swizzle, magic constant 0x24, and xy mask?) moves fp32 cube + * map coordinates in r27 to its cube map texture coordinate + * destination (e.g r29). 0x4 magic for lding from fp16 instead */ + + midgard_op_st_cubemap_coords = 0x0E, + + /* Used in OpenCL. Probably can ld other things as well */ + midgard_op_ld_global_id = 0x10, + + /* The L/S unit can do perspective division a clock faster than the ALU + * if you're lucky. Put the vec4 in r27, and call with 0x24 as the + * unknown state; the output will be <x/w, y/w, z/w, 1>. 
Replace w with + * z for the z version */ + midgard_op_ldst_perspective_division_z = 0x12, + midgard_op_ldst_perspective_division_w = 0x13, + + /* val in r27.y, address embedded, outputs result to argument. Invert val for sub. Let val = +-1 for inc/dec. */ + midgard_op_atomic_add = 0x40, + midgard_op_atomic_and = 0x44, + midgard_op_atomic_or = 0x48, + midgard_op_atomic_xor = 0x4C, + + midgard_op_atomic_imin = 0x50, + midgard_op_atomic_umin = 0x54, + midgard_op_atomic_imax = 0x58, + midgard_op_atomic_umax = 0x5C, + + midgard_op_atomic_xchg = 0x60, + + /* Used for compute shader's __global arguments, __local variables (or + * for register spilling) */ + + midgard_op_ld_char = 0x81, + midgard_op_ld_char2 = 0x84, + midgard_op_ld_short = 0x85, + midgard_op_ld_char4 = 0x88, /* short2, int, float */ + midgard_op_ld_short4 = 0x8C, /* int2, float2, long */ + midgard_op_ld_int4 = 0x90, /* float4, long2 */ + + midgard_op_ld_attr_32 = 0x94, + midgard_op_ld_attr_16 = 0x95, + midgard_op_ld_attr_32u = 0x96, + midgard_op_ld_attr_32i = 0x97, + midgard_op_ld_vary_32 = 0x98, + midgard_op_ld_vary_16 = 0x99, + midgard_op_ld_vary_32u = 0x9A, + midgard_op_ld_vary_32i = 0x9B, + midgard_op_ld_color_buffer_16 = 0x9D, + + midgard_op_ld_uniform_16 = 0xAC, + midgard_op_ld_uniform_32i = 0xA8, + + midgard_op_ld_uniform_32 = 0xB0, + midgard_op_ld_color_buffer_8 = 0xBA, + + midgard_op_st_char = 0xC0, + midgard_op_st_char2 = 0xC4, /* short */ + midgard_op_st_char4 = 0xC8, /* short2, int, float */ + midgard_op_st_short4 = 0xCC, /* int2, float2, long */ + midgard_op_st_int4 = 0xD0, /* float4, long2 */ + + midgard_op_st_vary_32 = 0xD4, + midgard_op_st_vary_16 = 0xD5, + midgard_op_st_vary_32u = 0xD6, + midgard_op_st_vary_32i = 0xD7, + + /* Value to st in r27, location r26.w as short2 */ + midgard_op_st_image_f = 0xD8, + midgard_op_st_image_ui = 0xDA, + midgard_op_st_image_i = 0xDB, +} midgard_load_store_op; + +typedef enum { + midgard_interp_centroid = 1, + midgard_interp_default = 2 +} midgard_interpolation; + +typedef enum { + midgard_varying_mod_none = 0, + + /* Other values unknown */ + + /* Take the would-be result and divide all components by its z/w + * (perspective division baked in with the load) */ + midgard_varying_mod_perspective_z = 2, + midgard_varying_mod_perspective_w = 3, +} midgard_varying_modifier; + +typedef struct +__attribute__((__packed__)) +{ + unsigned zero0 : 1; /* Always zero */ + + midgard_varying_modifier modifier : 2; + + unsigned zero1: 1; /* Always zero */ + + /* Varying qualifiers, zero if not a varying */ + unsigned flat : 1; + unsigned is_varying : 1; /* Always one for varying, but maybe something else? 
*/ + midgard_interpolation interpolation : 2; + + unsigned zero2 : 2; /* Always zero */ +} +midgard_varying_parameter; + +typedef struct +__attribute__((__packed__)) +{ + midgard_load_store_op op : 8; + unsigned reg : 5; + unsigned mask : 4; + unsigned swizzle : 8; + unsigned unknown : 16; + + unsigned varying_parameters : 10; + + unsigned address : 9; +} +midgard_load_store_word; + +typedef struct +__attribute__((__packed__)) +{ + unsigned type : 4; + unsigned next_type : 4; + uint64_t word1 : 60; + uint64_t word2 : 60; +} +midgard_load_store; + +/* 8-bit register selector used in texture ops to select a bias/LOD/gradient + * register, shoved into the `bias` field */ + +typedef struct +__attribute__((__packed__)) +{ + /* Combines with component_hi to form 2-bit component select out of + * xyzw, as the component for bias/LOD and the starting component of a + * gradient vector */ + + unsigned component_lo : 1; + + /* Register select between r28/r29 */ + unsigned select : 1; + + /* For a half-register, selects the upper half */ + unsigned upper : 1; + + /* Specifies a full-register, clear for a half-register. Mutually + * exclusive with upper. */ + unsigned full : 1; + + /* Higher half of component_lo. Always seen to be set for LOD/bias + * and clear for processed gradients, but I'm not sure if that's a + * hardware requirement. */ + unsigned component_hi : 1; + + /* Padding to make this 8-bit */ + unsigned zero : 3; +} +midgard_tex_register_select; + +/* Texture pipeline results are in r28-r29 */ +#define REG_TEX_BASE 28 + +/* Texture opcodes... maybe? */ +#define TEXTURE_OP_NORMAL 0x11 /* texture */ +#define TEXTURE_OP_LOD 0x12 /* textureLod */ +#define TEXTURE_OP_TEXEL_FETCH 0x14 /* texelFetch */ + +enum mali_sampler_type { + MALI_SAMPLER_UNK = 0x0, + MALI_SAMPLER_FLOAT = 0x1, /* sampler */ + MALI_SAMPLER_UNSIGNED = 0x2, /* usampler */ + MALI_SAMPLER_SIGNED = 0x3, /* isampler */ +}; + +typedef struct +__attribute__((__packed__)) +{ + unsigned type : 4; + unsigned next_type : 4; + + unsigned op : 6; + unsigned shadow : 1; + unsigned is_gather : 1; + + /* A little obscure, but last is set for the last texture operation in + * a shader. cont appears to just be last's opposite (?). Yeah, I know, + * kind of funky.. BiOpen thinks it could do with memory hinting, or + * tile locking? */ + + unsigned cont : 1; + unsigned last : 1; + + enum mali_texture_type format : 2; + unsigned zero : 2; + + /* Is a register used to specify the + * LOD/bias/offset? If set, use the `bias` field as + * a register index. If clear, use the `bias` field + * as an immediate. */ + unsigned lod_register : 1; + + /* Is a register used to specify an offset? If set, use the + * offset_reg_* fields to encode this, duplicated for each of the + * components. If clear, there is implcitly always an immediate offst + * specificed in offset_imm_* */ + unsigned offset_register : 1; + + unsigned in_reg_full : 1; + unsigned in_reg_select : 1; + unsigned in_reg_upper : 1; + unsigned in_reg_swizzle : 8; + + unsigned unknown8 : 2; + + unsigned out_full : 1; + + enum mali_sampler_type sampler_type : 2; + + unsigned out_reg_select : 1; + unsigned out_upper : 1; + + unsigned mask : 4; + + unsigned unknown2 : 2; + + unsigned swizzle : 8; + unsigned unknown4 : 8; + + unsigned unknownA : 4; + + /* In immediate mode, each offset field is an immediate range [0, 7]. + * + * In register mode, offset_x becomes a register full / select / upper + * triplet and a vec3 swizzle is splattered across offset_y/offset_z in + * a genuinely bizarre way. 
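The bias/LOD immediate decoded earlier as bias_int + bias / 256.0 is effectively an 8.8 fixed-point split. A standalone sketch of the round trip, using hypothetical helper names (encode_lod_bias / decode_lod_bias are not part of the driver):

#include <assert.h>
#include <math.h>
#include <stdint.h>

/* Hypothetical helpers: the integer part of the bias goes in bias_int, the
 * fraction (scaled by 2^8) goes in the 8-bit bias field. */
static void
encode_lod_bias(float biasf, int8_t *bias_int, uint8_t *bias)
{
        float ipart = floorf(biasf);

        *bias_int = (int8_t) ipart;
        *bias = (uint8_t) ((biasf - ipart) * 256.0f);
}

static float
decode_lod_bias(int8_t bias_int, uint8_t bias)
{
        /* Mirrors the disassembler: bias_int + bias / 256.0f */
        return bias_int + (bias / 256.0f);
}

int main(void)
{
        int8_t bias_int;
        uint8_t bias;

        encode_lod_bias(-1.25f, &bias_int, &bias);

        assert(bias_int == -2);  /* floor(-1.25) */
        assert(bias == 192);     /* 0.75 * 256   */
        assert(decode_lod_bias(bias_int, bias) == -1.25f);

        return 0;
}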
+ * + * For texel fetches in immediate mode, the range is the full [-8, 7], + * but for normal texturing the top bit must be zero and a register + * used instead. It's not clear where this limitation is from. */ + + signed offset_x : 4; + signed offset_y : 4; + signed offset_z : 4; + + /* In immediate bias mode, for a normal texture op, this is + * texture bias, computed as int(2^8 * frac(biasf)), with + * bias_int = floor(bias). For a textureLod, it's that, but + * s/bias/lod. For a texel fetch, this is the LOD as-is. + * + * In register mode, this is a midgard_tex_register_select + * structure and bias_int is zero */ + + unsigned bias : 8; + signed bias_int : 8; + + unsigned texture_handle : 16; + unsigned sampler_handle : 16; +} +midgard_texture_word; + +#endif diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c new file mode 100644 index 00000000000..9c1349094bd --- /dev/null +++ b/src/panfrost/midgard/midgard_compile.c @@ -0,0 +1,2901 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <err.h> + +#include "main/mtypes.h" +#include "compiler/glsl/glsl_to_nir.h" +#include "compiler/nir_types.h" +#include "main/imports.h" +#include "compiler/nir/nir_builder.h" +#include "util/half_float.h" +#include "util/u_math.h" +#include "util/u_debug.h" +#include "util/u_dynarray.h" +#include "util/list.h" +#include "main/mtypes.h" + +#include "midgard.h" +#include "midgard_nir.h" +#include "midgard_compile.h" +#include "midgard_ops.h" +#include "helpers.h" +#include "compiler.h" + +#include "disassemble.h" + +static const struct debug_named_value debug_options[] = { + {"msgs", MIDGARD_DBG_MSGS, "Print debug messages"}, + {"shaders", MIDGARD_DBG_SHADERS, "Dump shaders in NIR and MIR"}, + {"shaderdb", MIDGARD_DBG_SHADERDB, "Prints shader-db statistics"}, + DEBUG_NAMED_VALUE_END +}; + +DEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", debug_options, 0) + +unsigned SHADER_DB_COUNT = 0; + +int midgard_debug = 0; + +#define DBG(fmt, ...) 
\ + do { if (midgard_debug & MIDGARD_DBG_MSGS) \ + fprintf(stderr, "%s:%d: "fmt, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) + +static bool +midgard_is_branch_unit(unsigned unit) +{ + return (unit == ALU_ENAB_BRANCH) || (unit == ALU_ENAB_BR_COMPACT); +} + +static void +midgard_block_add_successor(midgard_block *block, midgard_block *successor) +{ + block->successors[block->nr_successors++] = successor; + assert(block->nr_successors <= ARRAY_SIZE(block->successors)); +} + +/* Helpers to generate midgard_instruction's using macro magic, since every + * driver seems to do it that way */ + +#define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__)); + +#define M_LOAD_STORE(name, rname, uname) \ + static midgard_instruction m_##name(unsigned ssa, unsigned address) { \ + midgard_instruction i = { \ + .type = TAG_LOAD_STORE_4, \ + .mask = 0xF, \ + .ssa_args = { \ + .rname = ssa, \ + .uname = -1, \ + .src1 = -1 \ + }, \ + .load_store = { \ + .op = midgard_op_##name, \ + .swizzle = SWIZZLE_XYZW, \ + .address = address \ + } \ + }; \ + \ + return i; \ + } + +#define M_LOAD(name) M_LOAD_STORE(name, dest, src0) +#define M_STORE(name) M_LOAD_STORE(name, src0, dest) + +/* Inputs a NIR ALU source, with modifiers attached if necessary, and outputs + * the corresponding Midgard source */ + +static midgard_vector_alu_src +vector_alu_modifiers(nir_alu_src *src, bool is_int, unsigned broadcast_count, + bool half, bool sext) +{ + if (!src) return blank_alu_src; + + /* Figure out how many components there are so we can adjust the + * swizzle. Specifically we want to broadcast the last channel so + * things like ball2/3 work + */ + + if (broadcast_count) { + uint8_t last_component = src->swizzle[broadcast_count - 1]; + + for (unsigned c = broadcast_count; c < NIR_MAX_VEC_COMPONENTS; ++c) { + src->swizzle[c] = last_component; + } + } + + midgard_vector_alu_src alu_src = { + .rep_low = 0, + .rep_high = 0, + .half = half, + .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle) + }; + + if (is_int) { + alu_src.mod = midgard_int_normal; + + /* Sign/zero-extend if needed */ + + if (half) { + alu_src.mod = sext ? + midgard_int_sign_extend + : midgard_int_zero_extend; + } + + /* These should have been lowered away */ + assert(!(src->abs || src->negate)); + } else { + alu_src.mod = (src->abs << 0) | (src->negate << 1); + } + + return alu_src; +} + +/* load/store instructions have both 32-bit and 16-bit variants, depending on + * whether we are using vectors composed of highp or mediump. At the moment, we + * don't support half-floats -- this requires changes in other parts of the + * compiler -- therefore the 16-bit versions are commented out. 
*/ + +//M_LOAD(ld_attr_16); +M_LOAD(ld_attr_32); +//M_LOAD(ld_vary_16); +M_LOAD(ld_vary_32); +//M_LOAD(ld_uniform_16); +M_LOAD(ld_uniform_32); +M_LOAD(ld_color_buffer_8); +//M_STORE(st_vary_16); +M_STORE(st_vary_32); +M_STORE(st_cubemap_coords); + +static midgard_instruction +v_alu_br_compact_cond(midgard_jmp_writeout_op op, unsigned tag, signed offset, unsigned cond) +{ + midgard_branch_cond branch = { + .op = op, + .dest_tag = tag, + .offset = offset, + .cond = cond + }; + + uint16_t compact; + memcpy(&compact, &branch, sizeof(branch)); + + midgard_instruction ins = { + .type = TAG_ALU_4, + .unit = ALU_ENAB_BR_COMPACT, + .prepacked_branch = true, + .compact_branch = true, + .br_compact = compact + }; + + if (op == midgard_jmp_writeout_op_writeout) + ins.writeout = true; + + return ins; +} + +static midgard_instruction +v_branch(bool conditional, bool invert) +{ + midgard_instruction ins = { + .type = TAG_ALU_4, + .unit = ALU_ENAB_BRANCH, + .compact_branch = true, + .branch = { + .conditional = conditional, + .invert_conditional = invert + } + }; + + return ins; +} + +static midgard_branch_extended +midgard_create_branch_extended( midgard_condition cond, + midgard_jmp_writeout_op op, + unsigned dest_tag, + signed quadword_offset) +{ + /* For unclear reasons, the condition code is repeated 8 times */ + uint16_t duplicated_cond = + (cond << 14) | + (cond << 12) | + (cond << 10) | + (cond << 8) | + (cond << 6) | + (cond << 4) | + (cond << 2) | + (cond << 0); + + midgard_branch_extended branch = { + .op = op, + .dest_tag = dest_tag, + .offset = quadword_offset, + .cond = duplicated_cond + }; + + return branch; +} + +static void +attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name) +{ + ins->has_constants = true; + memcpy(&ins->constants, constants, 16); +} + +static int +glsl_type_size(const struct glsl_type *type, bool bindless) +{ + return glsl_count_attribute_slots(type, false); +} + +/* Lower fdot2 to a vector multiplication followed by channel addition */ +static void +midgard_nir_lower_fdot2_body(nir_builder *b, nir_alu_instr *alu) +{ + if (alu->op != nir_op_fdot2) + return; + + b->cursor = nir_before_instr(&alu->instr); + + nir_ssa_def *src0 = nir_ssa_for_alu_src(b, alu, 0); + nir_ssa_def *src1 = nir_ssa_for_alu_src(b, alu, 1); + + nir_ssa_def *product = nir_fmul(b, src0, src1); + + nir_ssa_def *sum = nir_fadd(b, + nir_channel(b, product, 0), + nir_channel(b, product, 1)); + + /* Replace the fdot2 with this sum */ + nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(sum)); +} + +static int +midgard_nir_sysval_for_intrinsic(nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_load_viewport_scale: + return PAN_SYSVAL_VIEWPORT_SCALE; + case nir_intrinsic_load_viewport_offset: + return PAN_SYSVAL_VIEWPORT_OFFSET; + default: + return -1; + } +} + +static unsigned +nir_dest_index(compiler_context *ctx, nir_dest *dst) +{ + if (dst->is_ssa) + return dst->ssa.index; + else { + assert(!dst->reg.indirect); + return ctx->func->impl->ssa_alloc + dst->reg.reg->index; + } +} + +static int sysval_for_instr(compiler_context *ctx, nir_instr *instr, + unsigned *dest) +{ + nir_intrinsic_instr *intr; + nir_dest *dst = NULL; + nir_tex_instr *tex; + int sysval = -1; + + switch (instr->type) { + case nir_instr_type_intrinsic: + intr = nir_instr_as_intrinsic(instr); + sysval = midgard_nir_sysval_for_intrinsic(intr); + dst = &intr->dest; + break; + case nir_instr_type_tex: + tex = nir_instr_as_tex(instr); + if (tex->op != 
nir_texop_txs) + break; + + sysval = PAN_SYSVAL(TEXTURE_SIZE, + PAN_TXS_SYSVAL_ID(tex->texture_index, + nir_tex_instr_dest_size(tex) - + (tex->is_array ? 1 : 0), + tex->is_array)); + dst = &tex->dest; + break; + default: + break; + } + + if (dest && dst) + *dest = nir_dest_index(ctx, dst); + + return sysval; +} + +static void +midgard_nir_assign_sysval_body(compiler_context *ctx, nir_instr *instr) +{ + int sysval; + + sysval = sysval_for_instr(ctx, instr, NULL); + if (sysval < 0) + return; + + /* We have a sysval load; check if it's already been assigned */ + + if (_mesa_hash_table_u64_search(ctx->sysval_to_id, sysval)) + return; + + /* It hasn't -- so assign it now! */ + + unsigned id = ctx->sysval_count++; + _mesa_hash_table_u64_insert(ctx->sysval_to_id, sysval, (void *) ((uintptr_t) id + 1)); + ctx->sysvals[id] = sysval; +} + +static void +midgard_nir_assign_sysvals(compiler_context *ctx, nir_shader *shader) +{ + ctx->sysval_count = 0; + + nir_foreach_function(function, shader) { + if (!function->impl) continue; + + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + midgard_nir_assign_sysval_body(ctx, instr); + } + } + } +} + +static bool +midgard_nir_lower_fdot2(nir_shader *shader) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (!function->impl) continue; + + nir_builder _b; + nir_builder *b = &_b; + nir_builder_init(b, function->impl); + + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_alu) continue; + + nir_alu_instr *alu = nir_instr_as_alu(instr); + midgard_nir_lower_fdot2_body(b, alu); + + progress |= true; + } + } + + nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance); + + } + + return progress; +} + +/* Flushes undefined values to zero */ + +static void +optimise_nir(nir_shader *nir) +{ + bool progress; + unsigned lower_flrp = + (nir->options->lower_flrp16 ? 16 : 0) | + (nir->options->lower_flrp32 ? 32 : 0) | + (nir->options->lower_flrp64 ? 64 : 0); + + NIR_PASS(progress, nir, nir_lower_regs_to_ssa); + NIR_PASS(progress, nir, midgard_nir_lower_fdot2); + NIR_PASS(progress, nir, nir_lower_idiv); + + nir_lower_tex_options lower_tex_1st_pass_options = { + .lower_rect = true, + .lower_txp = ~0 + }; + + nir_lower_tex_options lower_tex_2nd_pass_options = { + .lower_txs_lod = true, + }; + + NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_1st_pass_options); + NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_2nd_pass_options); + + do { + progress = false; + + NIR_PASS(progress, nir, nir_lower_var_copies); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_dead_cf); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); + + if (lower_flrp != 0) { + bool lower_flrp_progress = false; + NIR_PASS(lower_flrp_progress, + nir, + nir_lower_flrp, + lower_flrp, + false /* always_precise */, + nir->options->lower_ffma); + if (lower_flrp_progress) { + NIR_PASS(progress, nir, + nir_opt_constant_folding); + progress = true; + } + + /* Nothing should rematerialize any flrps, so we only + * need to do this lowering once. 
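midgard_create_branch_extended above fills the 16-bit cond field by repeating the 2-bit condition code eight times; a tiny standalone sketch of the same replication, with the condition values taken from midgard.h:

#include <assert.h>
#include <stdint.h>

/* Replicate a 2-bit condition code into all eight slots of the 16-bit
 * extended-branch cond field, as midgard_create_branch_extended does */
static uint16_t
duplicate_condition(unsigned cond)
{
        uint16_t out = 0;

        for (unsigned i = 0; i < 8; ++i)
                out |= cond << (2 * i);

        return out;
}

int main(void)
{
        /* midgard_condition_true = 2, midgard_condition_always = 3 */
        assert(duplicate_condition(2) == 0xAAAA);
        assert(duplicate_condition(3) == 0xFFFF);

        return 0;
}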
+ */ + lower_flrp = 0; + } + + NIR_PASS(progress, nir, nir_opt_undef); + NIR_PASS(progress, nir, nir_undef_to_zero); + + NIR_PASS(progress, nir, nir_opt_loop_unroll, + nir_var_shader_in | + nir_var_shader_out | + nir_var_function_temp); + + NIR_PASS(progress, nir, nir_opt_vectorize); + } while (progress); + + /* Must be run at the end to prevent creation of fsin/fcos ops */ + NIR_PASS(progress, nir, midgard_nir_scale_trig); + + do { + progress = false; + + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_copy_prop); + } while (progress); + + NIR_PASS(progress, nir, nir_opt_algebraic_late); + + /* We implement booleans as 32-bit 0/~0 */ + NIR_PASS(progress, nir, nir_lower_bool_to_int32); + + /* Now that booleans are lowered, we can run out late opts */ + NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late); + + /* Lower mods for float ops only. Integer ops don't support modifiers + * (saturate doesn't make sense on integers, neg/abs require dedicated + * instructions) */ + + NIR_PASS(progress, nir, nir_lower_to_source_mods, nir_lower_float_source_mods); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + + /* Take us out of SSA */ + NIR_PASS(progress, nir, nir_lower_locals_to_regs); + NIR_PASS(progress, nir, nir_convert_from_ssa, true); + + /* We are a vector architecture; write combine where possible */ + NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest); + NIR_PASS(progress, nir, nir_lower_vec_to_movs); + + NIR_PASS(progress, nir, nir_opt_dce); +} + +/* Front-half of aliasing the SSA slots, merely by inserting the flag in the + * appropriate hash table. Intentional off-by-one to avoid confusing NULL with + * r0. See the comments in compiler_context */ + +static void +alias_ssa(compiler_context *ctx, int dest, int src) +{ + _mesa_hash_table_u64_insert(ctx->ssa_to_alias, dest + 1, (void *) ((uintptr_t) src + 1)); + _mesa_set_add(ctx->leftover_ssa_to_alias, (void *) (uintptr_t) (dest + 1)); +} + +/* ...or undo it, after which the original index will be used (dummy move should be emitted alongside this) */ + +static void +unalias_ssa(compiler_context *ctx, int dest) +{ + _mesa_hash_table_u64_remove(ctx->ssa_to_alias, dest + 1); + /* TODO: Remove from leftover or no? 
*/ +} + +/* Do not actually emit a load; instead, cache the constant for inlining */ + +static void +emit_load_const(compiler_context *ctx, nir_load_const_instr *instr) +{ + nir_ssa_def def = instr->def; + + float *v = rzalloc_array(NULL, float, 4); + nir_const_load_to_arr(v, instr, f32); + _mesa_hash_table_u64_insert(ctx->ssa_constants, def.index + 1, v); +} + +static unsigned +nir_src_index(compiler_context *ctx, nir_src *src) +{ + if (src->is_ssa) + return src->ssa->index; + else { + assert(!src->reg.indirect); + return ctx->func->impl->ssa_alloc + src->reg.reg->index; + } +} + +static unsigned +nir_alu_src_index(compiler_context *ctx, nir_alu_src *src) +{ + return nir_src_index(ctx, &src->src); +} + +static bool +nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components) +{ + unsigned comp = src->swizzle[0]; + + for (unsigned c = 1; c < nr_components; ++c) { + if (src->swizzle[c] != comp) + return true; + } + + return false; +} + +/* Midgard puts scalar conditionals in r31.w; move an arbitrary source (the + * output of a conditional test) into that register */ + +static void +emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned component) +{ + int condition = nir_src_index(ctx, src); + + /* Source to swizzle the desired component into w */ + + const midgard_vector_alu_src alu_src = { + .swizzle = SWIZZLE(component, component, component, component), + }; + + /* There is no boolean move instruction. Instead, we simulate a move by + * ANDing the condition with itself to get it into r31.w */ + + midgard_instruction ins = { + .type = TAG_ALU_4, + + /* We need to set the conditional as close as possible */ + .precede_break = true, + .unit = for_branch ? UNIT_SMUL : UNIT_SADD, + .mask = 1 << COMPONENT_W, + + .ssa_args = { + .src0 = condition, + .src1 = condition, + .dest = SSA_FIXED_REGISTER(31), + }, + + .alu = { + .op = midgard_alu_op_iand, + .outmod = midgard_outmod_int_wrap, + .reg_mode = midgard_reg_mode_32, + .dest_override = midgard_dest_override_none, + .src1 = vector_alu_srco_unsigned(alu_src), + .src2 = vector_alu_srco_unsigned(alu_src) + }, + }; + + emit_mir_instruction(ctx, ins); +} + +/* Or, for mixed conditions (with csel_v), here's a vector version using all of + * r31 instead */ + +static void +emit_condition_mixed(compiler_context *ctx, nir_alu_src *src, unsigned nr_comp) +{ + int condition = nir_src_index(ctx, &src->src); + + /* Source to swizzle the desired component into w */ + + const midgard_vector_alu_src alu_src = { + .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle), + }; + + /* There is no boolean move instruction. Instead, we simulate a move by + * ANDing the condition with itself to get it into r31.w */ + + midgard_instruction ins = { + .type = TAG_ALU_4, + .precede_break = true, + .mask = mask_of(nr_comp), + .ssa_args = { + .src0 = condition, + .src1 = condition, + .dest = SSA_FIXED_REGISTER(31), + }, + .alu = { + .op = midgard_alu_op_iand, + .outmod = midgard_outmod_int_wrap, + .reg_mode = midgard_reg_mode_32, + .dest_override = midgard_dest_override_none, + .src1 = vector_alu_srco_unsigned(alu_src), + .src2 = vector_alu_srco_unsigned(alu_src) + }, + }; + + emit_mir_instruction(ctx, ins); +} + + + +/* Likewise, indirect offsets are put in r27.w. 
TODO: Allow componentwise + * pinning to eliminate this move in all known cases */ + +static void +emit_indirect_offset(compiler_context *ctx, nir_src *src) +{ + int offset = nir_src_index(ctx, src); + + midgard_instruction ins = { + .type = TAG_ALU_4, + .mask = 1 << COMPONENT_W, + .ssa_args = { + .src0 = SSA_UNUSED_1, + .src1 = offset, + .dest = SSA_FIXED_REGISTER(REGISTER_OFFSET), + }, + .alu = { + .op = midgard_alu_op_imov, + .outmod = midgard_outmod_int_wrap, + .reg_mode = midgard_reg_mode_32, + .dest_override = midgard_dest_override_none, + .src1 = vector_alu_srco_unsigned(zero_alu_src), + .src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx) + }, + }; + + emit_mir_instruction(ctx, ins); +} + +#define ALU_CASE(nir, _op) \ + case nir_op_##nir: \ + op = midgard_alu_op_##_op; \ + assert(src_bitsize == dst_bitsize); \ + break; + +#define ALU_CASE_BCAST(nir, _op, count) \ + case nir_op_##nir: \ + op = midgard_alu_op_##_op; \ + broadcast_swizzle = count; \ + assert(src_bitsize == dst_bitsize); \ + break; +static bool +nir_is_fzero_constant(nir_src src) +{ + if (!nir_src_is_const(src)) + return false; + + for (unsigned c = 0; c < nir_src_num_components(src); ++c) { + if (nir_src_comp_as_float(src, c) != 0.0) + return false; + } + + return true; +} + +/* Analyze the sizes of the inputs to determine which reg mode. Ops needed + * special treatment override this anyway. */ + +static midgard_reg_mode +reg_mode_for_nir(nir_alu_instr *instr) +{ + unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); + + switch (src_bitsize) { + case 8: + return midgard_reg_mode_8; + case 16: + return midgard_reg_mode_16; + case 32: + return midgard_reg_mode_32; + case 64: + return midgard_reg_mode_64; + default: + unreachable("Invalid bit size"); + } +} + +static void +emit_alu(compiler_context *ctx, nir_alu_instr *instr) +{ + bool is_ssa = instr->dest.dest.is_ssa; + + unsigned dest = nir_dest_index(ctx, &instr->dest.dest); + unsigned nr_components = nir_dest_num_components(instr->dest.dest); + unsigned nr_inputs = nir_op_infos[instr->op].num_inputs; + + /* Most Midgard ALU ops have a 1:1 correspondance to NIR ops; these are + * supported. A few do not and are commented for now. Also, there are a + * number of NIR ops which Midgard does not support and need to be + * lowered, also TODO. This switch block emits the opcode and calling + * convention of the Midgard instruction; actual packing is done in + * emit_alu below */ + + unsigned op; + + /* Number of components valid to check for the instruction (the rest + * will be forced to the last), or 0 to use as-is. Relevant as + * ball-type instructions have a channel count in NIR but are all vec4 + * in Midgard */ + + unsigned broadcast_swizzle = 0; + + /* What register mode should we operate in? */ + midgard_reg_mode reg_mode = + reg_mode_for_nir(instr); + + /* Do we need a destination override? Used for inline + * type conversion */ + + midgard_dest_override dest_override = + midgard_dest_override_none; + + /* Should we use a smaller respective source and sign-extend? 
*/ + + bool half_1 = false, sext_1 = false; + bool half_2 = false, sext_2 = false; + + unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); + unsigned dst_bitsize = nir_dest_bit_size(instr->dest.dest); + + switch (instr->op) { + ALU_CASE(fadd, fadd); + ALU_CASE(fmul, fmul); + ALU_CASE(fmin, fmin); + ALU_CASE(fmax, fmax); + ALU_CASE(imin, imin); + ALU_CASE(imax, imax); + ALU_CASE(umin, umin); + ALU_CASE(umax, umax); + ALU_CASE(ffloor, ffloor); + ALU_CASE(fround_even, froundeven); + ALU_CASE(ftrunc, ftrunc); + ALU_CASE(fceil, fceil); + ALU_CASE(fdot3, fdot3); + ALU_CASE(fdot4, fdot4); + ALU_CASE(iadd, iadd); + ALU_CASE(isub, isub); + ALU_CASE(imul, imul); + + /* Zero shoved as second-arg */ + ALU_CASE(iabs, iabsdiff); + + ALU_CASE(mov, imov); + + ALU_CASE(feq32, feq); + ALU_CASE(fne32, fne); + ALU_CASE(flt32, flt); + ALU_CASE(ieq32, ieq); + ALU_CASE(ine32, ine); + ALU_CASE(ilt32, ilt); + ALU_CASE(ult32, ult); + + /* We don't have a native b2f32 instruction. Instead, like many + * GPUs, we exploit booleans as 0/~0 for false/true, and + * correspondingly AND + * by 1.0 to do the type conversion. For the moment, prime us + * to emit: + * + * iand [whatever], #0 + * + * At the end of emit_alu (as MIR), we'll fix-up the constant + */ + + ALU_CASE(b2f32, iand); + ALU_CASE(b2i32, iand); + + /* Likewise, we don't have a dedicated f2b32 instruction, but + * we can do a "not equal to 0.0" test. */ + + ALU_CASE(f2b32, fne); + ALU_CASE(i2b32, ine); + + ALU_CASE(frcp, frcp); + ALU_CASE(frsq, frsqrt); + ALU_CASE(fsqrt, fsqrt); + ALU_CASE(fexp2, fexp2); + ALU_CASE(flog2, flog2); + + ALU_CASE(f2i32, f2i_rtz); + ALU_CASE(f2u32, f2u_rtz); + ALU_CASE(i2f32, i2f_rtz); + ALU_CASE(u2f32, u2f_rtz); + + ALU_CASE(f2i16, f2i_rtz); + ALU_CASE(f2u16, f2u_rtz); + ALU_CASE(i2f16, i2f_rtz); + ALU_CASE(u2f16, u2f_rtz); + + ALU_CASE(fsin, fsin); + ALU_CASE(fcos, fcos); + + /* Second op implicit #0 */ + ALU_CASE(inot, inor); + ALU_CASE(iand, iand); + ALU_CASE(ior, ior); + ALU_CASE(ixor, ixor); + ALU_CASE(ishl, ishl); + ALU_CASE(ishr, iasr); + ALU_CASE(ushr, ilsr); + + ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2); + ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3); + ALU_CASE(b32all_fequal4, fball_eq); + + ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2); + ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3); + ALU_CASE(b32any_fnequal4, fbany_neq); + + ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2); + ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3); + ALU_CASE(b32all_iequal4, iball_eq); + + ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2); + ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3); + ALU_CASE(b32any_inequal4, ibany_neq); + + /* Source mods will be shoved in later */ + ALU_CASE(fabs, fmov); + ALU_CASE(fneg, fmov); + ALU_CASE(fsat, fmov); + + /* For size conversion, we use a move. Ideally though we would squash + * these ops together; maybe that has to happen after in NIR as part of + * propagation...? An earlier algebraic pass ensured we step down by + * only / exactly one size. 
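The b2f32/b2i32 handling above relies on booleans being all-zeros or all-ones, so a bitwise AND against the constant 1.0 performs the conversion. A standalone sketch of that trick in plain C, bit-punning via memcpy:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Booleans are 0 / ~0, so ANDing with the bit pattern of 1.0f yields
 * exactly 0.0f or 1.0f -- the same conversion the compiler emits as iand */
static float
b2f32_via_iand(uint32_t b)
{
        float one = 1.0f, out;
        uint32_t one_bits, out_bits;

        memcpy(&one_bits, &one, sizeof(one_bits));
        out_bits = b & one_bits;
        memcpy(&out, &out_bits, sizeof(out));

        return out;
}

int main(void)
{
        assert(b2f32_via_iand(0u) == 0.0f);   /* false */
        assert(b2f32_via_iand(~0u) == 1.0f);  /* true  */

        return 0;
}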
If stepping down, we use a dest override to + * reduce the size; if stepping up, we use a larger-sized move with a + * half source and a sign/zero-extension modifier */ + + case nir_op_i2i8: + case nir_op_i2i16: + case nir_op_i2i32: + /* If we end up upscale, we'll need a sign-extend on the + * operand (the second argument) */ + + sext_2 = true; + case nir_op_u2u8: + case nir_op_u2u16: + case nir_op_u2u32: { + op = midgard_alu_op_imov; + + if (dst_bitsize == (src_bitsize * 2)) { + /* Converting up */ + half_2 = true; + + /* Use a greater register mode */ + reg_mode++; + } else if (src_bitsize == (dst_bitsize * 2)) { + /* Converting down */ + dest_override = midgard_dest_override_lower; + } + + break; + } + + case nir_op_f2f16: { + assert(src_bitsize == 32); + + op = midgard_alu_op_fmov; + dest_override = midgard_dest_override_lower; + break; + } + + case nir_op_f2f32: { + assert(src_bitsize == 16); + + op = midgard_alu_op_fmov; + half_2 = true; + reg_mode++; + break; + } + + + /* For greater-or-equal, we lower to less-or-equal and flip the + * arguments */ + + case nir_op_fge: + case nir_op_fge32: + case nir_op_ige32: + case nir_op_uge32: { + op = + instr->op == nir_op_fge ? midgard_alu_op_fle : + instr->op == nir_op_fge32 ? midgard_alu_op_fle : + instr->op == nir_op_ige32 ? midgard_alu_op_ile : + instr->op == nir_op_uge32 ? midgard_alu_op_ule : + 0; + + /* Swap via temporary */ + nir_alu_src temp = instr->src[1]; + instr->src[1] = instr->src[0]; + instr->src[0] = temp; + + break; + } + + case nir_op_b32csel: { + /* Midgard features both fcsel and icsel, depending on + * the type of the arguments/output. However, as long + * as we're careful we can _always_ use icsel and + * _never_ need fcsel, since the latter does additional + * floating-point-specific processing whereas the + * former just moves bits on the wire. It's not obvious + * why these are separate opcodes, save for the ability + * to do things like sat/pos/abs/neg for free */ + + bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components); + op = mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel; + + /* csel works as a two-arg in Midgard, since the condition is hardcoded in r31.w */ + nr_inputs = 2; + + /* Emit the condition into r31 */ + + if (mixed) + emit_condition_mixed(ctx, &instr->src[0], nr_components); + else + emit_condition(ctx, &instr->src[0].src, false, instr->src[0].swizzle[0]); + + /* The condition is the first argument; move the other + * arguments up one to be a binary instruction for + * Midgard */ + + memmove(instr->src, instr->src + 1, 2 * sizeof(nir_alu_src)); + break; + } + + default: + DBG("Unhandled ALU op %s\n", nir_op_infos[instr->op].name); + assert(0); + return; + } + + /* Midgard can perform certain modifiers on output of an ALU op */ + unsigned outmod; + + if (midgard_is_integer_out_op(op)) { + outmod = midgard_outmod_int_wrap; + } else { + bool sat = instr->dest.saturate || instr->op == nir_op_fsat; + outmod = sat ? 
midgard_outmod_sat : midgard_outmod_none; + } + + /* fmax(a, 0.0) can turn into a .pos modifier as an optimization */ + + if (instr->op == nir_op_fmax) { + if (nir_is_fzero_constant(instr->src[0].src)) { + op = midgard_alu_op_fmov; + nr_inputs = 1; + outmod = midgard_outmod_pos; + instr->src[0] = instr->src[1]; + } else if (nir_is_fzero_constant(instr->src[1].src)) { + op = midgard_alu_op_fmov; + nr_inputs = 1; + outmod = midgard_outmod_pos; + } + } + + /* Fetch unit, quirks, etc information */ + unsigned opcode_props = alu_opcode_props[op].props; + bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24; + + /* src0 will always exist afaik, but src1 will not for 1-argument + * instructions. The latter can only be fetched if the instruction + * needs it, or else we may segfault. */ + + unsigned src0 = nir_alu_src_index(ctx, &instr->src[0]); + unsigned src1 = nr_inputs == 2 ? nir_alu_src_index(ctx, &instr->src[1]) : SSA_UNUSED_0; + + /* Rather than use the instruction generation helpers, we do it + * ourselves here to avoid the mess */ + + midgard_instruction ins = { + .type = TAG_ALU_4, + .ssa_args = { + .src0 = quirk_flipped_r24 ? SSA_UNUSED_1 : src0, + .src1 = quirk_flipped_r24 ? src0 : src1, + .dest = dest, + } + }; + + nir_alu_src *nirmods[2] = { NULL }; + + if (nr_inputs == 2) { + nirmods[0] = &instr->src[0]; + nirmods[1] = &instr->src[1]; + } else if (nr_inputs == 1) { + nirmods[quirk_flipped_r24] = &instr->src[0]; + } else { + assert(0); + } + + /* These were lowered to a move, so apply the corresponding mod */ + + if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) { + nir_alu_src *s = nirmods[quirk_flipped_r24]; + + if (instr->op == nir_op_fneg) + s->negate = !s->negate; + + if (instr->op == nir_op_fabs) + s->abs = !s->abs; + } + + bool is_int = midgard_is_integer_op(op); + + ins.mask = mask_of(nr_components); + + midgard_vector_alu alu = { + .op = op, + .reg_mode = reg_mode, + .dest_override = dest_override, + .outmod = outmod, + + .src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int, broadcast_swizzle, half_1, sext_1)), + .src2 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[1], is_int, broadcast_swizzle, half_2, sext_2)), + }; + + /* Apply writemask if non-SSA, keeping in mind that we can't write to components that don't exist */ + + if (!is_ssa) + ins.mask &= instr->dest.write_mask; + + ins.alu = alu; + + /* Late fixup for emulated instructions */ + + if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) { + /* Presently, our second argument is an inline #0 constant. 
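+ * That placeholder was primed by the iand in the ALU_CASE table above.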
+ * Switch over to an embedded 1.0 constant (that can't fit + * inline, since we're 32-bit, not 16-bit like the inline + * constants) */ + + ins.ssa_args.inline_constant = false; + ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.has_constants = true; + + if (instr->op == nir_op_b2f32) { + ins.constants[0] = 1.0f; + } else { + /* Type pun it into place */ + uint32_t one = 0x1; + memcpy(&ins.constants[0], &one, sizeof(uint32_t)); + } + + ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx); + } else if (nr_inputs == 1 && !quirk_flipped_r24) { + /* Lots of instructions need a 0 plonked in */ + ins.ssa_args.inline_constant = false; + ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.has_constants = true; + ins.constants[0] = 0.0f; + ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx); + } else if (instr->op == nir_op_inot) { + /* ~b = ~(b & b), so duplicate the source */ + ins.ssa_args.src1 = ins.ssa_args.src0; + ins.alu.src2 = ins.alu.src1; + } + + if ((opcode_props & UNITS_ALL) == UNIT_VLUT) { + /* To avoid duplicating the lookup tables (probably), true LUT + * instructions can only operate as if they were scalars. Lower + * them here by changing the component. */ + + uint8_t original_swizzle[4]; + memcpy(original_swizzle, nirmods[0]->swizzle, sizeof(nirmods[0]->swizzle)); + unsigned orig_mask = ins.mask; + + for (int i = 0; i < nr_components; ++i) { + /* Mask the associated component, dropping the + * instruction if needed */ + + ins.mask = 1 << i; + ins.mask &= orig_mask; + + if (!ins.mask) + continue; + + for (int j = 0; j < 4; ++j) + nirmods[0]->swizzle[j] = original_swizzle[i]; /* Pull from the correct component */ + + ins.alu.src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int, broadcast_swizzle, half_1, false)); + emit_mir_instruction(ctx, ins); + } + } else { + emit_mir_instruction(ctx, ins); + } +} + +#undef ALU_CASE + +/* Uniforms and UBOs use a shared code path, as uniforms are just (slightly + * optimized) versions of UBO #0 */ + +static void +emit_ubo_read( + compiler_context *ctx, + unsigned dest, + unsigned offset, + nir_src *indirect_offset, + unsigned index) +{ + /* TODO: half-floats */ + + if (!indirect_offset && offset < ctx->uniform_cutoff && index == 0) { + /* Fast path: For the first 16 uniforms, direct accesses are + * 0-cycle, since they're just a register fetch in the usual + * case. So, we alias the registers while we're still in + * SSA-space */ + + int reg_slot = 23 - offset; + alias_ssa(ctx, dest, SSA_FIXED_REGISTER(reg_slot)); + } else { + /* Otherwise, read from the 'special' UBO to access + * higher-indexed uniforms, at a performance cost. More + * generally, we're emitting a UBO read instruction. */ + + midgard_instruction ins = m_ld_uniform_32(dest, offset); + + /* TODO: Don't split */ + ins.load_store.varying_parameters = (offset & 7) << 7; + ins.load_store.address = offset >> 3; + + if (indirect_offset) { + emit_indirect_offset(ctx, indirect_offset); + ins.load_store.unknown = 0x8700 | index; /* xxx: what is this? */ + } else { + ins.load_store.unknown = 0x1E00 | index; /* xxx: what is this? */ + } + + /* TODO respect index */ + + emit_mir_instruction(ctx, ins); + } +} + +static void +emit_varying_read( + compiler_context *ctx, + unsigned dest, unsigned offset, + unsigned nr_comp, unsigned component, + nir_src *indirect_offset, nir_alu_type type) +{ + /* XXX: Half-floats? 
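+ * Only the 32-bit ld_vary variants are selected below, so 16-bit varyings
+ * would presumably need their own load ops.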
*/ + /* TODO: swizzle, mask */ + + midgard_instruction ins = m_ld_vary_32(dest, offset); + ins.mask = mask_of(nr_comp); + ins.load_store.swizzle = SWIZZLE_XYZW >> (2 * component); + + midgard_varying_parameter p = { + .is_varying = 1, + .interpolation = midgard_interp_default, + .flat = /*var->data.interpolation == INTERP_MODE_FLAT*/ 0 + }; + + unsigned u; + memcpy(&u, &p, sizeof(p)); + ins.load_store.varying_parameters = u; + + if (indirect_offset) { + /* We need to add in the dynamic index, moved to r27.w */ + emit_indirect_offset(ctx, indirect_offset); + ins.load_store.unknown = 0x79e; /* xxx: what is this? */ + } else { + /* Just a direct load */ + ins.load_store.unknown = 0x1e9e; /* xxx: what is this? */ + } + + /* Use the type appropriate load */ + switch (type) { + case nir_type_uint: + case nir_type_bool: + ins.load_store.op = midgard_op_ld_vary_32u; + break; + case nir_type_int: + ins.load_store.op = midgard_op_ld_vary_32i; + break; + case nir_type_float: + ins.load_store.op = midgard_op_ld_vary_32; + break; + default: + unreachable("Attempted to load unknown type"); + break; + } + + emit_mir_instruction(ctx, ins); +} + +static void +emit_sysval_read(compiler_context *ctx, nir_instr *instr) +{ + unsigned dest; + /* Figure out which uniform this is */ + int sysval = sysval_for_instr(ctx, instr, &dest); + void *val = _mesa_hash_table_u64_search(ctx->sysval_to_id, sysval); + + /* Sysvals are prefix uniforms */ + unsigned uniform = ((uintptr_t) val) - 1; + + /* Emit the read itself -- this is never indirect */ + emit_ubo_read(ctx, dest, uniform, NULL, 0); +} + +static void +emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) +{ + unsigned offset = 0, reg; + + switch (instr->intrinsic) { + case nir_intrinsic_discard_if: + emit_condition(ctx, &instr->src[0], true, COMPONENT_X); + + /* fallthrough */ + + case nir_intrinsic_discard: { + bool conditional = instr->intrinsic == nir_intrinsic_discard_if; + struct midgard_instruction discard = v_branch(conditional, false); + discard.branch.target_type = TARGET_DISCARD; + emit_mir_instruction(ctx, discard); + + ctx->can_discard = true; + break; + } + + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_input: { + bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform; + bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; + + /* Get the base type of the intrinsic */ + /* TODO: Infer type? Does it matter? */ + nir_alu_type t = + is_ubo ? nir_type_uint : nir_intrinsic_type(instr); + t = nir_alu_type_get_base_type(t); + + if (!is_ubo) { + offset = nir_intrinsic_base(instr); + } + + unsigned nr_comp = nir_intrinsic_dest_components(instr); + + nir_src *src_offset = nir_get_io_offset_src(instr); + + bool direct = nir_src_is_const(*src_offset); + + if (direct) + offset += nir_src_as_uint(*src_offset); + + /* We may need to apply a fractional offset */ + int component = instr->intrinsic == nir_intrinsic_load_input ? + nir_intrinsic_component(instr) : 0; + reg = nir_dest_index(ctx, &instr->dest); + + if (is_uniform && !ctx->is_blend) { + emit_ubo_read(ctx, reg, ctx->sysval_count + offset, !direct ? &instr->src[0] : NULL, 0); + } else if (is_ubo) { + nir_src index = instr->src[0]; + + /* We don't yet support indirect UBOs. For indirect + * block numbers (if that's possible), we don't know + * enough about the hardware yet. 
For indirect sources, + * we know what we need but we need to add some NIR + * support for lowering correctly with respect to + * 128-bit reads */ + + assert(nir_src_is_const(index)); + assert(nir_src_is_const(*src_offset)); + + /* TODO: Alignment */ + assert((offset & 0xF) == 0); + + uint32_t uindex = nir_src_as_uint(index) + 1; + emit_ubo_read(ctx, reg, offset / 16, NULL, uindex); + } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_blend) { + emit_varying_read(ctx, reg, offset, nr_comp, component, !direct ? &instr->src[0] : NULL, t); + } else if (ctx->is_blend) { + /* For blend shaders, load the input color, which is + * preloaded to r0 */ + + midgard_instruction move = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(0)); + emit_mir_instruction(ctx, move); + } else if (ctx->stage == MESA_SHADER_VERTEX) { + midgard_instruction ins = m_ld_attr_32(reg, offset); + ins.load_store.unknown = 0x1E1E; /* XXX: What is this? */ + ins.mask = mask_of(nr_comp); + + /* Use the type appropriate load */ + switch (t) { + case nir_type_uint: + case nir_type_bool: + ins.load_store.op = midgard_op_ld_attr_32u; + break; + case nir_type_int: + ins.load_store.op = midgard_op_ld_attr_32i; + break; + case nir_type_float: + ins.load_store.op = midgard_op_ld_attr_32; + break; + default: + unreachable("Attempted to load unknown type"); + break; + } + + emit_mir_instruction(ctx, ins); + } else { + DBG("Unknown load\n"); + assert(0); + } + + break; + } + + /* Reads 128-bit value raw off the tilebuffer during blending, tasty */ + + case nir_intrinsic_load_raw_output_pan: + reg = nir_dest_index(ctx, &instr->dest); + assert(ctx->is_blend); + + midgard_instruction ins = m_ld_color_buffer_8(reg, 0); + emit_mir_instruction(ctx, ins); + break; + + case nir_intrinsic_load_blend_const_color_rgba: { + assert(ctx->is_blend); + reg = nir_dest_index(ctx, &instr->dest); + + /* Blend constants are embedded directly in the shader and + * patched in, so we use some magic routing */ + + midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, reg); + ins.has_constants = true; + ins.has_blend_constant = true; + emit_mir_instruction(ctx, ins); + break; + } + + case nir_intrinsic_store_output: + assert(nir_src_is_const(instr->src[1]) && "no indirect outputs"); + + offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]); + + reg = nir_src_index(ctx, &instr->src[0]); + + if (ctx->stage == MESA_SHADER_FRAGMENT) { + /* gl_FragColor is not emitted with load/store + * instructions. Instead, it gets plonked into + * r0 at the end of the shader and we do the + * framebuffer writeout dance. TODO: Defer + * writes */ + + midgard_instruction move = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(0)); + emit_mir_instruction(ctx, move); + + /* Save the index we're writing to for later reference + * in the epilogue */ + + ctx->fragment_output = reg; + } else if (ctx->stage == MESA_SHADER_VERTEX) { + /* Varyings are written into one of two special + * varying register, r26 or r27. The register itself is + * selected as the register in the st_vary instruction, + * minus the base of 26. E.g. write into r27 and then + * call st_vary(1) */ + + midgard_instruction ins = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(26)); + emit_mir_instruction(ctx, ins); + + /* We should have been vectorized, though we don't + * currently check that st_vary is emitted only once + * per slot (this is relevant, since there's not a mask + * parameter available on the store [set to 0 by the + * blob]). 
We do respect the component by adjusting the + * swizzle. */ + + unsigned component = nir_intrinsic_component(instr); + + midgard_instruction st = m_st_vary_32(SSA_FIXED_REGISTER(0), offset); + st.load_store.unknown = 0x1E9E; /* XXX: What is this? */ + st.load_store.swizzle = SWIZZLE_XYZW << (2*component); + emit_mir_instruction(ctx, st); + } else { + DBG("Unknown store\n"); + assert(0); + } + + break; + + /* Special case of store_output for lowered blend shaders */ + case nir_intrinsic_store_raw_output_pan: + assert (ctx->stage == MESA_SHADER_FRAGMENT); + reg = nir_src_index(ctx, &instr->src[0]); + + midgard_instruction move = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(0)); + emit_mir_instruction(ctx, move); + ctx->fragment_output = reg; + + break; + + case nir_intrinsic_load_alpha_ref_float: + assert(instr->dest.is_ssa); + + float ref_value = ctx->alpha_ref; + + float *v = ralloc_array(NULL, float, 4); + memcpy(v, &ref_value, sizeof(float)); + _mesa_hash_table_u64_insert(ctx->ssa_constants, instr->dest.ssa.index + 1, v); + break; + + case nir_intrinsic_load_viewport_scale: + case nir_intrinsic_load_viewport_offset: + emit_sysval_read(ctx, &instr->instr); + break; + + default: + printf ("Unhandled intrinsic\n"); + assert(0); + break; + } +} + +static unsigned +midgard_tex_format(enum glsl_sampler_dim dim) +{ + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_BUF: + return MALI_TEX_1D; + + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_EXTERNAL: + return MALI_TEX_2D; + + case GLSL_SAMPLER_DIM_3D: + return MALI_TEX_3D; + + case GLSL_SAMPLER_DIM_CUBE: + return MALI_TEX_CUBE; + + default: + DBG("Unknown sampler dim type\n"); + assert(0); + return 0; + } +} + +/* Tries to attach an explicit LOD / bias as a constant. Returns whether this + * was successful */ + +static bool +pan_attach_constant_bias( + compiler_context *ctx, + nir_src lod, + midgard_texture_word *word) +{ + /* To attach as constant, it has to *be* constant */ + + if (!nir_src_is_const(lod)) + return false; + + float f = nir_src_as_float(lod); + + /* Break into fixed-point */ + signed lod_int = f; + float lod_frac = f - lod_int; + + /* Carry over negative fractions */ + if (lod_frac < 0.0) { + lod_int--; + lod_frac += 1.0; + } + + /* Encode */ + word->bias = float_to_ubyte(lod_frac); + word->bias_int = lod_int; + + return true; +} + +static enum mali_sampler_type +midgard_sampler_type(nir_alu_type t) { + switch (nir_alu_type_get_base_type(t)) + { + case nir_type_float: + return MALI_SAMPLER_FLOAT; + case nir_type_int: + return MALI_SAMPLER_SIGNED; + case nir_type_uint: + return MALI_SAMPLER_UNSIGNED; + default: + unreachable("Unknown sampler type"); + } +} + +static void +emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, + unsigned midgard_texop) +{ + /* TODO */ + //assert (!instr->sampler); + //assert (!instr->texture_array_size); + + /* Allocate registers via a round robin scheme to alternate between the two registers */ + int reg = ctx->texture_op_count & 1; + int in_reg = reg, out_reg = reg; + + /* Make room for the reg */ + + if (ctx->texture_index[reg] > -1) + unalias_ssa(ctx, ctx->texture_index[reg]); + + int texture_index = instr->texture_index; + int sampler_index = texture_index; + + /* No helper to build texture words -- we do it all here */ + midgard_instruction ins = { + .type = TAG_TEXTURE_4, + .mask = 0xF, + .texture = { + .op = midgard_texop, + .format = midgard_tex_format(instr->sampler_dim), + .texture_handle = texture_index, + .sampler_handle = sampler_index, + + /* TODO: 
Regalloc it in */ + .swizzle = SWIZZLE_XYZW, + + /* TODO: half */ + .in_reg_full = 1, + .out_full = 1, + + .sampler_type = midgard_sampler_type(instr->dest_type), + } + }; + + for (unsigned i = 0; i < instr->num_srcs; ++i) { + int reg = SSA_FIXED_REGISTER(REGISTER_TEXTURE_BASE + in_reg); + int index = nir_src_index(ctx, &instr->src[i].src); + int nr_comp = nir_src_num_components(instr->src[i].src); + midgard_vector_alu_src alu_src = blank_alu_src; + + switch (instr->src[i].src_type) { + case nir_tex_src_coord: { + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { + /* texelFetch is undefined on samplerCube */ + assert(midgard_texop != TEXTURE_OP_TEXEL_FETCH); + + /* For cubemaps, we need to load coords into + * special r27, and then use a special ld/st op + * to select the face and copy the xy into the + * texture register */ + + alu_src.swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_X); + + midgard_instruction move = v_mov(index, alu_src, SSA_FIXED_REGISTER(27)); + emit_mir_instruction(ctx, move); + + midgard_instruction st = m_st_cubemap_coords(reg, 0); + st.load_store.unknown = 0x24; /* XXX: What is this? */ + st.mask = 0x3; /* xy */ + st.load_store.swizzle = alu_src.swizzle; + emit_mir_instruction(ctx, st); + + ins.texture.in_reg_swizzle = swizzle_of(2); + } else { + ins.texture.in_reg_swizzle = alu_src.swizzle = swizzle_of(nr_comp); + + midgard_instruction mov = v_mov(index, alu_src, reg); + mov.mask = mask_of(nr_comp); + emit_mir_instruction(ctx, mov); + + if (midgard_texop == TEXTURE_OP_TEXEL_FETCH) { + /* Texel fetch opcodes care about the + * values of z and w, so we actually + * need to spill into a second register + * for a texel fetch with register bias + * (for non-2D). TODO: Implement that + */ + + assert(instr->sampler_dim == GLSL_SAMPLER_DIM_2D); + + midgard_instruction zero = v_mov(index, alu_src, reg); + zero.ssa_args.inline_constant = true; + zero.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + zero.has_constants = true; + zero.mask = ~mov.mask; + emit_mir_instruction(ctx, zero); + + ins.texture.in_reg_swizzle = SWIZZLE_XYZZ; + } else { + /* Non-texel fetch doesn't need that + * nonsense. However we do use the Z + * for array indexing */ + bool is_3d = instr->sampler_dim == GLSL_SAMPLER_DIM_3D; + ins.texture.in_reg_swizzle = is_3d ? SWIZZLE_XYZZ : SWIZZLE_XYXZ; + } + } + + break; + } + + case nir_tex_src_bias: + case nir_tex_src_lod: { + /* Try as a constant if we can */ + + bool is_txf = midgard_texop == TEXTURE_OP_TEXEL_FETCH; + if (!is_txf && pan_attach_constant_bias(ctx, instr->src[i].src, &ins.texture)) + break; + + /* Otherwise we use a register. 
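+ * This is the lod_register path: a register select is packed into the bias
+ * field instead of a constant.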
To keep RA simple, we + * put the bias/LOD into the w component of the input + * source, which is otherwise in xy */ + + alu_src.swizzle = SWIZZLE_XXXX; + + midgard_instruction mov = v_mov(index, alu_src, reg); + mov.mask = 1 << COMPONENT_W; + emit_mir_instruction(ctx, mov); + + ins.texture.lod_register = true; + + midgard_tex_register_select sel = { + .select = in_reg, + .full = 1, + + /* w */ + .component_lo = 1, + .component_hi = 1 + }; + + uint8_t packed; + memcpy(&packed, &sel, sizeof(packed)); + ins.texture.bias = packed; + + break; + }; + + default: + unreachable("Unknown texture source type\n"); + } + } + + /* Set registers to read and write from the same place */ + ins.texture.in_reg_select = in_reg; + ins.texture.out_reg_select = out_reg; + + emit_mir_instruction(ctx, ins); + + int o_reg = REGISTER_TEXTURE_BASE + out_reg, o_index = nir_dest_index(ctx, &instr->dest); + midgard_instruction ins2 = v_mov(SSA_FIXED_REGISTER(o_reg), blank_alu_src, o_index); + emit_mir_instruction(ctx, ins2); + + /* Used for .cont and .last hinting */ + ctx->texture_op_count++; +} + +static void +emit_tex(compiler_context *ctx, nir_tex_instr *instr) +{ + /* Fixup op, since only textureLod is permitted in VS but NIR can give + * generic tex in some cases (which confuses the hardware) */ + + bool is_vertex = ctx->stage == MESA_SHADER_VERTEX; + + if (is_vertex && instr->op == nir_texop_tex) + instr->op = nir_texop_txl; + + switch (instr->op) { + case nir_texop_tex: + case nir_texop_txb: + emit_texop_native(ctx, instr, TEXTURE_OP_NORMAL); + break; + case nir_texop_txl: + emit_texop_native(ctx, instr, TEXTURE_OP_LOD); + break; + case nir_texop_txf: + emit_texop_native(ctx, instr, TEXTURE_OP_TEXEL_FETCH); + break; + case nir_texop_txs: + emit_sysval_read(ctx, &instr->instr); + break; + default: + unreachable("Unhanlded texture op"); + } +} + +static void +emit_jump(compiler_context *ctx, nir_jump_instr *instr) +{ + switch (instr->type) { + case nir_jump_break: { + /* Emit a branch out of the loop */ + struct midgard_instruction br = v_branch(false, false); + br.branch.target_type = TARGET_BREAK; + br.branch.target_break = ctx->current_loop_depth; + emit_mir_instruction(ctx, br); + + DBG("break..\n"); + break; + } + + default: + DBG("Unknown jump type %d\n", instr->type); + break; + } +} + +static void +emit_instr(compiler_context *ctx, struct nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_load_const: + emit_load_const(ctx, nir_instr_as_load_const(instr)); + break; + + case nir_instr_type_intrinsic: + emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); + break; + + case nir_instr_type_alu: + emit_alu(ctx, nir_instr_as_alu(instr)); + break; + + case nir_instr_type_tex: + emit_tex(ctx, nir_instr_as_tex(instr)); + break; + + case nir_instr_type_jump: + emit_jump(ctx, nir_instr_as_jump(instr)); + break; + + case nir_instr_type_ssa_undef: + /* Spurious */ + break; + + default: + DBG("Unhandled instruction type\n"); + break; + } +} + + +/* ALU instructions can inline or embed constants, which decreases register + * pressure and saves space. 
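+ * Inline constants live in a 16-bit immediate field of the ALU word, while
+ * embedded constants occupy the bundle's 128-bit constant words.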
*/ + +#define CONDITIONAL_ATTACH(src) { \ + void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src + 1); \ +\ + if (entry) { \ + attach_constants(ctx, alu, entry, alu->ssa_args.src + 1); \ + alu->ssa_args.src = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \ + } \ +} + +static void +inline_alu_constants(compiler_context *ctx) +{ + mir_foreach_instr(ctx, alu) { + /* Other instructions cannot inline constants */ + if (alu->type != TAG_ALU_4) continue; + + /* If there is already a constant here, we can do nothing */ + if (alu->has_constants) continue; + + /* It makes no sense to inline constants on a branch */ + if (alu->compact_branch || alu->prepacked_branch) continue; + + CONDITIONAL_ATTACH(src0); + + if (!alu->has_constants) { + CONDITIONAL_ATTACH(src1) + } else if (!alu->inline_constant) { + /* Corner case: _two_ vec4 constants, for instance with a + * csel. For this case, we can only use a constant + * register for one, we'll have to emit a move for the + * other. Note, if both arguments are constants, then + * necessarily neither argument depends on the value of + * any particular register. As the destination register + * will be wiped, that means we can spill the constant + * to the destination register. + */ + + void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src1 + 1); + unsigned scratch = alu->ssa_args.dest; + + if (entry) { + midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, scratch); + attach_constants(ctx, &ins, entry, alu->ssa_args.src1 + 1); + + /* Force a break XXX Defer r31 writes */ + ins.unit = UNIT_VLUT; + + /* Set the source */ + alu->ssa_args.src1 = scratch; + + /* Inject us -before- the last instruction which set r31 */ + mir_insert_instruction_before(mir_prev_op(alu), ins); + } + } + } +} + +/* Midgard supports two types of constants, embedded constants (128-bit) and + * inline constants (16-bit). Sometimes, especially with scalar ops, embedded + * constants can be demoted to inline constants, for space savings and + * sometimes a performance boost */ + +static void +embedded_to_inline_constant(compiler_context *ctx) +{ + mir_foreach_instr(ctx, ins) { + if (!ins->has_constants) continue; + + if (ins->ssa_args.inline_constant) continue; + + /* Blend constants must not be inlined by definition */ + if (ins->has_blend_constant) continue; + + /* We can inline 32-bit (sometimes) or 16-bit (usually) */ + bool is_16 = ins->alu.reg_mode == midgard_reg_mode_16; + bool is_32 = ins->alu.reg_mode == midgard_reg_mode_32; + + if (!(is_16 || is_32)) + continue; + + /* src1 cannot be an inline constant due to encoding + * restrictions. 
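+ * Only the second source field can encode the immediate.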
So, if possible we try to flip the arguments + * in that case */ + + int op = ins->alu.op; + + if (ins->ssa_args.src0 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { + switch (op) { + /* These ops require an operational change to flip + * their arguments TODO */ + case midgard_alu_op_flt: + case midgard_alu_op_fle: + case midgard_alu_op_ilt: + case midgard_alu_op_ile: + case midgard_alu_op_fcsel: + case midgard_alu_op_icsel: + DBG("Missed non-commutative flip (%s)\n", alu_opcode_props[op].name); + default: + break; + } + + if (alu_opcode_props[op].props & OP_COMMUTES) { + /* Flip the SSA numbers */ + ins->ssa_args.src0 = ins->ssa_args.src1; + ins->ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + + /* And flip the modifiers */ + + unsigned src_temp; + + src_temp = ins->alu.src2; + ins->alu.src2 = ins->alu.src1; + ins->alu.src1 = src_temp; + } + } + + if (ins->ssa_args.src1 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { + /* Extract the source information */ + + midgard_vector_alu_src *src; + int q = ins->alu.src2; + midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; + src = m; + + /* Component is from the swizzle, e.g. r26.w -> w component. TODO: What if x is masked out? */ + int component = src->swizzle & 3; + + /* Scale constant appropriately, if we can legally */ + uint16_t scaled_constant = 0; + + if (midgard_is_integer_op(op) || is_16) { + unsigned int *iconstants = (unsigned int *) ins->constants; + scaled_constant = (uint16_t) iconstants[component]; + + /* Constant overflow after resize */ + if (scaled_constant != iconstants[component]) + continue; + } else { + float original = (float) ins->constants[component]; + scaled_constant = _mesa_float_to_half(original); + + /* Check for loss of precision. If this is + * mediump, we don't care, but for a highp + * shader, we need to pay attention. NIR + * doesn't yet tell us which mode we're in! + * Practically this prevents most constants + * from being inlined, sadly. */ + + float fp32 = _mesa_half_to_float(scaled_constant); + + if (fp32 != original) + continue; + } + + /* We don't know how to handle these with a constant */ + + if (src->mod || src->half || src->rep_low || src->rep_high) { + DBG("Bailing inline constant...\n"); + continue; + } + + /* Make sure that the constant is not itself a + * vector by checking if all accessed values + * (by the swizzle) are the same. 
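+ * A splatted constant like (0.5, 0.5, 0.5, 0.5) is fine to inline; (1, 2,
+ * 3, 4) read through an identity swizzle is not.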
*/ + + uint32_t *cons = (uint32_t *) ins->constants; + uint32_t value = cons[component]; + + bool is_vector = false; + unsigned mask = effective_writemask(&ins->alu, ins->mask); + + for (int c = 1; c < 4; ++c) { + /* We only care if this component is actually used */ + if (!(mask & (1 << c))) + continue; + + uint32_t test = cons[(src->swizzle >> (2 * c)) & 3]; + + if (test != value) { + is_vector = true; + break; + } + } + + if (is_vector) + continue; + + /* Get rid of the embedded constant */ + ins->has_constants = false; + ins->ssa_args.src1 = SSA_UNUSED_0; + ins->ssa_args.inline_constant = true; + ins->inline_constant = scaled_constant; + } + } +} + +/* Map normal SSA sources to other SSA sources / fixed registers (like + * uniforms) */ + +static void +map_ssa_to_alias(compiler_context *ctx, int *ref) +{ + /* Sign is used quite deliberately for unused */ + if (*ref < 0) + return; + + unsigned int alias = (uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_to_alias, *ref + 1); + + if (alias) { + /* Remove entry in leftovers to avoid a redunant fmov */ + + struct set_entry *leftover = _mesa_set_search(ctx->leftover_ssa_to_alias, ((void *) (uintptr_t) (*ref + 1))); + + if (leftover) + _mesa_set_remove(ctx->leftover_ssa_to_alias, leftover); + + /* Assign the alias map */ + *ref = alias - 1; + return; + } +} + +/* Basic dead code elimination on the MIR itself, which cleans up e.g. the + * texture pipeline */ + +static bool +midgard_opt_dead_code_eliminate(compiler_context *ctx, midgard_block *block) +{ + bool progress = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (ins->compact_branch) continue; + + if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue; + if (mir_is_live_after(ctx, block, ins, ins->ssa_args.dest)) continue; + + mir_remove_instruction(ins); + progress = true; + } + + return progress; +} + +/* Dead code elimination for branches at the end of a block - only one branch + * per block is legal semantically */ + +static void +midgard_opt_cull_dead_branch(compiler_context *ctx, midgard_block *block) +{ + bool branched = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (!midgard_is_branch_unit(ins->unit)) continue; + + /* We ignore prepacked branches since the fragment epilogue is + * just generally special */ + if (ins->prepacked_branch) continue; + + /* Discards are similarly special and may not correspond to the + * end of a block */ + + if (ins->branch.target_type == TARGET_DISCARD) continue; + + if (branched) { + /* We already branched, so this is dead */ + mir_remove_instruction(ins); + } + + branched = true; + } +} + +static bool +mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask) +{ + /* abs or neg */ + if (!is_int && src.mod) return true; + + /* Other int mods don't matter in isolation */ + if (is_int && src.mod == midgard_int_shift) return true; + + /* size-conversion */ + if (src.half) return true; + + /* swizzle */ + for (unsigned c = 0; c < 4; ++c) { + if (!(mask & (1 << c))) continue; + if (((src.swizzle >> (2*c)) & 3) != c) return true; + } + + return false; +} + +static bool +mir_nontrivial_source2_mod(midgard_instruction *ins) +{ + bool is_int = midgard_is_integer_op(ins->alu.op); + + midgard_vector_alu_src src2 = + vector_alu_from_unsigned(ins->alu.src2); + + return mir_nontrivial_mod(src2, is_int, ins->mask); +} + +static bool +mir_nontrivial_outmod(midgard_instruction *ins) +{ + bool is_int = midgard_is_integer_op(ins->alu.op); + unsigned mod = ins->alu.outmod; + + /* Type 
conversion is a sort of outmod */ + if (ins->alu.dest_override != midgard_dest_override_none) + return true; + + if (is_int) + return mod != midgard_outmod_int_wrap; + else + return mod != midgard_outmod_none; +} + +static bool +midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block) +{ + bool progress = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (!OP_IS_MOVE(ins->alu.op)) continue; + + unsigned from = ins->ssa_args.src1; + unsigned to = ins->ssa_args.dest; + + /* We only work on pure SSA */ + + if (to >= SSA_FIXED_MINIMUM) continue; + if (from >= SSA_FIXED_MINIMUM) continue; + if (to >= ctx->func->impl->ssa_alloc) continue; + if (from >= ctx->func->impl->ssa_alloc) continue; + + /* Constant propagation is not handled here, either */ + if (ins->ssa_args.inline_constant) continue; + if (ins->has_constants) continue; + + if (mir_nontrivial_source2_mod(ins)) continue; + if (mir_nontrivial_outmod(ins)) continue; + + /* We're clear -- rewrite */ + mir_rewrite_index_src(ctx, to, from); + mir_remove_instruction(ins); + progress |= true; + } + + return progress; +} + +/* fmov.pos is an idiom for fpos. Propoagate the .pos up to the source, so then + * the move can be propagated away entirely */ + +static bool +mir_compose_float_outmod(midgard_outmod_float *outmod, midgard_outmod_float comp) +{ + /* Nothing to do */ + if (comp == midgard_outmod_none) + return true; + + if (*outmod == midgard_outmod_none) { + *outmod = comp; + return true; + } + + /* TODO: Compose rules */ + return false; +} + +static bool +midgard_opt_pos_propagate(compiler_context *ctx, midgard_block *block) +{ + bool progress = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (ins->alu.op != midgard_alu_op_fmov) continue; + if (ins->alu.outmod != midgard_outmod_pos) continue; + + /* TODO: Registers? */ + unsigned src = ins->ssa_args.src1; + if (src >= ctx->func->impl->ssa_alloc) continue; + assert(!mir_has_multiple_writes(ctx, src)); + + /* There might be a source modifier, too */ + if (mir_nontrivial_source2_mod(ins)) continue; + + /* Backpropagate the modifier */ + mir_foreach_instr_in_block_from_rev(block, v, mir_prev_op(ins)) { + if (v->type != TAG_ALU_4) continue; + if (v->ssa_args.dest != src) continue; + + /* Can we even take a float outmod? */ + if (midgard_is_integer_out_op(v->alu.op)) continue; + + midgard_outmod_float temp = v->alu.outmod; + progress |= mir_compose_float_outmod(&temp, ins->alu.outmod); + + /* Throw in the towel.. */ + if (!progress) break; + + /* Otherwise, transfer the modifier */ + v->alu.outmod = temp; + ins->alu.outmod = midgard_outmod_none; + + break; + } + } + + return progress; +} + +/* The following passes reorder MIR instructions to enable better scheduling */ + +static void +midgard_pair_load_store(compiler_context *ctx, midgard_block *block) +{ + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_LOAD_STORE_4) continue; + + /* We've found a load/store op. Check if next is also load/store. 
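+ * A load/store bundle packs up to two such ops, so pairing them up front
+ * helps the scheduler fill both slots.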
*/ + midgard_instruction *next_op = mir_next_op(ins); + if (&next_op->link != &block->instructions) { + if (next_op->type == TAG_LOAD_STORE_4) { + /* If so, we're done since we're a pair */ + ins = mir_next_op(ins); + continue; + } + + /* Maximum search distance to pair, to avoid register pressure disasters */ + int search_distance = 8; + + /* Otherwise, we have an orphaned load/store -- search for another load */ + mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) { + /* Terminate search if necessary */ + if (!(search_distance--)) break; + + if (c->type != TAG_LOAD_STORE_4) continue; + + /* Stores cannot be reordered, since they have + * dependencies. For the same reason, indirect + * loads cannot be reordered as their index is + * loaded in r27.w */ + + if (OP_IS_STORE(c->load_store.op)) continue; + + /* It appears the 0x800 bit is set whenever a + * load is direct, unset when it is indirect. + * Skip indirect loads. */ + + if (!(c->load_store.unknown & 0x800)) continue; + + /* We found one! Move it up to pair and remove it from the old location */ + + mir_insert_instruction_before(ins, *c); + mir_remove_instruction(c); + + break; + } + } + } +} + +/* If there are leftovers after the below pass, emit actual fmov + * instructions for the slow-but-correct path */ + +static void +emit_leftover_move(compiler_context *ctx) +{ + set_foreach(ctx->leftover_ssa_to_alias, leftover) { + int base = ((uintptr_t) leftover->key) - 1; + int mapped = base; + + map_ssa_to_alias(ctx, &mapped); + EMIT(mov, mapped, blank_alu_src, base); + } +} + +static void +actualise_ssa_to_alias(compiler_context *ctx) +{ + mir_foreach_instr(ctx, ins) { + map_ssa_to_alias(ctx, &ins->ssa_args.src0); + map_ssa_to_alias(ctx, &ins->ssa_args.src1); + } + + emit_leftover_move(ctx); +} + +static void +emit_fragment_epilogue(compiler_context *ctx) +{ + /* Special case: writing out constants requires us to include the move + * explicitly now, so shove it into r0 */ + + void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, ctx->fragment_output + 1); + + if (constant_value) { + midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, SSA_FIXED_REGISTER(0)); + attach_constants(ctx, &ins, constant_value, ctx->fragment_output + 1); + emit_mir_instruction(ctx, ins); + } + + /* Perform the actual fragment writeout. We have two writeout/branch + * instructions, forming a loop until writeout is successful as per the + * docs. 
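+ * The second branch targets the previous bundle (offset -1), so the
+ * writeout can be retried if it did not take effect.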
TODO: gl_FragDepth */ + + EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, 0, midgard_condition_always); + EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, -1, midgard_condition_always); +} + +static midgard_block * +emit_block(compiler_context *ctx, nir_block *block) +{ + midgard_block *this_block = calloc(sizeof(midgard_block), 1); + list_addtail(&this_block->link, &ctx->blocks); + + this_block->is_scheduled = false; + ++ctx->block_count; + + ctx->texture_index[0] = -1; + ctx->texture_index[1] = -1; + + /* Add us as a successor to the block we are following */ + if (ctx->current_block) + midgard_block_add_successor(ctx->current_block, this_block); + + /* Set up current block */ + list_inithead(&this_block->instructions); + ctx->current_block = this_block; + + nir_foreach_instr(instr, block) { + emit_instr(ctx, instr); + ++ctx->instruction_count; + } + + inline_alu_constants(ctx); + embedded_to_inline_constant(ctx); + + /* Perform heavylifting for aliasing */ + actualise_ssa_to_alias(ctx); + + midgard_pair_load_store(ctx, this_block); + + /* Append fragment shader epilogue (value writeout) */ + if (ctx->stage == MESA_SHADER_FRAGMENT) { + if (block == nir_impl_last_block(ctx->func->impl)) { + emit_fragment_epilogue(ctx); + } + } + + if (block == nir_start_block(ctx->func->impl)) + ctx->initial_block = this_block; + + if (block == nir_impl_last_block(ctx->func->impl)) + ctx->final_block = this_block; + + /* Allow the next control flow to access us retroactively, for + * branching etc */ + ctx->current_block = this_block; + + /* Document the fallthrough chain */ + ctx->previous_source_block = this_block; + + return this_block; +} + +static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list); + +static void +emit_if(struct compiler_context *ctx, nir_if *nif) +{ + /* Conditional branches expect the condition in r31.w; emit a move for + * that in the _previous_ block (which is the current block). 
*/ + emit_condition(ctx, &nif->condition, true, COMPONENT_X); + + /* Speculatively emit the branch, but we can't fill it in until later */ + EMIT(branch, true, true); + midgard_instruction *then_branch = mir_last_in_block(ctx->current_block); + + /* Emit the two subblocks */ + midgard_block *then_block = emit_cf_list(ctx, &nif->then_list); + + /* Emit a jump from the end of the then block to the end of the else */ + EMIT(branch, false, false); + midgard_instruction *then_exit = mir_last_in_block(ctx->current_block); + + /* Emit second block, and check if it's empty */ + + int else_idx = ctx->block_count; + int count_in = ctx->instruction_count; + midgard_block *else_block = emit_cf_list(ctx, &nif->else_list); + int after_else_idx = ctx->block_count; + + /* Now that we have the subblocks emitted, fix up the branches */ + + assert(then_block); + assert(else_block); + + if (ctx->instruction_count == count_in) { + /* The else block is empty, so don't emit an exit jump */ + mir_remove_instruction(then_exit); + then_branch->branch.target_block = after_else_idx; + } else { + then_branch->branch.target_block = else_idx; + then_exit->branch.target_block = after_else_idx; + } +} + +static void +emit_loop(struct compiler_context *ctx, nir_loop *nloop) +{ + /* Remember where we are */ + midgard_block *start_block = ctx->current_block; + + /* Allocate a loop number, growing the current inner loop depth */ + int loop_idx = ++ctx->current_loop_depth; + + /* Get index from before the body so we can loop back later */ + int start_idx = ctx->block_count; + + /* Emit the body itself */ + emit_cf_list(ctx, &nloop->body); + + /* Branch back to loop back */ + struct midgard_instruction br_back = v_branch(false, false); + br_back.branch.target_block = start_idx; + emit_mir_instruction(ctx, br_back); + + /* Mark down that branch in the graph. Note that we're really branching + * to the block *after* we started in. TODO: Why doesn't the branch + * itself have an off-by-one then...? */ + midgard_block_add_successor(ctx->current_block, start_block->successors[0]); + + /* Find the index of the block about to follow us (note: we don't add + * one; blocks are 0-indexed so we get a fencepost problem) */ + int break_block_idx = ctx->block_count; + + /* Fix up the break statements we emitted to point to the right place, + * now that we can allocate a block number for them */ + + list_for_each_entry_from(struct midgard_block, block, start_block, &ctx->blocks, link) { + mir_foreach_instr_in_block(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (!ins->compact_branch) continue; + if (ins->prepacked_branch) continue; + + /* We found a branch -- check the type to see if we need to do anything */ + if (ins->branch.target_type != TARGET_BREAK) continue; + + /* It's a break! Check if it's our break */ + if (ins->branch.target_break != loop_idx) continue; + + /* Okay, cool, we're breaking out of this loop. 
+ * Rewrite from a break to a goto */ + + ins->branch.target_type = TARGET_GOTO; + ins->branch.target_block = break_block_idx; + } + } + + /* Now that we've finished emitting the loop, free up the depth again + * so we play nice with recursion amid nested loops */ + --ctx->current_loop_depth; + + /* Dump loop stats */ + ++ctx->loop_count; +} + +static midgard_block * +emit_cf_list(struct compiler_context *ctx, struct exec_list *list) +{ + midgard_block *start_block = NULL; + + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: { + midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node)); + + if (!start_block) + start_block = block; + + break; + } + + case nir_cf_node_if: + emit_if(ctx, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + emit_loop(ctx, nir_cf_node_as_loop(node)); + break; + + case nir_cf_node_function: + assert(0); + break; + } + } + + return start_block; +} + +/* Due to lookahead, we need to report the first tag executed in the command + * stream and in branch targets. An initial block might be empty, so iterate + * until we find one that 'works' */ + +static unsigned +midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx) +{ + midgard_block *initial_block = mir_get_block(ctx, block_idx); + + unsigned first_tag = 0; + + do { + midgard_bundle *initial_bundle = util_dynarray_element(&initial_block->bundles, midgard_bundle, 0); + + if (initial_bundle) { + first_tag = initial_bundle->tag; + break; + } + + /* Initial block is empty, try the next block */ + initial_block = list_first_entry(&(initial_block->link), midgard_block, link); + } while(initial_block != NULL); + + assert(first_tag); + return first_tag; +} + +int +midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend) +{ + struct util_dynarray *compiled = &program->compiled; + + midgard_debug = debug_get_option_midgard_debug(); + + compiler_context ictx = { + .nir = nir, + .stage = nir->info.stage, + + .is_blend = is_blend, + .blend_constant_offset = 0, + + .alpha_ref = program->alpha_ref + }; + + compiler_context *ctx = &ictx; + + /* TODO: Decide this at runtime */ + ctx->uniform_cutoff = 8; + + /* Initialize at a global (not block) level hash tables */ + + ctx->ssa_constants = _mesa_hash_table_u64_create(NULL); + ctx->ssa_to_alias = _mesa_hash_table_u64_create(NULL); + ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL); + ctx->sysval_to_id = _mesa_hash_table_u64_create(NULL); + ctx->leftover_ssa_to_alias = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); + + /* Record the varying mapping for the command stream's bookkeeping */ + + struct exec_list *varyings = + ctx->stage == MESA_SHADER_VERTEX ? 
&nir->outputs : &nir->inputs; + + unsigned max_varying = 0; + nir_foreach_variable(var, varyings) { + unsigned loc = var->data.driver_location; + unsigned sz = glsl_type_size(var->type, FALSE); + + for (int c = 0; c < sz; ++c) { + program->varyings[loc + c] = var->data.location + c; + max_varying = MAX2(max_varying, loc + c); + } + } + + /* Lower gl_Position pre-optimisation, but after lowering vars to ssa + * (so we don't accidentally duplicate the epilogue since mesa/st has + * messed with our I/O quite a bit already) */ + + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + + if (ctx->stage == MESA_SHADER_VERTEX) + NIR_PASS_V(nir, nir_lower_viewport_transform); + + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_global_vars_to_local); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + + NIR_PASS_V(nir, nir_lower_io, nir_var_all, glsl_type_size, 0); + + /* Optimisation passes */ + + optimise_nir(nir); + + if (midgard_debug & MIDGARD_DBG_SHADERS) { + nir_print_shader(nir, stdout); + } + + /* Assign sysvals and counts, now that we're sure + * (post-optimisation) */ + + midgard_nir_assign_sysvals(ctx, nir); + + program->uniform_count = nir->num_uniforms; + program->sysval_count = ctx->sysval_count; + memcpy(program->sysvals, ctx->sysvals, sizeof(ctx->sysvals[0]) * ctx->sysval_count); + + program->attribute_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_inputs : 0; + program->varying_count = max_varying + 1; /* Fencepost off-by-one */ + + nir_foreach_function(func, nir) { + if (!func->impl) + continue; + + list_inithead(&ctx->blocks); + ctx->block_count = 0; + ctx->func = func; + + emit_cf_list(ctx, &func->impl->body); + emit_block(ctx, func->impl->end_block); + + break; /* TODO: Multi-function shaders */ + } + + util_dynarray_init(compiled, NULL); + + /* MIR-level optimizations */ + + bool progress = false; + + do { + progress = false; + + mir_foreach_block(ctx, block) { + progress |= midgard_opt_pos_propagate(ctx, block); + progress |= midgard_opt_copy_prop(ctx, block); + progress |= midgard_opt_dead_code_eliminate(ctx, block); + } + } while (progress); + + /* Nested control-flow can result in dead branches at the end of the + * block. This messes with our analysis and is just dead code, so cull + * them */ + mir_foreach_block(ctx, block) { + midgard_opt_cull_dead_branch(ctx, block); + } + + /* Schedule! */ + schedule_program(ctx); + + /* Now that all the bundles are scheduled and we can calculate block + * sizes, emit actual branch instructions rather than placeholders */ + + int br_block_idx = 0; + + mir_foreach_block(ctx, block) { + util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) { + for (int c = 0; c < bundle->instruction_count; ++c) { + midgard_instruction *ins = bundle->instructions[c]; + + if (!midgard_is_branch_unit(ins->unit)) continue; + + if (ins->prepacked_branch) continue; + + /* Parse some basic branch info */ + bool is_compact = ins->unit == ALU_ENAB_BR_COMPACT; + bool is_conditional = ins->branch.conditional; + bool is_inverted = ins->branch.invert_conditional; + bool is_discard = ins->branch.target_type == TARGET_DISCARD; + + /* Determine the block we're jumping to */ + int target_number = ins->branch.target_block; + + /* Report the destination tag */ + int dest_tag = is_discard ? 
0 : midgard_get_first_tag_from_block(ctx, target_number); + + /* Count up the number of quadwords we're + * jumping over = number of quadwords until + * (br_block_idx, target_number) */ + + int quadword_offset = 0; + + if (is_discard) { + /* Jump to the end of the shader. We + * need to include not only the + * following blocks, but also the + * contents of our current block (since + * discard can come in the middle of + * the block) */ + + midgard_block *blk = mir_get_block(ctx, br_block_idx + 1); + + for (midgard_bundle *bun = bundle + 1; bun < (midgard_bundle *)((char*) block->bundles.data + block->bundles.size); ++bun) { + quadword_offset += quadword_size(bun->tag); + } + + mir_foreach_block_from(ctx, blk, b) { + quadword_offset += b->quadword_count; + } + + } else if (target_number > br_block_idx) { + /* Jump forward */ + + for (int idx = br_block_idx + 1; idx < target_number; ++idx) { + midgard_block *blk = mir_get_block(ctx, idx); + assert(blk); + + quadword_offset += blk->quadword_count; + } + } else { + /* Jump backwards */ + + for (int idx = br_block_idx; idx >= target_number; --idx) { + midgard_block *blk = mir_get_block(ctx, idx); + assert(blk); + + quadword_offset -= blk->quadword_count; + } + } + + /* Unconditional extended branches (far jumps) + * have issues, so we always use a conditional + * branch, setting the condition to always for + * unconditional. For compact unconditional + * branches, cond isn't used so it doesn't + * matter what we pick. */ + + midgard_condition cond = + !is_conditional ? midgard_condition_always : + is_inverted ? midgard_condition_false : + midgard_condition_true; + + midgard_jmp_writeout_op op = + is_discard ? midgard_jmp_writeout_op_discard : + (is_compact && !is_conditional) ? midgard_jmp_writeout_op_branch_uncond : + midgard_jmp_writeout_op_branch_cond; + + if (!is_compact) { + midgard_branch_extended branch = + midgard_create_branch_extended( + cond, op, + dest_tag, + quadword_offset); + + memcpy(&ins->branch_extended, &branch, sizeof(branch)); + } else if (is_conditional || is_discard) { + midgard_branch_cond branch = { + .op = op, + .dest_tag = dest_tag, + .offset = quadword_offset, + .cond = cond + }; + + assert(branch.offset == quadword_offset); + + memcpy(&ins->br_compact, &branch, sizeof(branch)); + } else { + assert(op == midgard_jmp_writeout_op_branch_uncond); + + midgard_branch_uncond branch = { + .op = op, + .dest_tag = dest_tag, + .offset = quadword_offset, + .unknown = 1 + }; + + assert(branch.offset == quadword_offset); + + memcpy(&ins->br_compact, &branch, sizeof(branch)); + } + } + } + + ++br_block_idx; + } + + /* Emit flat binary from the instruction arrays. Iterate each block in + * sequence. Save instruction boundaries such that lookahead tags can + * be assigned easily */ + + /* Cache _all_ bundles in source order for lookahead across failed branches */ + + int bundle_count = 0; + mir_foreach_block(ctx, block) { + bundle_count += block->bundles.size / sizeof(midgard_bundle); + } + midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count); + int bundle_idx = 0; + mir_foreach_block(ctx, block) { + util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) { + source_order_bundles[bundle_idx++] = bundle; + } + } + + int current_bundle = 0; + + /* Midgard prefetches instruction types, so during emission we + * need to lookahead. Unless this is the last instruction, in + * which we return 1. Or if this is the second to last and the + * last is an ALU, then it's also 1... 
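+ * The lookahead value is just the next bundle's tag, folded into the
+ * control word so the hardware knows what to prefetch.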
*/ + + mir_foreach_block(ctx, block) { + mir_foreach_bundle_in_block(block, bundle) { + int lookahead = 1; + + if (current_bundle + 1 < bundle_count) { + uint8_t next = source_order_bundles[current_bundle + 1]->tag; + + if (!(current_bundle + 2 < bundle_count) && IS_ALU(next)) { + lookahead = 1; + } else { + lookahead = next; + } + } + + emit_binary_bundle(ctx, bundle, compiled, lookahead); + ++current_bundle; + } + + /* TODO: Free deeper */ + //util_dynarray_fini(&block->instructions); + } + + free(source_order_bundles); + + /* Report the very first tag executed */ + program->first_tag = midgard_get_first_tag_from_block(ctx, 0); + + /* Deal with off-by-one related to the fencepost problem */ + program->work_register_count = ctx->work_registers + 1; + + program->can_discard = ctx->can_discard; + program->uniform_cutoff = ctx->uniform_cutoff; + + program->blend_patch_offset = ctx->blend_constant_offset; + + if (midgard_debug & MIDGARD_DBG_SHADERS) + disassemble_midgard(program->compiled.data, program->compiled.size); + + if (midgard_debug & MIDGARD_DBG_SHADERDB) { + unsigned nr_bundles = 0, nr_ins = 0, nr_quadwords = 0; + + /* Count instructions and bundles */ + + mir_foreach_instr_global(ctx, ins) { + nr_ins++; + } + + mir_foreach_block(ctx, block) { + nr_bundles += util_dynarray_num_elements( + &block->bundles, midgard_bundle); + + nr_quadwords += block->quadword_count; + } + + /* Calculate thread count. There are certain cutoffs by + * register count for thread count */ + + unsigned nr_registers = program->work_register_count; + + unsigned nr_threads = + (nr_registers <= 4) ? 4 : + (nr_registers <= 8) ? 2 : + 1; + + /* Dump stats */ + + fprintf(stderr, "shader%d - %s shader: " + "%u inst, %u bundles, %u quadwords, " + "%u registers, %u threads, %u loops\n", + SHADER_DB_COUNT++, + gl_shader_stage_name(ctx->stage), + nr_ins, nr_bundles, nr_quadwords, + nr_registers, nr_threads, + ctx->loop_count); + } + + + return 0; +} diff --git a/src/panfrost/midgard/midgard_compile.h b/src/panfrost/midgard/midgard_compile.h new file mode 100644 index 00000000000..147494b8e8a --- /dev/null +++ b/src/panfrost/midgard/midgard_compile.h @@ -0,0 +1,127 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __MIDGARD_H_ +#define __MIDGARD_H_ + +#include "compiler/nir/nir.h" +#include "util/u_dynarray.h" + +/* Define the general compiler entry point */ + +#define MAX_SYSVAL_COUNT 32 + +/* Allow 2D of sysval IDs, while allowing nonparametric sysvals to equal + * their class for equal comparison */ + +#define PAN_SYSVAL(type, no) (((no) << 16) | PAN_SYSVAL_##type) +#define PAN_SYSVAL_TYPE(sysval) ((sysval) & 0xffff) +#define PAN_SYSVAL_ID(sysval) ((sysval) >> 16) + +/* Define some common types. We start at one for easy indexing of hash + * tables internal to the compiler */ + +enum { + PAN_SYSVAL_VIEWPORT_SCALE = 1, + PAN_SYSVAL_VIEWPORT_OFFSET = 2, + PAN_SYSVAL_TEXTURE_SIZE = 3, +} pan_sysval; + +#define PAN_TXS_SYSVAL_ID(texidx, dim, is_array) \ + ((texidx) | ((dim) << 7) | ((is_array) ? (1 << 9) : 0)) + +#define PAN_SYSVAL_ID_TO_TXS_TEX_IDX(id) ((id) & 0x7f) +#define PAN_SYSVAL_ID_TO_TXS_DIM(id) (((id) >> 7) & 0x3) +#define PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id) !!((id) & (1 << 9)) + +typedef struct { + int work_register_count; + int uniform_count; + int uniform_cutoff; + + int attribute_count; + int varying_count; + + /* Prepended before uniforms, mapping to SYSVAL_ names for the + * sysval */ + + unsigned sysval_count; + unsigned sysvals[MAX_SYSVAL_COUNT]; + + unsigned varyings[32]; + + /* Boolean properties of the program */ + bool can_discard; + bool writes_point_size; + + int first_tag; + + struct util_dynarray compiled; + + /* For a blend shader using a constant color -- patch point. If + * negative, there's no constant. */ + + int blend_patch_offset; + + /* IN: For a fragment shader with a lowered alpha test, the ref value */ + float alpha_ref; +} midgard_program; + +int +midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend); + +/* NIR options are shared between the standalone compiler and the online + * compiler. Defining it here is the simplest, though maybe not the Right + * solution. 
*/ + +static const nir_shader_compiler_options midgard_nir_options = { + .lower_ffma = true, + .lower_sub = true, + .lower_scmp = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_ffract = true, + .lower_fmod = true, + .lower_fdiv = true, + .lower_idiv = true, + .lower_isign = true, + .lower_fpow = true, + .lower_find_lsb = true, + + .lower_wpos_pntc = true, + + /* TODO: We have native ops to help here, which we'll want to look into + * eventually */ + .lower_fsign = true, + + .vertex_id_zero_based = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_rotate = true, + + .lower_doubles_options = nir_lower_dmod, + + .vectorize_io = true, +}; + +#endif diff --git a/src/panfrost/midgard/midgard_emit.c b/src/panfrost/midgard/midgard_emit.c new file mode 100644 index 00000000000..3522e77d5b1 --- /dev/null +++ b/src/panfrost/midgard/midgard_emit.c @@ -0,0 +1,273 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler.h" +#include "midgard_ops.h" + +/* Midgard IR only knows vector ALU types, but we sometimes need to actually + * use scalar ALU instructions, for functional or performance reasons. To do + * this, we just demote vector ALU payloads to scalar. */ + +static int +component_from_mask(unsigned mask) +{ + for (int c = 0; c < 8; ++c) { + if (mask & (1 << c)) + return c; + } + + assert(0); + return 0; +} + +static unsigned +vector_to_scalar_source(unsigned u, bool is_int, bool is_full) +{ + midgard_vector_alu_src v; + memcpy(&v, &u, sizeof(v)); + + /* TODO: Integers */ + + unsigned component = v.swizzle & 3; + bool upper = false; /* TODO */ + + midgard_scalar_alu_src s = { 0 }; + + if (is_full) { + /* For a 32-bit op, just check the source half flag */ + s.full = !v.half; + } else if (!v.half) { + /* For a 16-bit op that's not subdivided, never full */ + s.full = false; + } else { + /* We can't do 8-bit scalar, abort! 
*/ + assert(0); + } + + /* Component indexing takes size into account */ + + if (s.full) + s.component = component << 1; + else + s.component = component + (upper << 2); + + if (is_int) { + /* TODO */ + } else { + s.abs = v.mod & MIDGARD_FLOAT_MOD_ABS; + s.negate = v.mod & MIDGARD_FLOAT_MOD_NEG; + } + + unsigned o; + memcpy(&o, &s, sizeof(s)); + + return o & ((1 << 6) - 1); +} + +static midgard_scalar_alu +vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins) +{ + bool is_int = midgard_is_integer_op(v.op); + bool is_full = v.reg_mode == midgard_reg_mode_32; + bool is_inline_constant = ins->ssa_args.inline_constant; + + /* The output component is from the mask */ + midgard_scalar_alu s = { + .op = v.op, + .src1 = vector_to_scalar_source(v.src1, is_int, is_full), + .src2 = !is_inline_constant ? vector_to_scalar_source(v.src2, is_int, is_full) : 0, + .unknown = 0, + .outmod = v.outmod, + .output_full = is_full, + .output_component = component_from_mask(ins->mask), + }; + + /* Full components are physically spaced out */ + if (is_full) { + assert(s.output_component < 4); + s.output_component <<= 1; + } + + /* Inline constant is passed along rather than trying to extract it + * from v */ + + if (ins->ssa_args.inline_constant) { + uint16_t imm = 0; + int lower_11 = ins->inline_constant & ((1 << 12) - 1); + imm |= (lower_11 >> 9) & 3; + imm |= (lower_11 >> 6) & 4; + imm |= (lower_11 >> 2) & 0x38; + imm |= (lower_11 & 63) << 6; + + s.src2 = imm; + } + + return s; +} + +static void +emit_alu_bundle(compiler_context *ctx, + midgard_bundle *bundle, + struct util_dynarray *emission, + unsigned lookahead) +{ + /* Emit the control word */ + util_dynarray_append(emission, uint32_t, bundle->control | lookahead); + + /* Next up, emit register words */ + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + midgard_instruction *ins = bundle->instructions[i]; + + /* Check if this instruction has registers */ + if (ins->compact_branch || ins->prepacked_branch) continue; + + /* Otherwise, just emit the registers */ + uint16_t reg_word = 0; + memcpy(®_word, &ins->registers, sizeof(uint16_t)); + util_dynarray_append(emission, uint16_t, reg_word); + } + + /* Now, we emit the body itself */ + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + midgard_instruction *ins = bundle->instructions[i]; + + /* Where is this body */ + unsigned size = 0; + void *source = NULL; + + /* In case we demote to a scalar */ + midgard_scalar_alu scalarized; + + if (ins->unit & UNITS_ANY_VECTOR) { + if (ins->alu.reg_mode == midgard_reg_mode_32) + ins->alu.mask = expand_writemask_32(ins->mask); + else + ins->alu.mask = ins->mask; + + size = sizeof(midgard_vector_alu); + source = &ins->alu; + } else if (ins->unit == ALU_ENAB_BR_COMPACT) { + size = sizeof(midgard_branch_cond); + source = &ins->br_compact; + } else if (ins->compact_branch) { /* misnomer */ + size = sizeof(midgard_branch_extended); + source = &ins->branch_extended; + } else { + size = sizeof(midgard_scalar_alu); + scalarized = vector_to_scalar_alu(ins->alu, ins); + source = &scalarized; + } + + memcpy(util_dynarray_grow_bytes(emission, 1, size), source, size); + } + + /* Emit padding (all zero) */ + memset(util_dynarray_grow_bytes(emission, 1, bundle->padding), 0, bundle->padding); + + /* Tack on constants */ + + if (bundle->has_embedded_constants) { + util_dynarray_append(emission, float, bundle->constants[0]); + util_dynarray_append(emission, float, bundle->constants[1]); + util_dynarray_append(emission, float, bundle->constants[2]); + 
util_dynarray_append(emission, float, bundle->constants[3]); + } +} + +/* After everything is scheduled, emit whole bundles at a time */ + +void +emit_binary_bundle(compiler_context *ctx, + midgard_bundle *bundle, + struct util_dynarray *emission, + int next_tag) +{ + int lookahead = next_tag << 4; + + switch (bundle->tag) { + case TAG_ALU_4: + case TAG_ALU_8: + case TAG_ALU_12: + case TAG_ALU_16: + emit_alu_bundle(ctx, bundle, emission, lookahead); + break; + + case TAG_LOAD_STORE_4: { + /* One or two composing instructions */ + + uint64_t current64, next64 = LDST_NOP; + + /* Copy masks */ + + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + bundle->instructions[i]->load_store.mask = + bundle->instructions[i]->mask; + } + + memcpy(¤t64, &bundle->instructions[0]->load_store, sizeof(current64)); + + if (bundle->instruction_count == 2) + memcpy(&next64, &bundle->instructions[1]->load_store, sizeof(next64)); + + midgard_load_store instruction = { + .type = bundle->tag, + .next_type = next_tag, + .word1 = current64, + .word2 = next64 + }; + + util_dynarray_append(emission, midgard_load_store, instruction); + + break; + } + + case TAG_TEXTURE_4: + case TAG_TEXTURE_4_VTX: { + /* Texture instructions are easy, since there is no pipelining + * nor VLIW to worry about. We may need to set .cont/.last + * flags. */ + + midgard_instruction *ins = bundle->instructions[0]; + + ins->texture.type = bundle->tag; + ins->texture.next_type = next_tag; + ins->texture.mask = ins->mask; + + ctx->texture_op_count--; + + if (ins->texture.op == TEXTURE_OP_NORMAL) { + bool continues = ctx->texture_op_count > 0; + ins->texture.cont = continues; + ins->texture.last = !continues; + } else { + ins->texture.cont = ins->texture.last = 1; + } + + util_dynarray_append(emission, midgard_texture_word, ins->texture); + break; + } + + default: + unreachable("Unknown midgard instruction type\n"); + } +} diff --git a/src/panfrost/midgard/midgard_liveness.c b/src/panfrost/midgard/midgard_liveness.c new file mode 100644 index 00000000000..a18d8b9f8ad --- /dev/null +++ b/src/panfrost/midgard/midgard_liveness.c @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* mir_is_live_after performs liveness analysis on the MIR, used primarily + * as part of register allocation. 
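
One detail of emit_binary_bundle above that is easy to miss: the lookahead value lands in bits 4-7 of an ALU bundle's first control word, right next to the bundle's own tag in bits 0-3, which is how the hardware knows what kind of bundle to fetch next. The two accessors below are hypothetical (not in the tree) and assume, as the scheduler's bookkeeping implies, that the unit-enable bits sit above the low byte.

#include <stdint.h>

static inline unsigned
alu_bundle_tag(uint32_t control)
{
        return control & 0xF;          /* this bundle's tag */
}

static inline unsigned
alu_bundle_next_tag(uint32_t control)
{
        return (control >> 4) & 0xF;   /* tag of the following bundle */
}
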
TODO: Algorithmic improvements for + * compiler performance (this is the worst algorithm possible -- see + * backlog with Connor on IRC) */ + +#include "compiler.h" + +static bool +midgard_is_live_in_instr(midgard_instruction *ins, int src) +{ + if (ins->compact_branch) + return false; + + if (ins->ssa_args.src0 == src) + return true; + + if (!ins->ssa_args.inline_constant && ins->ssa_args.src1 == src) + return true; + + return false; +} + +/* Determine if a variable is live in the successors of a block */ +static bool +is_live_after_successors(compiler_context *ctx, midgard_block *bl, int src) +{ + for (unsigned i = 0; i < bl->nr_successors; ++i) { + midgard_block *succ = bl->successors[i]; + + /* If we already visited, the value we're seeking + * isn't down this path (or we would have short + * circuited */ + + if (succ->visited) continue; + + /* Otherwise (it's visited *now*), check the block */ + + succ->visited = true; + + mir_foreach_instr_in_block(succ, ins) { + if (midgard_is_live_in_instr(ins, src)) + return true; + } + + /* ...and also, check *its* successors */ + if (is_live_after_successors(ctx, succ, src)) + return true; + + } + + /* Welp. We're really not live. */ + + return false; +} + +bool +mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src) +{ + /* Check the rest of the block for liveness */ + + mir_foreach_instr_in_block_from(block, ins, mir_next_op(start)) { + if (midgard_is_live_in_instr(ins, src)) + return true; + } + + /* Check the rest of the blocks for liveness recursively */ + + bool succ = is_live_after_successors(ctx, block, src); + + mir_foreach_block(ctx, block) { + block->visited = false; + } + + return succ; +} + +/* Just a quick check -- is it written more than once? (I.e. are we definitely + * not SSA?) */ + +bool +mir_has_multiple_writes(compiler_context *ctx, int dest) +{ + unsigned write_count = 0; + + mir_foreach_instr_global(ctx, ins) { + if (ins->ssa_args.dest == dest) + write_count++; + } + + return write_count > 1; +} diff --git a/src/panfrost/midgard/midgard_nir.h b/src/panfrost/midgard/midgard_nir.h new file mode 100644 index 00000000000..85eadd34631 --- /dev/null +++ b/src/panfrost/midgard/midgard_nir.h @@ -0,0 +1,5 @@ +#include <stdbool.h> +#include "nir.h" + +bool midgard_nir_lower_algebraic_late(nir_shader *shader); +bool midgard_nir_scale_trig(nir_shader *shader); diff --git a/src/panfrost/midgard/midgard_nir_algebraic.py b/src/panfrost/midgard/midgard_nir_algebraic.py new file mode 100644 index 00000000000..faf83364c3a --- /dev/null +++ b/src/panfrost/midgard/midgard_nir_algebraic.py @@ -0,0 +1,96 @@ +# +# Copyright (C) 2018 Alyssa Rosenzweig +# +# Copyright (C) 2016 Intel Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. 
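
The two queries above (mir_is_live_after and mir_has_multiple_writes) are typically used together by later passes -- the pipeline-register pass further down in this commit does exactly this -- to decide whether a value can be renamed without building an interference graph. A condensed sketch of that calling pattern; can_rename_locally, last_ins and candidate_dest are hypothetical names.

/* Sketch only: returns true if candidate_dest is written once and is
 * dead once last_ins has executed, i.e. it is safe to rewrite locally. */
static bool
can_rename_locally(compiler_context *ctx, midgard_block *block,
                   midgard_instruction *last_ins, int candidate_dest)
{
        /* Multiple writers break the pseudo-SSA assumption */
        if (mir_has_multiple_writes(ctx, candidate_dest))
                return false;

        /* Still needed later? Then leave it for the register allocator */
        if (mir_is_live_after(ctx, block, last_ins, candidate_dest))
                return false;

        return true;
}
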
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +import argparse +import sys +import math + +a = 'a' +b = 'b' +c = 'c' + +algebraic_late = [ + # ineg must be lowered late, but only for integers; floats will try to + # have modifiers attached... hence why this has to be here rather than + # a more standard lower_negate approach + + (('ineg', a), ('isub', 0, a)), + + # These two special-cases save space/an op than the actual csel op + + # scheduler flexibility + + (('b32csel', a, 'b@32', 0), ('iand', a, b)), + (('b32csel', a, 0, 'b@32'), ('iand', ('inot', a), b)), +] + + +# Midgard is able to type convert down by only one "step" per instruction; if +# NIR wants more than one step, we need to break up into multiple instructions + +converts = [ + (('i2i8', 'a@32'), ('i2i8', ('i2i16', a))), + (('u2u8', 'a@32'), ('u2u8', ('u2u16', a))), + + (('i2i32', 'a@8'), ('i2i32', ('i2i16', a))), + (('u2u32', 'a@8'), ('u2u32', ('u2u16', a))), + + (('f2i32', 'a@16'), ('f2i32', ('f2f32', a))), + (('f2u32', 'a@16'), ('f2u32', ('f2f32', a))), + + # Totally redundant + (('~f2f16', ('f2f32', 'a@16')), a), + + (('pack_half_2x16_split', 'a@32', 'b@32'), ('ior', ('ishl', ('i2i32', ('f2f16', b)), 16), ('i2i32', ('f2f16', a)))), +] + +# Midgard scales fsin/fcos arguments by pi. +# Pass must be run only once, after the main loop + +scale_trig = [ + (('fsin', a), ('fsin', ('fdiv', a, math.pi))), + (('fcos', a), ('fcos', ('fdiv', a, math.pi))), +] + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--import-path', required=True) + args = parser.parse_args() + sys.path.insert(0, args.import_path) + run() + + +def run(): + import nir_algebraic # pylint: disable=import-error + + print('#include "midgard_nir.h"') + + print(nir_algebraic.AlgebraicPass("midgard_nir_lower_algebraic_late", + algebraic_late + converts).render()) + + print(nir_algebraic.AlgebraicPass("midgard_nir_scale_trig", + scale_trig).render()) + + +if __name__ == '__main__': + main() diff --git a/src/panfrost/midgard/midgard_ops.c b/src/panfrost/midgard/midgard_ops.c new file mode 100644 index 00000000000..ccd750cff83 --- /dev/null +++ b/src/panfrost/midgard/midgard_ops.c @@ -0,0 +1,221 @@ +/* Copyright (c) 2018-2019 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
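
The scale_trig rules in midgard_nir_algebraic.py above only make sense if Midgard's fsin/fcos internally scale their argument by pi, which is what the inserted fdiv cancels out. In plain C terms -- a standalone model of what the rule implies, not driver code, with midgard_fsin_model as a hypothetical name:

#include <math.h>

/* What the hardware op effectively evaluates, per the rule above */
static float
midgard_fsin_model(float x)
{
        return sinf((float) M_PI * x);
}

/* After midgard_nir_scale_trig rewrites fsin(a) to fsin(a / pi),
 * the end-to-end result is the sine the shader asked for. */
static float
lowered_sin(float a)
{
        return midgard_fsin_model(a / (float) M_PI);
}
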
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "midgard.h" + +/* Include the definitions of the macros and such */ + +#define MIDGARD_OPS_TABLE +#include "helpers.h" +#undef MIDGARD_OPS_TABLE + +/* Table of mapping opcodes to accompanying properties. This is used for both + * the disassembler and the compiler. It is placed in a .c file like this to + * avoid duplications in the binary */ + +struct mir_op_props alu_opcode_props[256] = { + [midgard_alu_op_fadd] = {"fadd", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_fmul] = {"fmul", UNITS_MUL | UNIT_VLUT | OP_COMMUTES}, + [midgard_alu_op_fmin] = {"fmin", UNITS_MUL | UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_fmax] = {"fmax", UNITS_MUL | UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_imin] = {"imin", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_imax] = {"imax", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_umin] = {"umin", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_umax] = {"umax", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_ihadd] = {"ihadd", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_uhadd] = {"uhadd", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_irhadd] = {"irhadd", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_urhadd] = {"urhadd", UNITS_ADD | OP_COMMUTES}, + + [midgard_alu_op_fmov] = {"fmov", UNITS_ALL | QUIRK_FLIPPED_R24}, + [midgard_alu_op_fmov_rtz] = {"fmov_rtz", UNITS_ALL | QUIRK_FLIPPED_R24}, + [midgard_alu_op_fmov_rtn] = {"fmov_rtn", UNITS_ALL | QUIRK_FLIPPED_R24}, + [midgard_alu_op_fmov_rtp] = {"fmov_rtp", UNITS_ALL | QUIRK_FLIPPED_R24}, + [midgard_alu_op_fround] = {"fround", UNITS_ADD}, + [midgard_alu_op_froundeven] = {"froundeven", UNITS_ADD}, + [midgard_alu_op_ftrunc] = {"ftrunc", UNITS_ADD}, + [midgard_alu_op_ffloor] = {"ffloor", UNITS_ADD}, + [midgard_alu_op_fceil] = {"fceil", UNITS_ADD}, + [midgard_alu_op_ffma] = {"ffma", UNIT_VLUT}, + + /* Though they output a scalar, they need to run on a vector unit + * since they process vectors */ + [midgard_alu_op_fdot3] = {"fdot3", UNIT_VMUL | OP_CHANNEL_COUNT(3) | OP_COMMUTES}, + [midgard_alu_op_fdot3r] = {"fdot3r", UNIT_VMUL | OP_CHANNEL_COUNT(3) | OP_COMMUTES}, + [midgard_alu_op_fdot4] = {"fdot4", UNIT_VMUL | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + + /* Incredibly, iadd can run on vmul, etc */ + [midgard_alu_op_iadd] = {"iadd", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_iaddsat] = {"iaddsat", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_uaddsat] = {"uaddsat", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_iabsdiff] = {"iabsdiff", UNITS_ADD}, + [midgard_alu_op_uabsdiff] = {"uabsdiff", UNITS_ADD}, + [midgard_alu_op_ichoose] = {"ichoose", UNITS_ADD}, + [midgard_alu_op_isub] = {"isub", UNITS_MOST}, + [midgard_alu_op_isubsat] = {"isubsat", UNITS_MOST}, + [midgard_alu_op_usubsat] = {"usubsat", UNITS_MOST}, + [midgard_alu_op_imul] = {"imul", UNITS_MUL | OP_COMMUTES}, + [midgard_alu_op_imov] = {"imov", UNITS_MOST | QUIRK_FLIPPED_R24}, + + /* For vector comparisons, use ball etc */ + [midgard_alu_op_feq] = {"feq", UNITS_MOST | OP_TYPE_CONVERT | OP_COMMUTES}, + [midgard_alu_op_fne] = {"fne", UNITS_MOST | OP_TYPE_CONVERT | OP_COMMUTES}, + [midgard_alu_op_fle] = {"fle", UNITS_MOST | OP_TYPE_CONVERT}, + [midgard_alu_op_flt] = {"flt", UNITS_MOST | OP_TYPE_CONVERT}, + [midgard_alu_op_ieq] = {"ieq", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_ine] = {"ine", 
UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_ilt] = {"ilt", UNITS_MOST}, + [midgard_alu_op_ile] = {"ile", UNITS_MOST}, + [midgard_alu_op_ult] = {"ult", UNITS_MOST}, + [midgard_alu_op_ule] = {"ule", UNITS_MOST}, + + [midgard_alu_op_icsel] = {"icsel", UNITS_ADD}, + [midgard_alu_op_icsel_v] = {"icsel_v", UNITS_ADD}, /* Acts as bitselect() */ + [midgard_alu_op_fcsel_v] = {"fcsel_v", UNITS_ADD}, + [midgard_alu_op_fcsel] = {"fcsel", UNITS_ADD | UNIT_SMUL}, + + [midgard_alu_op_frcp] = {"frcp", UNIT_VLUT}, + [midgard_alu_op_frsqrt] = {"frsqrt", UNIT_VLUT}, + [midgard_alu_op_fsqrt] = {"fsqrt", UNIT_VLUT}, + [midgard_alu_op_fpow_pt1] = {"fpow_pt1", UNIT_VLUT}, + [midgard_alu_op_fpown_pt1] = {"fpown_pt1", UNIT_VLUT}, + [midgard_alu_op_fpowr_pt1] = {"fpowr_pt1", UNIT_VLUT}, + [midgard_alu_op_fexp2] = {"fexp2", UNIT_VLUT}, + [midgard_alu_op_flog2] = {"flog2", UNIT_VLUT}, + + [midgard_alu_op_f2i_rte] = {"f2i_rte", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2i_rtz] = {"f2i_rtz", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2i_rtn] = {"f2i_rtn", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2i_rtp] = {"f2i_rtp", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2u_rte] = {"f2i_rte", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2u_rtz] = {"f2i_rtz", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2u_rtn] = {"f2i_rtn", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2u_rtp] = {"f2i_rtp", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_i2f_rte] = {"i2f", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_i2f_rtz] = {"i2f_rtz", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_i2f_rtn] = {"i2f_rtn", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_i2f_rtp] = {"i2f_rtp", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_u2f_rte] = {"u2f", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_u2f_rtz] = {"u2f_rtz", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_u2f_rtn] = {"u2f_rtn", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_u2f_rtp] = {"u2f_rtp", UNITS_ADD | OP_TYPE_CONVERT}, + + [midgard_alu_op_fsin] = {"fsin", UNIT_VLUT}, + [midgard_alu_op_fcos] = {"fcos", UNIT_VLUT}, + + /* XXX: Test case where it's right on smul but not sadd */ + [midgard_alu_op_iand] = {"iand", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_iandnot] = {"iandnot", UNITS_MOST}, + + [midgard_alu_op_ior] = {"ior", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_iornot] = {"iornot", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_inor] = {"inor", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_ixor] = {"ixor", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_inxor] = {"inxor", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_iclz] = {"iclz", UNITS_ADD}, + [midgard_alu_op_ibitcount8] = {"ibitcount8", UNITS_ADD}, + [midgard_alu_op_inand] = {"inand", UNITS_MOST}, + [midgard_alu_op_ishl] = {"ishl", UNITS_ADD}, + [midgard_alu_op_iasr] = {"iasr", UNITS_ADD}, + [midgard_alu_op_ilsr] = {"ilsr", UNITS_ADD}, + + [midgard_alu_op_fball_eq] = {"fball_eq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_fbany_neq] = {"fbany_neq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_iball_eq] = {"iball_eq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_iball_neq] = {"iball_neq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_ibany_eq] = {"ibany_eq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_ibany_neq] = {"ibany_neq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + + /* These instructions are not yet emitted by the compiler, so + * don't speculate about 
units yet */ + [midgard_alu_op_ishladd] = {"ishladd", 0}, + + [midgard_alu_op_uball_lt] = {"uball_lt", 0}, + [midgard_alu_op_uball_lte] = {"uball_lte", 0}, + [midgard_alu_op_iball_lt] = {"iball_lt", 0}, + [midgard_alu_op_iball_lte] = {"iball_lte", 0}, + [midgard_alu_op_ubany_lt] = {"ubany_lt", 0}, + [midgard_alu_op_ubany_lte] = {"ubany_lte", 0}, + [midgard_alu_op_ibany_lt] = {"ibany_lt", 0}, + [midgard_alu_op_ibany_lte] = {"ibany_lte", 0}, + + [midgard_alu_op_freduce] = {"freduce", 0}, + [midgard_alu_op_bball_eq] = {"bball_eq", 0 | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_bbany_neq] = {"bball_eq", 0 | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_fatan2_pt1] = {"fatan2_pt1", 0}, + [midgard_alu_op_fatan_pt2] = {"fatan_pt2", 0}, +}; + +const char *load_store_opcode_names[256] = { + [midgard_op_st_cubemap_coords] = "st_cubemap_coords", + [midgard_op_ld_global_id] = "ld_global_id", + [midgard_op_ldst_perspective_division_z] = "ldst_perspective_division_z", + [midgard_op_ldst_perspective_division_w] = "ldst_perspective_division_w", + + [midgard_op_atomic_add] = "atomic_add", + [midgard_op_atomic_and] = "atomic_and", + [midgard_op_atomic_or] = "atomic_or", + [midgard_op_atomic_xor] = "atomic_xor", + [midgard_op_atomic_imin] = "atomic_imin", + [midgard_op_atomic_umin] = "atomic_umin", + [midgard_op_atomic_imax] = "atomic_imax", + [midgard_op_atomic_umax] = "atomic_umax", + [midgard_op_atomic_xchg] = "atomic_xchg", + + [midgard_op_ld_char] = "ld_char", + [midgard_op_ld_char2] = "ld_char2", + [midgard_op_ld_short] = "ld_short", + [midgard_op_ld_char4] = "ld_char4", + [midgard_op_ld_short4] = "ld_short4", + [midgard_op_ld_int4] = "ld_int4", + + [midgard_op_ld_attr_32] = "ld_attr_32", + [midgard_op_ld_attr_16] = "ld_attr_16", + [midgard_op_ld_attr_32i] = "ld_attr_32i", + [midgard_op_ld_attr_32u] = "ld_attr_32u", + + [midgard_op_ld_vary_32] = "ld_vary_32", + [midgard_op_ld_vary_16] = "ld_vary_16", + [midgard_op_ld_vary_32i] = "ld_vary_32i", + [midgard_op_ld_vary_32u] = "ld_vary_32u", + + [midgard_op_ld_color_buffer_16] = "ld_color_buffer_16", + + [midgard_op_ld_uniform_16] = "ld_uniform_16", + [midgard_op_ld_uniform_32] = "ld_uniform_32", + [midgard_op_ld_uniform_32i] = "ld_uniform_32i", + [midgard_op_ld_color_buffer_8] = "ld_color_buffer_8", + + [midgard_op_st_char] = "st_char", + [midgard_op_st_char2] = "st_char2", + [midgard_op_st_char4] = "st_char4", + [midgard_op_st_short4] = "st_short4", + [midgard_op_st_int4] = "st_int4", + + [midgard_op_st_vary_32] = "st_vary_32", + [midgard_op_st_vary_16] = "st_vary_16", + [midgard_op_st_vary_32i] = "st_vary_32i", + [midgard_op_st_vary_32u] = "st_vary_32u", + + [midgard_op_st_image_f] = "st_image_f", + [midgard_op_st_image_ui] = "st_image_ui", + [midgard_op_st_image_i] = "st_image_i", +}; diff --git a/src/panfrost/midgard/midgard_ops.h b/src/panfrost/midgard/midgard_ops.h new file mode 100644 index 00000000000..64c91a5bcac --- /dev/null +++ b/src/panfrost/midgard/midgard_ops.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2018-2019 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright 
notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "helpers.h" + +/* Forward declare */ + +extern struct mir_op_props alu_opcode_props[256]; +extern const char *load_store_opcode_names[256]; + +/* Is this opcode that of an integer (regardless of signedness)? Instruction + * names authoritatively determine types */ + +static inline bool +midgard_is_integer_op(int op) +{ + const char *name = alu_opcode_props[op].name; + + if (!name) + return false; + + return (name[0] == 'i') || (name[0] == 'u'); +} + +/* Does this opcode *write* an integer? Same as is_integer_op, unless it's a + * conversion between int<->float in which case we do the opposite */ + +static inline bool +midgard_is_integer_out_op(int op) +{ + bool is_int = midgard_is_integer_op(op); + bool is_conversion = alu_opcode_props[op].props & OP_TYPE_CONVERT; + + return is_int ^ is_conversion; +} + +/* Determines effective writemask, taking quirks and expansion into account */ + +static inline unsigned +effective_writemask(midgard_vector_alu *alu, unsigned existing_mask) +{ + /* Channel count is off-by-one to fit in two-bits (0 channel makes no + * sense) */ + + unsigned channel_count = GET_CHANNEL_COUNT(alu_opcode_props[alu->op].props); + + /* If there is a fixed channel count, construct the appropriate mask */ + + if (channel_count) + return (1 << channel_count) - 1; + + return existing_mask; +}; + + diff --git a/src/panfrost/midgard/midgard_print.c b/src/panfrost/midgard/midgard_print.c new file mode 100644 index 00000000000..6e10429ccee --- /dev/null +++ b/src/panfrost/midgard/midgard_print.c @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler.h" +#include "helpers.h" +#include "midgard_ops.h" + +/* Pretty printer for Midgard IR, for use debugging compiler-internal + * passes like register allocation. 
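
A few worked cases for midgard_is_integer_out_op above, using the property table from midgard_ops.c. The example_integer_out wrapper is hypothetical; it assumes <assert.h> and a file that already pulls in midgard.h and midgard_ops.h.

#include <assert.h>

static void
example_integer_out(void)
{
        /* fadd: float in, float out */
        assert(!midgard_is_integer_op(midgard_alu_op_fadd));
        assert(!midgard_is_integer_out_op(midgard_alu_op_fadd));

        /* iadd: integer in, integer out */
        assert(midgard_is_integer_op(midgard_alu_op_iadd));
        assert(midgard_is_integer_out_op(midgard_alu_op_iadd));

        /* f2i_rte: float in, but OP_TYPE_CONVERT flips the output type */
        assert(!midgard_is_integer_op(midgard_alu_op_f2i_rte));
        assert(midgard_is_integer_out_op(midgard_alu_op_f2i_rte));

        /* i2f_rte: integer in, float out */
        assert(midgard_is_integer_op(midgard_alu_op_i2f_rte));
        assert(!midgard_is_integer_out_op(midgard_alu_op_i2f_rte));
}
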
The output superficially resembles + * Midgard assembly, with the exception that unit information and such is + * (normally) omitted, and generic indices are usually used instead of + * registers */ + +static void +mir_print_source(int source) +{ + if (source >= SSA_FIXED_MINIMUM) { + /* Specific register */ + int reg = SSA_REG_FROM_FIXED(source); + + /* TODO: Moving threshold */ + if (reg > 16 && reg < 24) + printf("u%d", 23 - reg); + else + printf("r%d", reg); + } else { + printf("%d", source); + } +} + +void +mir_print_instruction(midgard_instruction *ins) +{ + printf("\t"); + + switch (ins->type) { + case TAG_ALU_4: { + midgard_alu_op op = ins->alu.op; + const char *name = alu_opcode_props[op].name; + + if (ins->unit) + printf("%d.", ins->unit); + + printf("%s", name ? name : "??"); + break; + } + + case TAG_LOAD_STORE_4: { + midgard_load_store_op op = ins->load_store.op; + const char *name = load_store_opcode_names[op]; + + assert(name); + printf("%s", name); + break; + } + + case TAG_TEXTURE_4: { + printf("texture"); + break; + } + + default: + assert(0); + } + + ssa_args *args = &ins->ssa_args; + + printf(" %d, ", args->dest); + + mir_print_source(args->src0); + printf(", "); + + if (args->inline_constant) + printf("#%d", ins->inline_constant); + else + mir_print_source(args->src1); + + if (ins->has_constants) + printf(" <%f, %f, %f, %f>", ins->constants[0], ins->constants[1], ins->constants[2], ins->constants[3]); + + printf("\n"); +} + +/* Dumps MIR for a block or entire shader respective */ + +void +mir_print_block(midgard_block *block) +{ + printf("{\n"); + + mir_foreach_instr_in_block(block, ins) { + mir_print_instruction(ins); + } + + printf("}\n"); +} + +void +mir_print_shader(compiler_context *ctx) +{ + mir_foreach_block(ctx, block) { + mir_print_block(block); + } +} + +void +mir_print_bundle(midgard_bundle *bundle) +{ + printf("[\n"); + + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + midgard_instruction *ins = bundle->instructions[i]; + mir_print_instruction(ins); + } + + printf("]\n"); +} diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c new file mode 100644 index 00000000000..cfe091326ed --- /dev/null +++ b/src/panfrost/midgard/midgard_ra.c @@ -0,0 +1,506 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
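
The printer above is intended to be dropped into passes while debugging; a typical (hypothetical) pattern is to dump the IR on both sides of a transform to see what it changed. debug_some_pass and run_the_pass are placeholder names, and printf is assumed to be reachable via compiler.h as elsewhere in these files.

static void
debug_some_pass(compiler_context *ctx)
{
        printf("=== MIR before ===\n");
        mir_print_shader(ctx);

        /* run_the_pass(ctx); */

        printf("=== MIR after ===\n");
        mir_print_shader(ctx);
}
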
+ */ + +#include "compiler.h" +#include "midgard_ops.h" +#include "util/register_allocate.h" +#include "util/u_math.h" + +/* For work registers, we can subdivide in various ways. So we create + * classes for the various sizes and conflict accordingly, keeping in + * mind that physical registers are divided along 128-bit boundaries. + * The important part is that 128-bit boundaries are not crossed. + * + * For each 128-bit register, we can subdivide to 32-bits 10 ways + * + * vec4: xyzw + * vec3: xyz, yzw + * vec2: xy, yz, zw, + * vec1: x, y, z, w + * + * For each 64-bit register, we can subdivide similarly to 16-bit + * (TODO: half-float RA, not that we support fp16 yet) + */ + +#define WORK_STRIDE 10 + +/* Prepacked masks/swizzles for virtual register types */ +static unsigned reg_type_to_mask[WORK_STRIDE] = { + 0xF, /* xyzw */ + 0x7, 0x7 << 1, /* xyz */ + 0x3, 0x3 << 1, 0x3 << 2, /* xy */ + 0x1, 0x1 << 1, 0x1 << 2, 0x1 << 3 /* x */ +}; + +static unsigned reg_type_to_swizzle[WORK_STRIDE] = { + SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + + SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_W, COMPONENT_W), + + SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_Z, COMPONENT_W, COMPONENT_Z, COMPONENT_W), + + SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_Y, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_Z, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_W, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), +}; + +struct phys_reg { + unsigned reg; + unsigned mask; + unsigned swizzle; +}; + +/* Given the mask/swizzle of both the register and the original source, + * compose to find the actual mask/swizzle to give the hardware */ + +static unsigned +compose_writemask(unsigned mask, struct phys_reg reg) +{ + /* Note: the reg mask is guaranteed to be contiguous. So we shift + * into the X place, compose via a simple AND, and shift back */ + + unsigned shift = __builtin_ctz(reg.mask); + return ((reg.mask >> shift) & mask) << shift; +} + +static unsigned +compose_swizzle(unsigned swizzle, unsigned mask, + struct phys_reg reg, struct phys_reg dst) +{ + unsigned out = pan_compose_swizzle(swizzle, reg.swizzle); + + /* Based on the register mask, we need to adjust over. E.g if we're + * writing to yz, a base swizzle of xy__ becomes _xy_. Save the + * original first component (x). 
But to prevent duplicate shifting + * (only applies to ALU -- mask param is set to xyzw out on L/S to + * prevent changes), we have to account for the shift inherent to the + * original writemask */ + + unsigned rep = out & 0x3; + unsigned shift = __builtin_ctz(dst.mask) - __builtin_ctz(mask); + unsigned shifted = out << (2*shift); + + /* ..but we fill in the gaps so it appears to replicate */ + + for (unsigned s = 0; s < shift; ++s) + shifted |= rep << (2*s); + + return shifted; +} + +/* When we're 'squeezing down' the values in the IR, we maintain a hash + * as such */ + +static unsigned +find_or_allocate_temp(compiler_context *ctx, unsigned hash) +{ + if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM)) + return hash; + + unsigned temp = (uintptr_t) _mesa_hash_table_u64_search( + ctx->hash_to_temp, hash + 1); + + if (temp) + return temp - 1; + + /* If no temp is find, allocate one */ + temp = ctx->temp_count++; + ctx->max_hash = MAX2(ctx->max_hash, hash); + + _mesa_hash_table_u64_insert(ctx->hash_to_temp, + hash + 1, (void *) ((uintptr_t) temp + 1)); + + return temp; +} + +/* Callback for register allocation selection, trivial default for now */ + +static unsigned int +midgard_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data) +{ + /* Choose the first available register to minimise register pressure */ + + for (int i = 0; i < (16 * WORK_STRIDE); ++i) { + if (BITSET_TEST(regs, i)) { + return i; + } + } + + assert(0); + return 0; +} + +/* Helper to return the default phys_reg for a given register */ + +static struct phys_reg +default_phys_reg(int reg) +{ + struct phys_reg r = { + .reg = reg, + .mask = 0xF, /* xyzw */ + .swizzle = 0xE4 /* xyzw */ + }; + + return r; +} + +/* Determine which physical register, swizzle, and mask a virtual + * register corresponds to */ + +static struct phys_reg +index_to_reg(compiler_context *ctx, struct ra_graph *g, int reg) +{ + /* Check for special cases */ + if (reg >= SSA_FIXED_MINIMUM) + return default_phys_reg(SSA_REG_FROM_FIXED(reg)); + else if ((reg < 0) || !g) + return default_phys_reg(REGISTER_UNUSED); + + /* Special cases aside, we pick the underlying register */ + int virt = ra_get_node_reg(g, reg); + + /* Divide out the register and classification */ + int phys = virt / WORK_STRIDE; + int type = virt % WORK_STRIDE; + + struct phys_reg r = { + .reg = phys, + .mask = reg_type_to_mask[type], + .swizzle = reg_type_to_swizzle[type] + }; + + /* Report that we actually use this register, and return it */ + ctx->work_registers = MAX2(ctx->work_registers, phys); + return r; +} + +/* This routine performs the actual register allocation. 
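
To make the subdivision scheme above concrete, here are numbers worked by hand from WORK_STRIDE and reg_type_to_mask. The example function is hypothetical, but would compile if dropped into this file, which already has assert available.

/* index_to_reg decodes an RA node as  phys = virt / WORK_STRIDE  and
 * type = virt % WORK_STRIDE.  So virt = 23 means phys r2, type 3,
 * i.e. "the .xy half of r2" (mask 0x3).  compose_writemask then shifts
 * an instruction's own mask into whatever slot was allocated: a value
 * that writes .xy but landed in the .zw half comes out as mask 0xC. */
static void
example_reg_decode(void)
{
        struct phys_reg zw_half = {
                .reg = 2,
                .mask = 0xC, /* zw */
                .swizzle = SWIZZLE(COMPONENT_Z, COMPONENT_W,
                                   COMPONENT_Z, COMPONENT_W),
        };

        assert(compose_writemask(0x3 /* .xy */, zw_half) == 0xC);
}
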
It should be succeeded + * by install_registers */ + +struct ra_graph * +allocate_registers(compiler_context *ctx) +{ + /* The number of vec4 work registers available depends on when the + * uniforms start, so compute that first */ + + int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0); + + int virtual_count = work_count * WORK_STRIDE; + + /* First, initialize the RA */ + struct ra_regs *regs = ra_alloc_reg_set(NULL, virtual_count, true); + + int work_vec4 = ra_alloc_reg_class(regs); + int work_vec3 = ra_alloc_reg_class(regs); + int work_vec2 = ra_alloc_reg_class(regs); + int work_vec1 = ra_alloc_reg_class(regs); + + unsigned classes[4] = { + work_vec1, + work_vec2, + work_vec3, + work_vec4 + }; + + /* Add the full set of work registers */ + for (unsigned i = 0; i < work_count; ++i) { + int base = WORK_STRIDE * i; + + /* Build a full set of subdivisions */ + ra_class_add_reg(regs, work_vec4, base); + ra_class_add_reg(regs, work_vec3, base + 1); + ra_class_add_reg(regs, work_vec3, base + 2); + ra_class_add_reg(regs, work_vec2, base + 3); + ra_class_add_reg(regs, work_vec2, base + 4); + ra_class_add_reg(regs, work_vec2, base + 5); + ra_class_add_reg(regs, work_vec1, base + 6); + ra_class_add_reg(regs, work_vec1, base + 7); + ra_class_add_reg(regs, work_vec1, base + 8); + ra_class_add_reg(regs, work_vec1, base + 9); + + for (unsigned a = 0; a < 10; ++a) { + unsigned mask1 = reg_type_to_mask[a]; + + for (unsigned b = 0; b < 10; ++b) { + unsigned mask2 = reg_type_to_mask[b]; + + if (mask1 & mask2) + ra_add_reg_conflict(regs, + base + a, base + b); + } + } + } + + /* We're done setting up */ + ra_set_finalize(regs, NULL); + + /* Transform the MIR into squeezed index form */ + mir_foreach_block(ctx, block) { + mir_foreach_instr_in_block(block, ins) { + if (ins->compact_branch) continue; + + ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest); + ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0); + + if (!ins->ssa_args.inline_constant) + ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1); + + } + } + + /* No register allocation to do with no SSA */ + + if (!ctx->temp_count) + return NULL; + + /* Let's actually do register allocation */ + int nodes = ctx->temp_count; + struct ra_graph *g = ra_alloc_interference_graph(regs, nodes); + + /* Determine minimum size needed to hold values, to indirectly + * determine class */ + + unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count); + + mir_foreach_block(ctx, block) { + mir_foreach_instr_in_block(block, ins) { + if (ins->compact_branch) continue; + if (ins->ssa_args.dest < 0) continue; + if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue; + + int class = util_logbase2(ins->mask) + 1; + + /* Use the largest class if there's ambiguity, this + * handles partial writes */ + + int dest = ins->ssa_args.dest; + found_class[dest] = MAX2(found_class[dest], class); + } + } + + for (unsigned i = 0; i < ctx->temp_count; ++i) { + unsigned class = found_class[i]; + if (!class) continue; + ra_set_node_class(g, i, classes[class - 1]); + } + + /* Determine liveness */ + + int *live_start = malloc(nodes * sizeof(int)); + int *live_end = malloc(nodes * sizeof(int)); + + /* Initialize as non-existent */ + + for (int i = 0; i < nodes; ++i) { + live_start[i] = live_end[i] = -1; + } + + int d = 0; + + mir_foreach_block(ctx, block) { + mir_foreach_instr_in_block(block, ins) { + if (ins->compact_branch) continue; + + /* Dest is < 0 for st_vary instructions, which break + * the usual SSA conventions. 
Liveness analysis doesn't + * make sense on these instructions, so skip them to + * avoid memory corruption */ + + if (ins->ssa_args.dest < 0) continue; + + if (ins->ssa_args.dest < SSA_FIXED_MINIMUM) { + /* If this destination is not yet live, it is + * now since we just wrote it */ + + int dest = ins->ssa_args.dest; + + if (live_start[dest] == -1) + live_start[dest] = d; + } + + /* Since we just used a source, the source might be + * dead now. Scan the rest of the block for + * invocations, and if there are none, the source dies + * */ + + int sources[2] = { + ins->ssa_args.src0, ins->ssa_args.src1 + }; + + for (int src = 0; src < 2; ++src) { + int s = sources[src]; + + if (s < 0) continue; + + if (s >= SSA_FIXED_MINIMUM) continue; + + if (!mir_is_live_after(ctx, block, ins, s)) { + live_end[s] = d; + } + } + + ++d; + } + } + + /* If a node still hasn't been killed, kill it now */ + + for (int i = 0; i < nodes; ++i) { + /* live_start == -1 most likely indicates a pinned output */ + + if (live_end[i] == -1) + live_end[i] = d; + } + + /* Setup interference between nodes that are live at the same time */ + + for (int i = 0; i < nodes; ++i) { + for (int j = i + 1; j < nodes; ++j) { + bool j_overlaps_i = live_start[j] < live_end[i]; + bool i_overlaps_j = live_end[j] < live_start[i]; + + if (i_overlaps_j || j_overlaps_i) + ra_add_node_interference(g, i, j); + } + } + + ra_set_select_reg_callback(g, midgard_ra_select_callback, NULL); + + if (!ra_allocate(g)) { + unreachable("Error allocating registers\n"); + } + + /* Cleanup */ + free(live_start); + free(live_end); + + return g; +} + +/* Once registers have been decided via register allocation + * (allocate_registers), we need to rewrite the MIR to use registers instead of + * indices */ + +static void +install_registers_instr( + compiler_context *ctx, + struct ra_graph *g, + midgard_instruction *ins) +{ + ssa_args args = ins->ssa_args; + + switch (ins->type) { + case TAG_ALU_4: { + int adjusted_src = args.inline_constant ? -1 : args.src1; + struct phys_reg src1 = index_to_reg(ctx, g, args.src0); + struct phys_reg src2 = index_to_reg(ctx, g, adjusted_src); + struct phys_reg dest = index_to_reg(ctx, g, args.dest); + + unsigned uncomposed_mask = ins->mask; + ins->mask = compose_writemask(uncomposed_mask, dest); + + /* Adjust the dest mask if necessary. Mostly this is a no-op + * but it matters for dot products */ + dest.mask = effective_writemask(&ins->alu, ins->mask); + + midgard_vector_alu_src mod1 = + vector_alu_from_unsigned(ins->alu.src1); + mod1.swizzle = compose_swizzle(mod1.swizzle, uncomposed_mask, src1, dest); + ins->alu.src1 = vector_alu_srco_unsigned(mod1); + + ins->registers.src1_reg = src1.reg; + + ins->registers.src2_imm = args.inline_constant; + + if (args.inline_constant) { + /* Encode inline 16-bit constant. 
See disassembler for + * where the algorithm is from */ + + ins->registers.src2_reg = ins->inline_constant >> 11; + + int lower_11 = ins->inline_constant & ((1 << 12) - 1); + uint16_t imm = ((lower_11 >> 8) & 0x7) | + ((lower_11 & 0xFF) << 3); + + ins->alu.src2 = imm << 2; + } else { + midgard_vector_alu_src mod2 = + vector_alu_from_unsigned(ins->alu.src2); + mod2.swizzle = compose_swizzle( + mod2.swizzle, uncomposed_mask, src2, dest); + ins->alu.src2 = vector_alu_srco_unsigned(mod2); + + ins->registers.src2_reg = src2.reg; + } + + ins->registers.out_reg = dest.reg; + break; + } + + case TAG_LOAD_STORE_4: { + if (OP_IS_STORE_VARY(ins->load_store.op)) { + /* TODO: use ssa_args for st_vary */ + ins->load_store.reg = 0; + } else { + /* Which physical register we read off depends on + * whether we are loading or storing -- think about the + * logical dataflow */ + + unsigned r = OP_IS_STORE(ins->load_store.op) ? + args.src0 : args.dest; + struct phys_reg src = index_to_reg(ctx, g, r); + + ins->load_store.reg = src.reg; + + ins->load_store.swizzle = compose_swizzle( + ins->load_store.swizzle, 0xF, + default_phys_reg(0), src); + + ins->mask = compose_writemask( + ins->mask, src); + } + + break; + } + + default: + break; + } +} + +void +install_registers(compiler_context *ctx, struct ra_graph *g) +{ + mir_foreach_block(ctx, block) { + mir_foreach_instr_in_block(block, ins) { + if (ins->compact_branch) continue; + install_registers_instr(ctx, g, ins); + } + } + +} diff --git a/src/panfrost/midgard/midgard_ra_pipeline.c b/src/panfrost/midgard/midgard_ra_pipeline.c new file mode 100644 index 00000000000..cd64bdf29e5 --- /dev/null +++ b/src/panfrost/midgard/midgard_ra_pipeline.c @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2019 Alyssa Rosenzweig <[email protected]> + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler.h" + +/* Creates pipeline registers. This is a prepass run before the main register + * allocator but after scheduling, once bundles are created. It works by + * iterating the scheduled IR, checking if a value is ever used after the end + * of the current bundle. If it is not, it is promoted to a bundle-specific + * pipeline register. + * + * Pipeline registers are only written from the first two stages of the + * pipeline (vmul/sadd) lasting the duration of the bundle only. There are two + * 128-bit pipeline registers available (r24/r25). 
The upshot is that no actual + * register allocation is needed; we can _always_ promote a value to a pipeline + * register, liveness permitting. This greatly simplifies the logic of this + * passing, negating the need for a proper RA like work registers. + */ + +static bool +mir_pipeline_ins( + compiler_context *ctx, + midgard_block *block, + midgard_bundle *bundle, unsigned i, + unsigned pipeline_count) +{ + midgard_instruction *ins = bundle->instructions[i]; + unsigned dest = ins->ssa_args.dest; + + /* Check to make sure we're legal */ + + if (ins->compact_branch) + return false; + + /* Don't allow non-SSA. Pipelining registers is theoretically possible, + * but the analysis is much hairier, so don't bother quite yet */ + if ((dest < 0) || (dest >= ctx->func->impl->ssa_alloc)) + return false; + + /* Make sure they're not lying to us. Blend shaders lie. TODO: Fix your + * bad code Alyssa */ + + if (mir_has_multiple_writes(ctx, dest)) + return false; + + /* We want to know if we live after this bundle, so check if + * we're live after the last instruction of the bundle */ + + midgard_instruction *end = bundle->instructions[ + bundle->instruction_count - 1]; + + if (mir_is_live_after(ctx, block, end, ins->ssa_args.dest)) + return false; + + /* We're only live in this bundle -- pipeline! */ + + mir_rewrite_index(ctx, dest, SSA_FIXED_REGISTER(24 + pipeline_count)); + + return true; +} + +void +mir_create_pipeline_registers(compiler_context *ctx) +{ + mir_foreach_block(ctx, block) { + mir_foreach_bundle_in_block(block, bundle) { + if (!mir_is_alu_bundle(bundle)) continue; + if (bundle->instruction_count < 2) continue; + + /* Only first 2 instructions could pipeline */ + bool succ = mir_pipeline_ins(ctx, block, bundle, 0, 0); + mir_pipeline_ins(ctx, block, bundle, 1, succ); + } + } +} diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c new file mode 100644 index 00000000000..7a3841e4d44 --- /dev/null +++ b/src/panfrost/midgard/midgard_schedule.c @@ -0,0 +1,541 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
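
A schematic before/after of the promotion implemented by mir_create_pipeline_registers above (pseudo-MIR, not literal mir_print output; the indices are made up): a value produced by the first instruction of a bundle and never read after that bundle is simply renamed to the first pipeline register, so the work-register allocator never sees it.

/* Before the pass:
 *
 *      vmul.fmul  42,  1,  2
 *      vadd.fadd   3, 42,  4     <- 42 is not read after this bundle
 *
 * After: 42 is rewritten to SSA_FIXED_REGISTER(24), printed as r24,
 * and no work register is ever allocated for it:
 *
 *      vmul.fmul r24,  1,  2
 *      vadd.fadd   3, r24, 4
 */
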
+ */ + +#include "compiler.h" +#include "midgard_ops.h" +#include "util/u_memory.h" + +/* Create a mask of accessed components from a swizzle to figure out vector + * dependencies */ + +static unsigned +swizzle_to_access_mask(unsigned swizzle) +{ + unsigned component_mask = 0; + + for (int i = 0; i < 4; ++i) { + unsigned c = (swizzle >> (2 * i)) & 3; + component_mask |= (1 << c); + } + + return component_mask; +} + +/* Does the mask cover more than a scalar? */ + +static bool +is_single_component_mask(unsigned mask) +{ + int components = 0; + + for (int c = 0; c < 8; ++c) { + if (mask & (1 << c)) + components++; + } + + return components == 1; +} + +/* Checks for an SSA data hazard between two adjacent instructions, keeping in + * mind that we are a vector architecture and we can write to different + * components simultaneously */ + +static bool +can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second) +{ + /* Each instruction reads some registers and writes to a register. See + * where the first writes */ + + /* Figure out where exactly we wrote to */ + int source = first->ssa_args.dest; + int source_mask = first->mask; + + /* As long as the second doesn't read from the first, we're okay */ + if (second->ssa_args.src0 == source) { + if (first->type == TAG_ALU_4) { + /* Figure out which components we just read from */ + + int q = second->alu.src1; + midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; + + /* Check if there are components in common, and fail if so */ + if (swizzle_to_access_mask(m->swizzle) & source_mask) + return false; + } else + return false; + + } + + if (second->ssa_args.src1 == source) + return false; + + /* Otherwise, it's safe in that regard. Another data hazard is both + * writing to the same place, of course */ + + if (second->ssa_args.dest == source) { + /* ...but only if the components overlap */ + + if (second->mask & source_mask) + return false; + } + + /* ...That's it */ + return true; +} + +static bool +midgard_has_hazard( + midgard_instruction **segment, unsigned segment_size, + midgard_instruction *ains) +{ + for (int s = 0; s < segment_size; ++s) + if (!can_run_concurrent_ssa(segment[s], ains)) + return true; + + return false; + + +} + +/* Schedules, but does not emit, a single basic block. 
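
Worked numbers for the hazard helpers above (hand-checked; the example function is hypothetical but uses only the static helpers in this file, which already has assert available):

/* Suppose a first instruction writes only .x (mask 0x1).  A second op
 * reading the result with swizzle .yyyy (0x55) touches only component y,
 * so there is no overlap and the two may share a bundle; reading with
 * .xyxy (0x44) touches x and y and trips the hazard check. */
static void
example_access_mask(void)
{
        assert(swizzle_to_access_mask(0x55) == 0x2);   /* { y } */
        assert(swizzle_to_access_mask(0x44) == 0x3);   /* { x, y } */

        assert(is_single_component_mask(0x2));
        assert(!is_single_component_mask(0x3));
}
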
After scheduling, the + * final tag and size of the block are known, which are necessary for branching + * */ + +static midgard_bundle +schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip) +{ + int instructions_emitted = 0, packed_idx = 0; + midgard_bundle bundle = { 0 }; + + uint8_t tag = ins->type; + + /* Default to the instruction's tag */ + bundle.tag = tag; + + switch (ins->type) { + case TAG_ALU_4: { + uint32_t control = 0; + size_t bytes_emitted = sizeof(control); + + /* TODO: Constant combining */ + int index = 0, last_unit = 0; + + /* Previous instructions, for the purpose of parallelism */ + midgard_instruction *segment[4] = {0}; + int segment_size = 0; + + instructions_emitted = -1; + midgard_instruction *pins = ins; + + unsigned constant_count = 0; + + for (;;) { + midgard_instruction *ains = pins; + + /* Advance instruction pointer */ + if (index) { + ains = mir_next_op(pins); + pins = ains; + } + + /* Out-of-work condition */ + if ((struct list_head *) ains == &block->instructions) + break; + + /* Ensure that the chain can continue */ + if (ains->type != TAG_ALU_4) break; + + /* If there's already something in the bundle and we + * have weird scheduler constraints, break now */ + if (ains->precede_break && index) break; + + /* According to the presentation "The ARM + * Mali-T880 Mobile GPU" from HotChips 27, + * there are two pipeline stages. Branching + * position determined experimentally. Lines + * are executed in parallel: + * + * [ VMUL ] [ SADD ] + * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ] + * + * Verify that there are no ordering dependencies here. + * + * TODO: Allow for parallelism!!! + */ + + /* Pick a unit for it if it doesn't force a particular unit */ + + int unit = ains->unit; + + if (!unit) { + int op = ains->alu.op; + int units = alu_opcode_props[op].props; + + bool scalarable = units & UNITS_SCALAR; + bool could_scalar = is_single_component_mask(ains->mask); + + /* Only 16/32-bit can run on a scalar unit */ + could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8; + could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64; + could_scalar &= ains->alu.dest_override == midgard_dest_override_none; + + if (ains->alu.reg_mode == midgard_reg_mode_16) { + /* If we're running in 16-bit mode, we + * can't have any 8-bit sources on the + * scalar unit (since the scalar unit + * doesn't understand 8-bit) */ + + midgard_vector_alu_src s1 = + vector_alu_from_unsigned(ains->alu.src1); + + could_scalar &= !s1.half; + + if (!ains->ssa_args.inline_constant) { + midgard_vector_alu_src s2 = + vector_alu_from_unsigned(ains->alu.src2); + + could_scalar &= !s2.half; + } + + } + + bool scalar = could_scalar && scalarable; + + /* TODO: Check ahead-of-time for other scalar + * hazards that otherwise get aborted out */ + + if (scalar) + assert(units & UNITS_SCALAR); + + if (!scalar) { + if (last_unit >= UNIT_VADD) { + if (units & UNIT_VLUT) + unit = UNIT_VLUT; + else + break; + } else { + if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL) + unit = UNIT_VMUL; + else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) + unit = UNIT_VADD; + else if (units & UNIT_VLUT) + unit = UNIT_VLUT; + else + break; + } + } else { + if (last_unit >= UNIT_VADD) { + if ((units & UNIT_SMUL) && !(control & UNIT_SMUL)) + unit = UNIT_SMUL; + else if (units & UNIT_VLUT) + unit = UNIT_VLUT; + else + break; + } else { + if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains)) + unit = UNIT_SADD; + else if (units & 
UNIT_SMUL) + unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL; + else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) + unit = UNIT_VADD; + else + break; + } + } + + assert(unit & units); + } + + /* Late unit check, this time for encoding (not parallelism) */ + if (unit <= last_unit) break; + + /* Clear the segment */ + if (last_unit < UNIT_VADD && unit >= UNIT_VADD) + segment_size = 0; + + if (midgard_has_hazard(segment, segment_size, ains)) + break; + + /* We're good to go -- emit the instruction */ + ains->unit = unit; + + segment[segment_size++] = ains; + + /* We try to reuse constants if possible, by adjusting + * the swizzle */ + + if (ains->has_blend_constant) { + /* Everything conflicts with the blend constant */ + if (bundle.has_embedded_constants) + break; + + bundle.has_blend_constant = 1; + bundle.has_embedded_constants = 1; + } else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) { + /* TODO: DRY with the analysis pass */ + + if (bundle.has_blend_constant) + break; + + if (constant_count) + break; + + /* TODO: Fix packing XXX */ + uint16_t *bundles = (uint16_t *) bundle.constants; + uint32_t *constants = (uint32_t *) ains->constants; + + /* Copy them wholesale */ + for (unsigned i = 0; i < 4; ++i) + bundles[i] = constants[i]; + + bundle.has_embedded_constants = true; + constant_count = 4; + } else if (ains->has_constants) { + /* By definition, blend constants conflict with + * everything, so if there are already + * constants we break the bundle *now* */ + + if (bundle.has_blend_constant) + break; + + /* For anything but blend constants, we can do + * proper analysis, however */ + + /* TODO: Mask by which are used */ + uint32_t *constants = (uint32_t *) ains->constants; + uint32_t *bundles = (uint32_t *) bundle.constants; + + uint32_t indices[4] = { 0 }; + bool break_bundle = false; + + for (unsigned i = 0; i < 4; ++i) { + uint32_t cons = constants[i]; + bool constant_found = false; + + /* Search for the constant */ + for (unsigned j = 0; j < constant_count; ++j) { + if (bundles[j] != cons) + continue; + + /* We found it, reuse */ + indices[i] = j; + constant_found = true; + break; + } + + if (constant_found) + continue; + + /* We didn't find it, so allocate it */ + unsigned idx = constant_count++; + + if (idx >= 4) { + /* Uh-oh, out of space */ + break_bundle = true; + break; + } + + /* We have space, copy it in! */ + bundles[idx] = cons; + indices[i] = idx; + } + + if (break_bundle) + break; + + /* Cool, we have it in. So use indices as a + * swizzle */ + + unsigned swizzle = SWIZZLE_FROM_ARRAY(indices); + unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + + if (ains->ssa_args.src0 == r_constant) + ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle); + + if (ains->ssa_args.src1 == r_constant) + ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle); + + bundle.has_embedded_constants = true; + } + + if (ains->unit & UNITS_ANY_VECTOR) { + bytes_emitted += sizeof(midgard_reg_info); + bytes_emitted += sizeof(midgard_vector_alu); + } else if (ains->compact_branch) { + /* All of r0 has to be written out along with + * the branch writeout */ + + if (ains->writeout) { + /* The rules for when "bare" writeout + * is safe are when all components are + * r0 are written out in the final + * bundle, earlier than VLUT, where any + * register dependencies of r0 are from + * an earlier bundle. We can't verify + * this before RA, so we don't try. 
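
To put numbers on the constant-reuse path above (hand-worked from the loop's logic, not an additional mechanism):

/* Suppose the bundle already embeds { 0.5, 1.0, 2.0 } (constant_count = 3)
 * and the next candidate ALU op carries constants { 2.0, 0.5, 0.5, 1.0 }.
 * Every value is found in the existing pool, so nothing new is allocated
 * and indices[] ends up as { 2, 0, 0, 1 }.  SWIZZLE_FROM_ARRAY turns that
 * into a .zxxy swizzle on whichever source reads REGISTER_CONSTANT, and
 * both instructions share a single embedded quadword.  Only when more
 * than four distinct 32-bit values accumulate does break_bundle fire and
 * the candidate get pushed into a new bundle. */
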
*/ + + if (index != 0) + break; + + /* Inject a move */ + midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0)); + ins.unit = UNIT_VMUL; + control |= ins.unit; + + /* TODO don't leak */ + midgard_instruction *move = + mem_dup(&ins, sizeof(midgard_instruction)); + bytes_emitted += sizeof(midgard_reg_info); + bytes_emitted += sizeof(midgard_vector_alu); + bundle.instructions[packed_idx++] = move; + } + + if (ains->unit == ALU_ENAB_BRANCH) { + bytes_emitted += sizeof(midgard_branch_extended); + } else { + bytes_emitted += sizeof(ains->br_compact); + } + } else { + bytes_emitted += sizeof(midgard_reg_info); + bytes_emitted += sizeof(midgard_scalar_alu); + } + + /* Defer marking until after writing to allow for break */ + control |= ains->unit; + last_unit = ains->unit; + ++instructions_emitted; + ++index; + } + + int padding = 0; + + /* Pad ALU op to nearest word */ + + if (bytes_emitted & 15) { + padding = 16 - (bytes_emitted & 15); + bytes_emitted += padding; + } + + /* Constants must always be quadwords */ + if (bundle.has_embedded_constants) + bytes_emitted += 16; + + /* Size ALU instruction for tag */ + bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1; + bundle.padding = padding; + bundle.control = bundle.tag | control; + + break; + } + + case TAG_LOAD_STORE_4: { + /* Load store instructions have two words at once. If + * we only have one queued up, we need to NOP pad. + * Otherwise, we store both in succession to save space + * and cycles -- letting them go in parallel -- skip + * the next. The usefulness of this optimisation is + * greatly dependent on the quality of the instruction + * scheduler. + */ + + midgard_instruction *next_op = mir_next_op(ins); + + if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) { + /* TODO: Concurrency check */ + instructions_emitted++; + } + + break; + } + + case TAG_TEXTURE_4: { + /* Which tag we use depends on the shader stage */ + bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT; + bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX; + break; + } + + default: + unreachable("Unknown tag"); + break; + } + + /* Copy the instructions into the bundle */ + bundle.instruction_count = instructions_emitted + 1 + packed_idx; + + midgard_instruction *uins = ins; + for (; packed_idx < bundle.instruction_count; ++packed_idx) { + bundle.instructions[packed_idx] = uins; + uins = mir_next_op(uins); + } + + *skip = instructions_emitted; + + return bundle; +} + +/* Schedule a single block by iterating its instruction to create bundles. + * While we go, tally about the bundle sizes to compute the block size. */ + +static void +schedule_block(compiler_context *ctx, midgard_block *block) +{ + util_dynarray_init(&block->bundles, NULL); + + block->quadword_count = 0; + + mir_foreach_instr_in_block(block, ins) { + int skip; + midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip); + util_dynarray_append(&block->bundles, midgard_bundle, bundle); + + if (bundle.has_blend_constant) { + /* TODO: Multiblock? 
*/ + int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1; + ctx->blend_constant_offset = quadwords_within_block * 0x10; + } + + while (skip--) + ins = mir_next_op(ins); + + block->quadword_count += quadword_size(bundle.tag); + } + + block->is_scheduled = true; +} + +void +schedule_program(compiler_context *ctx) +{ + /* We schedule the blocks first; RA runs after scheduling */ + + mir_foreach_block(ctx, block) { + schedule_block(ctx, block); + } + + /* Pipeline register creation is a prepass before RA */ + mir_create_pipeline_registers(ctx); + + struct ra_graph *g = allocate_registers(ctx); + install_registers(ctx, g); +} diff --git a/src/panfrost/midgard/mir.c b/src/panfrost/midgard/mir.c new file mode 100644 index 00000000000..6adc1350c0a --- /dev/null +++ b/src/panfrost/midgard/mir.c @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "compiler.h" + +void +mir_rewrite_index_src(compiler_context *ctx, unsigned old, unsigned new) +{ + mir_foreach_instr_global(ctx, ins) { + if (ins->ssa_args.src0 == old) + ins->ssa_args.src0 = new; + + if (ins->ssa_args.src1 == old && + !ins->ssa_args.inline_constant) + ins->ssa_args.src1 = new; + } +} + +void +mir_rewrite_index_dst(compiler_context *ctx, unsigned old, unsigned new) +{ + mir_foreach_instr_global(ctx, ins) { + if (ins->ssa_args.dest == old) + ins->ssa_args.dest = new; + } +} + +void +mir_rewrite_index(compiler_context *ctx, unsigned old, unsigned new) +{ + mir_rewrite_index_src(ctx, old, new); + mir_rewrite_index_dst(ctx, old, new); +} diff --git a/src/panfrost/pandecode/cmdline.c b/src/panfrost/pandecode/cmdline.c new file mode 100644 index 00000000000..38053aa1072 --- /dev/null +++ b/src/panfrost/pandecode/cmdline.c @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2019 Alyssa Rosenzweig + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <stdint.h> +#include <string.h> + +#include "decode.h" + +/* Parsing */ + +static FILE * +pandecode_read_filename(const char *base, const char *name) +{ + char *fn = NULL; + asprintf(&fn, "%s/%s", base, name); + + FILE *fp = fopen(fn, "rb"); + free(fn); + + return fp; +} + +static void +pandecode_read_memory(const char *base, const char *name, mali_ptr gpu_va) +{ + FILE *fp = pandecode_read_filename(base, name); + + if (!fp) { + fprintf(stderr, "Warning: missing %s\n", name); + return; + } + + fseek(fp, 0, SEEK_END); + long sz = ftell(fp); + fseek(fp, 0, SEEK_SET); + + char *buf = malloc(sz); + assert(buf); + fread(buf, 1, sz, fp); + fclose(fp); + + pandecode_inject_mmap(gpu_va, buf, sz, name); +} + +static void +pandecode_read_mmap(const char *base, const char *line) +{ + assert(strlen(line) < 500); + + mali_ptr addr; + char name[512]; + + sscanf(line, "MMAP %" PRIx64 " %s", &addr, name); + pandecode_read_memory(base, name, addr); +} + +static void +pandecode_read_job_submit(const char *base, const char *line) +{ + mali_ptr addr; + unsigned core_req; + unsigned is_bifrost; + + sscanf(line, "JS %" PRIx64 " %x %x", &addr, &core_req, &is_bifrost); + pandecode_replay_jc(addr, is_bifrost); +} + + + +/* Reads the control file, processing as it goes. 
*/ + +static void +pandecode_read_control(const char *base) +{ + FILE *fp = pandecode_read_filename(base, "control.log"); + + if (!fp) { + fprintf(stderr, "Invalid directory path\n"); + return; + } + + char *line = NULL; + size_t len = 0; + + while (getline(&line, &len, fp) != -1) { + switch (line[0]) { + case 'M': + pandecode_read_mmap(base, line); + break; + + case 'J': + pandecode_read_job_submit(base, line); + break; + + default: + assert(0); + break; + } + } +} + +int +main(int argc, char **argv) +{ + if (argc < 2) { + fprintf(stderr, "Usage: pandecode [directory]\n"); + exit(1); + } + + pandecode_initialize(); + pandecode_read_control(argv[1]); +} diff --git a/src/panfrost/pandecode/common.c b/src/panfrost/pandecode/common.c new file mode 100644 index 00000000000..6196379c969 --- /dev/null +++ b/src/panfrost/pandecode/common.c @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2019 Alyssa Rosenzweig + * Copyright (C) 2017-2018 Lyude Paul + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <stdint.h> +#include <string.h> + +#include "decode.h" +#include "util/macros.h" + +/* Memory handling */ + +static struct pandecode_mapped_memory mmaps; + +struct pandecode_mapped_memory * +pandecode_find_mapped_gpu_mem_containing(mali_ptr addr) +{ + list_for_each_entry(struct pandecode_mapped_memory, pos, &mmaps.node, node) { + if (addr >= pos->gpu_va && addr < pos->gpu_va + pos->length) + return pos; + } + + return NULL; +} + +void +pandecode_inject_mmap(mali_ptr gpu_va, void *cpu, unsigned sz, const char *name) +{ + struct pandecode_mapped_memory *mapped_mem = NULL; + + mapped_mem = malloc(sizeof(*mapped_mem)); + list_inithead(&mapped_mem->node); + + mapped_mem->gpu_va = gpu_va; + mapped_mem->length = sz; + mapped_mem->addr = cpu; + + if (!name) { + /* If we don't have a name, assign one */ + + snprintf(mapped_mem->name, ARRAY_SIZE(mapped_mem->name) - 1, + "memory_%" PRIx64, gpu_va); + } else { + assert(strlen(name) < ARRAY_SIZE(mapped_mem->name)); + memcpy(mapped_mem->name, name, strlen(name) + 1); + } + + list_add(&mapped_mem->node, &mmaps.node); +} + +char * +pointer_as_memory_reference(mali_ptr ptr) +{ + struct pandecode_mapped_memory *mapped; + char *out = malloc(128); + + /* Try to find the corresponding mapped zone */ + + mapped = pandecode_find_mapped_gpu_mem_containing(ptr); + + if (mapped) { + snprintf(out, 128, "%s + %d", mapped->name, (int) (ptr - mapped->gpu_va)); + return out; + } + + /* Just use the raw address if other options are exhausted */ + + snprintf(out, 128, MALI_PTR_FMT, ptr); + return out; + +} + +void +pandecode_initialize(void) +{ + list_inithead(&mmaps.node); + +} diff --git a/src/panfrost/pandecode/decode.c b/src/panfrost/pandecode/decode.c new file mode 100644 index 00000000000..61b8914388e --- /dev/null +++ b/src/panfrost/pandecode/decode.c @@ -0,0 +1,2388 @@ +/* + * Copyright (C) 2017-2019 Alyssa Rosenzweig + * Copyright (C) 2017-2019 Connor Abbott + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <panfrost-job.h> +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <stdbool.h> +#include <stdarg.h> +#include "decode.h" +#include "util/u_math.h" + +#include "pan_pretty_print.h" +#include "midgard/disassemble.h" +#include "bifrost/disassemble.h" + +int pandecode_replay_jc(mali_ptr jc_gpu_va, bool bifrost); + +#define MEMORY_PROP(obj, p) {\ + if (obj->p) { \ + char *a = pointer_as_memory_reference(obj->p); \ + pandecode_prop("%s = %s", #p, a); \ + free(a); \ + } \ +} + +#define DYN_MEMORY_PROP(obj, no, p) { \ + if (obj->p) \ + pandecode_prop("%s = %s_%d_p", #p, #p, no); \ +} + +/* Semantic logging type. + * + * Raw: for raw messages to be printed as is. + * Message: for helpful information to be commented out in replays. + * Property: for properties of a struct + * + * Use one of pandecode_log, pandecode_msg, or pandecode_prop as syntax sugar. + */ + +enum pandecode_log_type { + PANDECODE_RAW, + PANDECODE_MESSAGE, + PANDECODE_PROPERTY +}; + +#define pandecode_log(...) pandecode_log_typed(PANDECODE_RAW, __VA_ARGS__) +#define pandecode_msg(...) pandecode_log_typed(PANDECODE_MESSAGE, __VA_ARGS__) +#define pandecode_prop(...) pandecode_log_typed(PANDECODE_PROPERTY, __VA_ARGS__) + +unsigned pandecode_indent = 0; + +static void +pandecode_make_indent(void) +{ + for (unsigned i = 0; i < pandecode_indent; ++i) + printf(" "); +} + +static void +pandecode_log_typed(enum pandecode_log_type type, const char *format, ...) +{ + va_list ap; + + pandecode_make_indent(); + + if (type == PANDECODE_MESSAGE) + printf("// "); + else if (type == PANDECODE_PROPERTY) + printf("."); + + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + + if (type == PANDECODE_PROPERTY) + printf(",\n"); +} + +static void +pandecode_log_cont(const char *format, ...) 
+{ + va_list ap; + + va_start(ap, format); + vprintf(format, ap); + va_end(ap); +} + +struct pandecode_flag_info { + u64 flag; + const char *name; +}; + +static void +pandecode_log_decoded_flags(const struct pandecode_flag_info *flag_info, + u64 flags) +{ + bool decodable_flags_found = false; + + for (int i = 0; flag_info[i].name; i++) { + if ((flags & flag_info[i].flag) != flag_info[i].flag) + continue; + + if (!decodable_flags_found) { + decodable_flags_found = true; + } else { + pandecode_log_cont(" | "); + } + + pandecode_log_cont("%s", flag_info[i].name); + + flags &= ~flag_info[i].flag; + } + + if (decodable_flags_found) { + if (flags) + pandecode_log_cont(" | 0x%" PRIx64, flags); + } else { + pandecode_log_cont("0x%" PRIx64, flags); + } +} + +#define FLAG_INFO(flag) { MALI_##flag, "MALI_" #flag } +static const struct pandecode_flag_info gl_enable_flag_info[] = { + FLAG_INFO(OCCLUSION_QUERY), + FLAG_INFO(OCCLUSION_PRECISE), + FLAG_INFO(FRONT_CCW_TOP), + FLAG_INFO(CULL_FACE_FRONT), + FLAG_INFO(CULL_FACE_BACK), + {} +}; +#undef FLAG_INFO + +#define FLAG_INFO(flag) { MALI_CLEAR_##flag, "MALI_CLEAR_" #flag } +static const struct pandecode_flag_info clear_flag_info[] = { + FLAG_INFO(FAST), + FLAG_INFO(SLOW), + FLAG_INFO(SLOW_STENCIL), + {} +}; +#undef FLAG_INFO + +#define FLAG_INFO(flag) { MALI_MASK_##flag, "MALI_MASK_" #flag } +static const struct pandecode_flag_info mask_flag_info[] = { + FLAG_INFO(R), + FLAG_INFO(G), + FLAG_INFO(B), + FLAG_INFO(A), + {} +}; +#undef FLAG_INFO + +#define FLAG_INFO(flag) { MALI_##flag, "MALI_" #flag } +static const struct pandecode_flag_info u3_flag_info[] = { + FLAG_INFO(HAS_MSAA), + FLAG_INFO(CAN_DISCARD), + FLAG_INFO(HAS_BLEND_SHADER), + FLAG_INFO(DEPTH_TEST), + {} +}; + +static const struct pandecode_flag_info u4_flag_info[] = { + FLAG_INFO(NO_MSAA), + FLAG_INFO(NO_DITHER), + FLAG_INFO(DEPTH_RANGE_A), + FLAG_INFO(DEPTH_RANGE_B), + FLAG_INFO(STENCIL_TEST), + FLAG_INFO(SAMPLE_ALPHA_TO_COVERAGE_NO_BLEND_SHADER), + {} +}; +#undef FLAG_INFO + +#define FLAG_INFO(flag) { MALI_FRAMEBUFFER_##flag, "MALI_FRAMEBUFFER_" #flag } +static const struct pandecode_flag_info fb_fmt_flag_info[] = { + FLAG_INFO(MSAA_A), + FLAG_INFO(MSAA_B), + FLAG_INFO(MSAA_8), + {} +}; +#undef FLAG_INFO + +#define FLAG_INFO(flag) { MALI_MFBD_FORMAT_##flag, "MALI_MFBD_FORMAT_" #flag } +static const struct pandecode_flag_info mfbd_fmt_flag_info[] = { + FLAG_INFO(MSAA), + FLAG_INFO(SRGB), + {} +}; +#undef FLAG_INFO + +#define FLAG_INFO(flag) { MALI_EXTRA_##flag, "MALI_EXTRA_" #flag } +static const struct pandecode_flag_info mfbd_extra_flag_info[] = { + FLAG_INFO(PRESENT), + FLAG_INFO(AFBC), + FLAG_INFO(ZS), + {} +}; +#undef FLAG_INFO + +#define FLAG_INFO(flag) { MALI_##flag, "MALI_" #flag } +static const struct pandecode_flag_info shader_midgard1_flag_info [] = { + FLAG_INFO(EARLY_Z), + FLAG_INFO(HELPER_INVOCATIONS), + FLAG_INFO(READS_TILEBUFFER), + FLAG_INFO(READS_ZS), + {} +}; +#undef FLAG_INFO + +#define FLAG_INFO(flag) { MALI_MFBD_##flag, "MALI_MFBD_" #flag } +static const struct pandecode_flag_info mfbd_flag_info [] = { + FLAG_INFO(DEPTH_WRITE), + FLAG_INFO(EXTRA), + {} +}; +#undef FLAG_INFO + + +extern char *replace_fragment; +extern char *replace_vertex; + +static char * +pandecode_job_type_name(enum mali_job_type type) +{ +#define DEFINE_CASE(name) case JOB_TYPE_ ## name: return "JOB_TYPE_" #name + + switch (type) { + DEFINE_CASE(NULL); + DEFINE_CASE(SET_VALUE); + DEFINE_CASE(CACHE_FLUSH); + DEFINE_CASE(COMPUTE); + DEFINE_CASE(VERTEX); + DEFINE_CASE(TILER); + DEFINE_CASE(FUSED); + 
DEFINE_CASE(FRAGMENT); + + case JOB_NOT_STARTED: + return "NOT_STARTED"; + + default: + pandecode_log("Warning! Unknown job type %x\n", type); + return "!?!?!?"; + } + +#undef DEFINE_CASE +} + +static char * +pandecode_draw_mode_name(enum mali_draw_mode mode) +{ +#define DEFINE_CASE(name) case MALI_ ## name: return "MALI_" #name + + switch (mode) { + DEFINE_CASE(DRAW_NONE); + DEFINE_CASE(POINTS); + DEFINE_CASE(LINES); + DEFINE_CASE(TRIANGLES); + DEFINE_CASE(TRIANGLE_STRIP); + DEFINE_CASE(TRIANGLE_FAN); + DEFINE_CASE(LINE_STRIP); + DEFINE_CASE(LINE_LOOP); + DEFINE_CASE(POLYGON); + DEFINE_CASE(QUADS); + DEFINE_CASE(QUAD_STRIP); + + default: + return "MALI_TRIANGLES /* XXX: Unknown GL mode, check dump */"; + } + +#undef DEFINE_CASE +} + +#define DEFINE_CASE(name) case MALI_FUNC_ ## name: return "MALI_FUNC_" #name +static char * +pandecode_func_name(enum mali_func mode) +{ + switch (mode) { + DEFINE_CASE(NEVER); + DEFINE_CASE(LESS); + DEFINE_CASE(EQUAL); + DEFINE_CASE(LEQUAL); + DEFINE_CASE(GREATER); + DEFINE_CASE(NOTEQUAL); + DEFINE_CASE(GEQUAL); + DEFINE_CASE(ALWAYS); + + default: + return "MALI_FUNC_NEVER /* XXX: Unknown function, check dump */"; + } +} +#undef DEFINE_CASE + +/* Why is this duplicated? Who knows... */ +#define DEFINE_CASE(name) case MALI_ALT_FUNC_ ## name: return "MALI_ALT_FUNC_" #name +static char * +pandecode_alt_func_name(enum mali_alt_func mode) +{ + switch (mode) { + DEFINE_CASE(NEVER); + DEFINE_CASE(LESS); + DEFINE_CASE(EQUAL); + DEFINE_CASE(LEQUAL); + DEFINE_CASE(GREATER); + DEFINE_CASE(NOTEQUAL); + DEFINE_CASE(GEQUAL); + DEFINE_CASE(ALWAYS); + + default: + return "MALI_FUNC_NEVER /* XXX: Unknown function, check dump */"; + } +} +#undef DEFINE_CASE + +#define DEFINE_CASE(name) case MALI_STENCIL_ ## name: return "MALI_STENCIL_" #name +static char * +pandecode_stencil_op_name(enum mali_stencil_op op) +{ + switch (op) { + DEFINE_CASE(KEEP); + DEFINE_CASE(REPLACE); + DEFINE_CASE(ZERO); + DEFINE_CASE(INVERT); + DEFINE_CASE(INCR_WRAP); + DEFINE_CASE(DECR_WRAP); + DEFINE_CASE(INCR); + DEFINE_CASE(DECR); + + default: + return "MALI_STENCIL_KEEP /* XXX: Unknown stencil op, check dump */"; + } +} + +#undef DEFINE_CASE + +#define DEFINE_CASE(name) case MALI_ATTR_ ## name: return "MALI_ATTR_" #name +static char *pandecode_attr_mode_name(enum mali_attr_mode mode) +{ + switch(mode) { + DEFINE_CASE(UNUSED); + DEFINE_CASE(LINEAR); + DEFINE_CASE(POT_DIVIDE); + DEFINE_CASE(MODULO); + DEFINE_CASE(NPOT_DIVIDE); + default: return "MALI_ATTR_UNUSED /* XXX: Unknown stencil op, check dump */"; + } +} + +#undef DEFINE_CASE + +#define DEFINE_CASE(name) case MALI_CHANNEL_## name: return "MALI_CHANNEL_" #name +static char * +pandecode_channel_name(enum mali_channel channel) +{ + switch (channel) { + DEFINE_CASE(RED); + DEFINE_CASE(GREEN); + DEFINE_CASE(BLUE); + DEFINE_CASE(ALPHA); + DEFINE_CASE(ZERO); + DEFINE_CASE(ONE); + DEFINE_CASE(RESERVED_0); + DEFINE_CASE(RESERVED_1); + + default: + return "MALI_CHANNEL_ZERO /* XXX: Unknown channel, check dump */"; + } +} +#undef DEFINE_CASE + +#define DEFINE_CASE(name) case MALI_WRAP_## name: return "MALI_WRAP_" #name +static char * +pandecode_wrap_mode_name(enum mali_wrap_mode op) +{ + switch (op) { + DEFINE_CASE(REPEAT); + DEFINE_CASE(CLAMP_TO_EDGE); + DEFINE_CASE(CLAMP_TO_BORDER); + DEFINE_CASE(MIRRORED_REPEAT); + + default: + return "MALI_WRAP_REPEAT /* XXX: Unknown wrap mode, check dump */"; + } +} +#undef DEFINE_CASE + +#define DEFINE_CASE(name) case MALI_TEX_## name: return "MALI_TEX_" #name +static char * +pandecode_texture_type(enum 
mali_texture_type type) +{ + switch (type) { + DEFINE_CASE(1D); + DEFINE_CASE(2D); + DEFINE_CASE(3D); + DEFINE_CASE(CUBE); + + default: + unreachable("Unknown case"); + } +} +#undef DEFINE_CASE + +#define DEFINE_CASE(name) case MALI_MFBD_BLOCK_## name: return "MALI_MFBD_BLOCK_" #name +static char * +pandecode_mfbd_block_format(enum mali_mfbd_block_format fmt) +{ + switch (fmt) { + DEFINE_CASE(TILED); + DEFINE_CASE(UNKNOWN); + DEFINE_CASE(LINEAR); + DEFINE_CASE(AFBC); + + default: + unreachable("Invalid case"); + } +} +#undef DEFINE_CASE + +static inline char * +pandecode_decode_fbd_type(enum mali_fbd_type type) +{ + if (type == MALI_SFBD) return "SFBD"; + else if (type == MALI_MFBD) return "MFBD"; + else return "WATFBD /* XXX */"; +} + +/* Midgard's tiler descriptor is embedded within the + * larger FBD */ + +static void +pandecode_midgard_tiler_descriptor(const struct midgard_tiler_descriptor *t) +{ + pandecode_log(".tiler = {\n"); + pandecode_indent++; + + pandecode_prop("hierarchy_mask = 0x%" PRIx16, t->hierarchy_mask); + pandecode_prop("flags = 0x%" PRIx16, t->flags); + pandecode_prop("polygon_list_size = 0x%x", t->polygon_list_size); + + MEMORY_PROP(t, polygon_list); + MEMORY_PROP(t, polygon_list_body); + + MEMORY_PROP(t, heap_start); + + { + /* Points to the end of a buffer */ + char *a = pointer_as_memory_reference(t->heap_end - 1); + pandecode_prop("heap_end = %s + 1", a); + free(a); + } + + bool nonzero_weights = false; + + for (unsigned w = 0; w < ARRAY_SIZE(t->weights); ++w) { + nonzero_weights |= t->weights[w] != 0x0; + } + + if (nonzero_weights) { + pandecode_log(".weights = {"); + + for (unsigned w = 0; w < ARRAY_SIZE(t->weights); ++w) { + pandecode_log("%d, ", t->weights[w]); + } + + pandecode_log("},"); + } + + pandecode_indent--; + pandecode_log("}\n"); +} + +static void +pandecode_replay_sfbd(uint64_t gpu_va, int job_no) +{ + struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va); + const struct mali_single_framebuffer *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va); + + pandecode_log("struct mali_single_framebuffer framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no); + pandecode_indent++; + + pandecode_prop("unknown1 = 0x%" PRIx32, s->unknown1); + pandecode_prop("unknown2 = 0x%" PRIx32, s->unknown2); + + pandecode_log(".format = "); + pandecode_log_decoded_flags(fb_fmt_flag_info, s->format); + pandecode_log_cont(",\n"); + + pandecode_prop("width = MALI_POSITIVE(%" PRId16 ")", s->width + 1); + pandecode_prop("height = MALI_POSITIVE(%" PRId16 ")", s->height + 1); + + MEMORY_PROP(s, framebuffer); + pandecode_prop("stride = %d", s->stride); + + /* Earlier in the actual commandstream -- right before width -- but we + * delay to flow nicer */ + + pandecode_log(".clear_flags = "); + pandecode_log_decoded_flags(clear_flag_info, s->clear_flags); + pandecode_log_cont(",\n"); + + if (s->depth_buffer | s->depth_buffer_enable) { + MEMORY_PROP(s, depth_buffer); + pandecode_prop("depth_buffer_enable = %s", DS_ENABLE(s->depth_buffer_enable)); + } + + if (s->stencil_buffer | s->stencil_buffer_enable) { + MEMORY_PROP(s, stencil_buffer); + pandecode_prop("stencil_buffer_enable = %s", DS_ENABLE(s->stencil_buffer_enable)); + } + + if (s->clear_color_1 | s->clear_color_2 | s->clear_color_3 | s->clear_color_4) { + pandecode_prop("clear_color_1 = 0x%" PRIx32, s->clear_color_1); + pandecode_prop("clear_color_2 = 0x%" PRIx32, s->clear_color_2); + pandecode_prop("clear_color_3 = 0x%" PRIx32, s->clear_color_3); + pandecode_prop("clear_color_4 = 0x%" PRIx32, 
s->clear_color_4); + } + + if (s->clear_depth_1 != 0 || s->clear_depth_2 != 0 || s->clear_depth_3 != 0 || s->clear_depth_4 != 0) { + pandecode_prop("clear_depth_1 = %f", s->clear_depth_1); + pandecode_prop("clear_depth_2 = %f", s->clear_depth_2); + pandecode_prop("clear_depth_3 = %f", s->clear_depth_3); + pandecode_prop("clear_depth_4 = %f", s->clear_depth_4); + } + + if (s->clear_stencil) { + pandecode_prop("clear_stencil = 0x%x", s->clear_stencil); + } + + MEMORY_PROP(s, unknown_address_0); + pandecode_midgard_tiler_descriptor(&s->tiler); + + pandecode_indent--; + pandecode_log("};\n"); + + pandecode_prop("zero0 = 0x%" PRIx64, s->zero0); + pandecode_prop("zero1 = 0x%" PRIx64, s->zero1); + pandecode_prop("zero2 = 0x%" PRIx32, s->zero2); + pandecode_prop("zero4 = 0x%" PRIx32, s->zero4); + + printf(".zero3 = {"); + + for (int i = 0; i < sizeof(s->zero3) / sizeof(s->zero3[0]); ++i) + printf("%X, ", s->zero3[i]); + + printf("},\n"); + + printf(".zero6 = {"); + + for (int i = 0; i < sizeof(s->zero6) / sizeof(s->zero6[0]); ++i) + printf("%X, ", s->zero6[i]); + + printf("},\n"); +} + +static void +pandecode_u32_slide(unsigned name, const u32 *slide, unsigned count) +{ + pandecode_log(".unknown%d = {", name); + + for (int i = 0; i < count; ++i) + printf("%X, ", slide[i]); + + pandecode_log("},\n"); +} + +#define SHORT_SLIDE(num) \ + pandecode_u32_slide(num, s->unknown ## num, ARRAY_SIZE(s->unknown ## num)) + +static void +pandecode_compute_fbd(uint64_t gpu_va, int job_no) +{ + struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va); + const struct mali_compute_fbd *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va); + + pandecode_log("struct mali_compute_fbd framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no); + pandecode_indent++; + + SHORT_SLIDE(1); + + pandecode_indent--; + printf("},\n"); +} + +static void +pandecode_replay_swizzle(unsigned swizzle) +{ + pandecode_prop("swizzle = %s | (%s << 3) | (%s << 6) | (%s << 9)", + pandecode_channel_name((swizzle >> 0) & 0x7), + pandecode_channel_name((swizzle >> 3) & 0x7), + pandecode_channel_name((swizzle >> 6) & 0x7), + pandecode_channel_name((swizzle >> 9) & 0x7)); +} + +static void +pandecode_rt_format(struct mali_rt_format format) +{ + pandecode_log(".format = {\n"); + pandecode_indent++; + + pandecode_prop("unk1 = 0x%" PRIx32, format.unk1); + pandecode_prop("unk2 = 0x%" PRIx32, format.unk2); + pandecode_prop("unk3 = 0x%" PRIx32, format.unk3); + + pandecode_prop("block = %s", + pandecode_mfbd_block_format(format.block)); + + pandecode_prop("nr_channels = MALI_POSITIVE(%d)", + MALI_NEGATIVE(format.nr_channels)); + + pandecode_log(".flags = "); + pandecode_log_decoded_flags(mfbd_fmt_flag_info, format.flags); + pandecode_log_cont(",\n"); + + pandecode_replay_swizzle(format.swizzle); + + pandecode_prop("unk4 = 0x%" PRIx32, format.unk4); + + pandecode_indent--; + pandecode_log("},\n"); +} + +static void +pandecode_render_target(uint64_t gpu_va, unsigned job_no, const struct bifrost_framebuffer *fb) +{ + pandecode_log("struct bifrost_render_target rts_list_%"PRIx64"_%d[] = {\n", gpu_va, job_no); + pandecode_indent++; + + for (int i = 0; i < MALI_NEGATIVE(fb->rt_count_1); i++) { + mali_ptr rt_va = gpu_va + i * sizeof(struct bifrost_render_target); + struct pandecode_mapped_memory *mem = + pandecode_find_mapped_gpu_mem_containing(rt_va); + const struct bifrost_render_target *PANDECODE_PTR_VAR(rt, mem, (mali_ptr) rt_va); + + pandecode_log("{\n"); + pandecode_indent++; + + pandecode_rt_format(rt->format); + + if (rt->format.block == 
MALI_MFBD_BLOCK_AFBC) { + pandecode_log(".afbc = {\n"); + pandecode_indent++; + + char *a = pointer_as_memory_reference(rt->afbc.metadata); + pandecode_prop("metadata = %s", a); + free(a); + + pandecode_prop("stride = %d", rt->afbc.stride); + pandecode_prop("unk = 0x%" PRIx32, rt->afbc.unk); + + pandecode_indent--; + pandecode_log("},\n"); + } else { + pandecode_log(".chunknown = {\n"); + pandecode_indent++; + + pandecode_prop("unk = 0x%" PRIx64, rt->chunknown.unk); + + char *a = pointer_as_memory_reference(rt->chunknown.pointer); + pandecode_prop("pointer = %s", a); + free(a); + + pandecode_indent--; + pandecode_log("},\n"); + } + + MEMORY_PROP(rt, framebuffer); + pandecode_prop("framebuffer_stride = %d", rt->framebuffer_stride); + + if (rt->clear_color_1 | rt->clear_color_2 | rt->clear_color_3 | rt->clear_color_4) { + pandecode_prop("clear_color_1 = 0x%" PRIx32, rt->clear_color_1); + pandecode_prop("clear_color_2 = 0x%" PRIx32, rt->clear_color_2); + pandecode_prop("clear_color_3 = 0x%" PRIx32, rt->clear_color_3); + pandecode_prop("clear_color_4 = 0x%" PRIx32, rt->clear_color_4); + } + + if (rt->zero1 || rt->zero2 || rt->zero3) { + pandecode_msg("render target zeros tripped\n"); + pandecode_prop("zero1 = 0x%" PRIx64, rt->zero1); + pandecode_prop("zero2 = 0x%" PRIx32, rt->zero2); + pandecode_prop("zero3 = 0x%" PRIx32, rt->zero3); + } + + pandecode_indent--; + pandecode_log("},\n"); + } + + pandecode_indent--; + pandecode_log("};\n"); +} + +static unsigned +pandecode_replay_mfbd_bfr(uint64_t gpu_va, int job_no, bool with_render_targets) +{ + struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va); + const struct bifrost_framebuffer *PANDECODE_PTR_VAR(fb, mem, (mali_ptr) gpu_va); + + if (fb->sample_locations) { + /* The blob stores all possible sample locations in a single buffer + * allocated on startup, and just switches the pointer when switching + * MSAA state. For now, we just put the data into the cmdstream, but we + * should do something like what the blob does with a real driver. + * + * There seem to be 32 slots for sample locations, followed by another + * 16. The second 16 is just the center location followed by 15 zeros + * in all the cases I've identified (maybe shader vs. depth/color + * samples?). 
+ */ + + struct pandecode_mapped_memory *smem = pandecode_find_mapped_gpu_mem_containing(fb->sample_locations); + + const u16 *PANDECODE_PTR_VAR(samples, smem, fb->sample_locations); + + pandecode_log("uint16_t sample_locations_%d[] = {\n", job_no); + pandecode_indent++; + + for (int i = 0; i < 32 + 16; i++) { + pandecode_log("%d, %d,\n", samples[2 * i], samples[2 * i + 1]); + } + + pandecode_indent--; + pandecode_log("};\n"); + } + + pandecode_log("struct bifrost_framebuffer framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no); + pandecode_indent++; + + pandecode_prop("unk0 = 0x%x", fb->unk0); + + if (fb->sample_locations) + pandecode_prop("sample_locations = sample_locations_%d", job_no); + + /* Assume that unknown1 was emitted in the last job for + * now */ + MEMORY_PROP(fb, unknown1); + + pandecode_prop("width1 = MALI_POSITIVE(%d)", fb->width1 + 1); + pandecode_prop("height1 = MALI_POSITIVE(%d)", fb->height1 + 1); + pandecode_prop("width2 = MALI_POSITIVE(%d)", fb->width2 + 1); + pandecode_prop("height2 = MALI_POSITIVE(%d)", fb->height2 + 1); + + pandecode_prop("unk1 = 0x%x", fb->unk1); + pandecode_prop("unk2 = 0x%x", fb->unk2); + pandecode_prop("rt_count_1 = MALI_POSITIVE(%d)", fb->rt_count_1 + 1); + pandecode_prop("rt_count_2 = %d", fb->rt_count_2); + + pandecode_log(".mfbd_flags = "); + pandecode_log_decoded_flags(mfbd_flag_info, fb->mfbd_flags); + pandecode_log_cont(",\n"); + + pandecode_prop("clear_stencil = 0x%x", fb->clear_stencil); + pandecode_prop("clear_depth = %f", fb->clear_depth); + + pandecode_prop("unknown2 = 0x%x", fb->unknown2); + MEMORY_PROP(fb, scratchpad); + pandecode_midgard_tiler_descriptor(&fb->tiler); + + if (fb->zero3 || fb->zero4) { + pandecode_msg("framebuffer zeros tripped\n"); + pandecode_prop("zero3 = 0x%" PRIx32, fb->zero3); + pandecode_prop("zero4 = 0x%" PRIx32, fb->zero4); + } + + pandecode_indent--; + pandecode_log("};\n"); + + gpu_va += sizeof(struct bifrost_framebuffer); + + if ((fb->mfbd_flags & MALI_MFBD_EXTRA) && with_render_targets) { + mem = pandecode_find_mapped_gpu_mem_containing(gpu_va); + const struct bifrost_fb_extra *PANDECODE_PTR_VAR(fbx, mem, (mali_ptr) gpu_va); + + pandecode_log("struct bifrost_fb_extra fb_extra_%"PRIx64"_%d = {\n", gpu_va, job_no); + pandecode_indent++; + + MEMORY_PROP(fbx, checksum); + + if (fbx->checksum_stride) + pandecode_prop("checksum_stride = %d", fbx->checksum_stride); + + pandecode_log(".flags = "); + pandecode_log_decoded_flags(mfbd_extra_flag_info, fbx->flags); + pandecode_log_cont(",\n"); + + if (fbx->flags & MALI_EXTRA_AFBC_ZS) { + pandecode_log(".ds_afbc = {\n"); + pandecode_indent++; + + MEMORY_PROP((&fbx->ds_afbc), depth_stencil_afbc_metadata); + pandecode_prop("depth_stencil_afbc_stride = %d", + fbx->ds_afbc.depth_stencil_afbc_stride); + MEMORY_PROP((&fbx->ds_afbc), depth_stencil); + + if (fbx->ds_afbc.zero1 || fbx->ds_afbc.padding) { + pandecode_msg("Depth/stencil AFBC zeros tripped\n"); + pandecode_prop("zero1 = 0x%" PRIx32, + fbx->ds_afbc.zero1); + pandecode_prop("padding = 0x%" PRIx64, + fbx->ds_afbc.padding); + } + + pandecode_indent--; + pandecode_log("},\n"); + } else { + pandecode_log(".ds_linear = {\n"); + pandecode_indent++; + + if (fbx->ds_linear.depth) { + MEMORY_PROP((&fbx->ds_linear), depth); + pandecode_prop("depth_stride = %d", + fbx->ds_linear.depth_stride); + } + + if (fbx->ds_linear.stencil) { + MEMORY_PROP((&fbx->ds_linear), stencil); + pandecode_prop("stencil_stride = %d", + fbx->ds_linear.stencil_stride); + } + + if (fbx->ds_linear.depth_stride_zero || + 
fbx->ds_linear.stencil_stride_zero || + fbx->ds_linear.zero1 || fbx->ds_linear.zero2) { + pandecode_msg("Depth/stencil zeros tripped\n"); + pandecode_prop("depth_stride_zero = 0x%x", + fbx->ds_linear.depth_stride_zero); + pandecode_prop("stencil_stride_zero = 0x%x", + fbx->ds_linear.stencil_stride_zero); + pandecode_prop("zero1 = 0x%" PRIx32, + fbx->ds_linear.zero1); + pandecode_prop("zero2 = 0x%" PRIx32, + fbx->ds_linear.zero2); + } + + pandecode_indent--; + pandecode_log("},\n"); + } + + if (fbx->zero3 || fbx->zero4) { + pandecode_msg("fb_extra zeros tripped\n"); + pandecode_prop("zero3 = 0x%" PRIx64, fbx->zero3); + pandecode_prop("zero4 = 0x%" PRIx64, fbx->zero4); + } + + pandecode_indent--; + pandecode_log("};\n"); + + gpu_va += sizeof(struct bifrost_fb_extra); + } + + if (with_render_targets) + pandecode_render_target(gpu_va, job_no, fb); + + /* Pass back the render target count */ + return MALI_NEGATIVE(fb->rt_count_1); +} + +/* Just add a comment decoding the shift/odd fields forming the padded vertex + * count */ + +static void +pandecode_padded_vertices(unsigned shift, unsigned k) +{ + unsigned odd = 2*k + 1; + unsigned pot = 1 << shift; + pandecode_msg("padded_num_vertices = %d\n", odd * pot); +} + +/* Given a magic divisor, recover what we were trying to divide by. + * + * Let m represent the magic divisor. By definition, m is an element of Z, where + * 0 <= m < 2^N, for N bits in m. + * + * Let q represent the number we would like to divide by. + * + * By definition of a magic divisor for N-bit unsigned integers (a number you + * multiply by to magically get division), m is a number such that: + * + * (m * x) & (2^N - 1) = floor(x/q). + * for all x in Z where 0 <= x < 2^N + * + * Ignore the case where any of the above values equals zero; it is irrelevant + * for our purposes (instanced arrays). + * + * Choose x = q. Then: + * + * (m * x) & (2^N - 1) = floor(x/q). + * (m * q) & (2^N - 1) = floor(q/q). + * + * floor(q/q) = floor(1) = 1, therefore: + * + * (m * q) & (2^N - 1) = 1 + * + * Recall the identity that the bitwise AND with one less than a power of two + * equals the modulo with that power of two, i.e. for all x: + * + * x & (2^N - 1) = x % 2^N + * + * Therefore: + * + * mq % (2^N) = 1 + * + * By definition, a modular multiplicative inverse of a number m is the number + * q such that with respect to a modulus M: + * + * mq % M = 1 + * + * Therefore, q is the modular multiplicative inverse of m with modulus 2^N. + * + */ + +static void +pandecode_magic_divisor(uint32_t magic, unsigned shift, unsigned orig_divisor, unsigned extra) +{ + /* Compute the modular inverse of `magic` with respect to 2^(32 - + * shift) the most lame way possible... just repeatedly add. + * Asymptotically slow but nobody cares in practice, unless you have + * massive numbers of vertices or high divisors. 
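+ *
+ * Illustrative example (made-up numbers, not from a dump): with shift = 0,
+ * dividing by 3 uses the magic constant 0xAAAAAAAB, since
+ * 0xAAAAAAAB * 3 = 0x200000001 = 1 (mod 2^32); the search below recovers
+ * the 3 by finding exactly that inverse.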
*/ + + unsigned inverse = 0; + + /* Magic implicitly has the highest bit set */ + magic |= (1 << 31); + + /* Depending on rounding direction */ + if (extra) + magic++; + + for (;;) { + uint32_t product = magic * inverse; + + if (shift) { + product >>= shift; + } + + if (product == 1) + break; + + ++inverse; + } + + pandecode_msg("dividing by %d (maybe off by two)\n", inverse); + + /* Recall we're supposed to divide by (gl_level_divisor * + * padded_num_vertices) */ + + unsigned padded_num_vertices = inverse / orig_divisor; + + pandecode_msg("padded_num_vertices = %d\n", padded_num_vertices); +} + +static void +pandecode_replay_attributes(const struct pandecode_mapped_memory *mem, + mali_ptr addr, int job_no, char *suffix, + int count, bool varying) +{ + char *prefix = varying ? "varyings" : "attributes"; + + union mali_attr *attr = pandecode_fetch_gpu_mem(mem, addr, sizeof(union mali_attr) * count); + + char base[128]; + snprintf(base, sizeof(base), "%s_data_%d%s", prefix, job_no, suffix); + + for (int i = 0; i < count; ++i) { + enum mali_attr_mode mode = attr[i].elements & 7; + + if (mode == MALI_ATTR_UNUSED) + continue; + + mali_ptr raw_elements = attr[i].elements & ~7; + + /* TODO: Do we maybe want to dump the attribute values + * themselves given the specified format? Or is that too hard? + * */ + + char *a = pointer_as_memory_reference(raw_elements); + pandecode_log("mali_ptr %s_%d_p = %s;\n", base, i, a); + free(a); + } + + pandecode_log("union mali_attr %s_%d[] = {\n", prefix, job_no); + pandecode_indent++; + + for (int i = 0; i < count; ++i) { + pandecode_log("{\n"); + pandecode_indent++; + + unsigned mode = attr[i].elements & 7; + pandecode_prop("elements = (%s_%d_p) | %s", base, i, pandecode_attr_mode_name(mode)); + pandecode_prop("shift = %d", attr[i].shift); + pandecode_prop("extra_flags = %d", attr[i].extra_flags); + pandecode_prop("stride = 0x%" PRIx32, attr[i].stride); + pandecode_prop("size = 0x%" PRIx32, attr[i].size); + + /* Decode further where possible */ + + if (mode == MALI_ATTR_MODULO) { + pandecode_padded_vertices( + attr[i].shift, + attr[i].extra_flags); + } + + pandecode_indent--; + pandecode_log("}, \n"); + + if (mode == MALI_ATTR_NPOT_DIVIDE) { + i++; + pandecode_log("{\n"); + pandecode_indent++; + pandecode_prop("unk = 0x%x", attr[i].unk); + pandecode_prop("magic_divisor = 0x%08x", attr[i].magic_divisor); + if (attr[i].zero != 0) + pandecode_prop("zero = 0x%x /* XXX zero tripped */", attr[i].zero); + pandecode_prop("divisor = %d", attr[i].divisor); + pandecode_magic_divisor(attr[i].magic_divisor, attr[i - 1].shift, attr[i].divisor, attr[i - 1].extra_flags); + pandecode_indent--; + pandecode_log("}, \n"); + } + + } + + pandecode_indent--; + pandecode_log("};\n"); +} + +static mali_ptr +pandecode_replay_shader_address(const char *name, mali_ptr ptr) +{ + /* TODO: Decode flags */ + mali_ptr shader_ptr = ptr & ~15; + + char *a = pointer_as_memory_reference(shader_ptr); + pandecode_prop("%s = (%s) | %d", name, a, (int) (ptr & 15)); + free(a); + + return shader_ptr; +} + +static bool +all_zero(unsigned *buffer, unsigned count) +{ + for (unsigned i = 0; i < count; ++i) { + if (buffer[i]) + return false; + } + + return true; +} + +static void +pandecode_replay_stencil(const char *name, const struct mali_stencil_test *stencil) +{ + if (all_zero((unsigned *) stencil, sizeof(stencil) / sizeof(unsigned))) + return; + + const char *func = pandecode_func_name(stencil->func); + const char *sfail = pandecode_stencil_op_name(stencil->sfail); + const char *dpfail = 
pandecode_stencil_op_name(stencil->dpfail); + const char *dppass = pandecode_stencil_op_name(stencil->dppass); + + if (stencil->zero) + pandecode_msg("Stencil zero tripped: %X\n", stencil->zero); + + pandecode_log(".stencil_%s = {\n", name); + pandecode_indent++; + pandecode_prop("ref = %d", stencil->ref); + pandecode_prop("mask = 0x%02X", stencil->mask); + pandecode_prop("func = %s", func); + pandecode_prop("sfail = %s", sfail); + pandecode_prop("dpfail = %s", dpfail); + pandecode_prop("dppass = %s", dppass); + pandecode_indent--; + pandecode_log("},\n"); +} + +static void +pandecode_replay_blend_equation(const struct mali_blend_equation *blend) +{ + if (blend->zero1) + pandecode_msg("Blend zero tripped: %X\n", blend->zero1); + + pandecode_log(".equation = {\n"); + pandecode_indent++; + + pandecode_prop("rgb_mode = 0x%X", blend->rgb_mode); + pandecode_prop("alpha_mode = 0x%X", blend->alpha_mode); + + pandecode_log(".color_mask = "); + pandecode_log_decoded_flags(mask_flag_info, blend->color_mask); + pandecode_log_cont(",\n"); + + pandecode_indent--; + pandecode_log("},\n"); +} + +/* Decodes a Bifrost blend constant. See the notes in bifrost_blend_rt */ + +static unsigned +decode_bifrost_constant(u16 constant) +{ + float lo = (float) (constant & 0xFF); + float hi = (float) (constant >> 8); + + return (hi / 255.0) + (lo / 65535.0); +} + +static mali_ptr +pandecode_bifrost_blend(void *descs, int job_no, int rt_no) +{ + struct bifrost_blend_rt *b = + ((struct bifrost_blend_rt *) descs) + rt_no; + + pandecode_log("struct bifrost_blend_rt blend_rt_%d_%d = {\n", job_no, rt_no); + pandecode_indent++; + + pandecode_prop("flags = 0x%" PRIx16, b->flags); + pandecode_prop("constant = 0x%" PRIx8 " /* %f */", + b->constant, decode_bifrost_constant(b->constant)); + + /* TODO figure out blend shader enable bit */ + pandecode_replay_blend_equation(&b->equation); + pandecode_prop("unk2 = 0x%" PRIx16, b->unk2); + pandecode_prop("index = 0x%" PRIx16, b->index); + pandecode_prop("shader = 0x%" PRIx32, b->shader); + + pandecode_indent--; + pandecode_log("},\n"); + + return 0; +} + +static mali_ptr +pandecode_midgard_blend(union midgard_blend *blend, bool is_shader) +{ + if (all_zero((unsigned *) blend, sizeof(blend) / sizeof(unsigned))) + return 0; + + pandecode_log(".blend = {\n"); + pandecode_indent++; + + if (is_shader) { + pandecode_replay_shader_address("shader", blend->shader); + } else { + pandecode_replay_blend_equation(&blend->equation); + pandecode_prop("constant = %f", blend->constant); + } + + pandecode_indent--; + pandecode_log("},\n"); + + /* Return blend shader to disassemble if present */ + return is_shader ? (blend->shader & ~0xF) : 0; +} + +static mali_ptr +pandecode_midgard_blend_mrt(void *descs, int job_no, int rt_no) +{ + struct midgard_blend_rt *b = + ((struct midgard_blend_rt *) descs) + rt_no; + + /* Flags determine presence of blend shader */ + bool is_shader = (b->flags & 0xF) >= 0x2; + + pandecode_log("struct midgard_blend_rt blend_rt_%d_%d = {\n", job_no, rt_no); + pandecode_indent++; + + pandecode_prop("flags = 0x%" PRIx64, b->flags); + + mali_ptr shader = pandecode_midgard_blend(&b->blend, is_shader); + + pandecode_indent--; + pandecode_log("};\n"); + + return shader; +} + +static int +pandecode_replay_attribute_meta(int job_no, int count, const struct mali_vertex_tiler_postfix *v, bool varying, char *suffix) +{ + char base[128]; + char *prefix = varying ? 
"varying" : "attribute"; + unsigned max_index = 0; + snprintf(base, sizeof(base), "%s_meta", prefix); + + pandecode_log("struct mali_attr_meta %s_%d%s[] = {\n", base, job_no, suffix); + pandecode_indent++; + + struct mali_attr_meta *attr_meta; + mali_ptr p = varying ? (v->varying_meta & ~0xF) : v->attribute_meta; + + struct pandecode_mapped_memory *attr_mem = pandecode_find_mapped_gpu_mem_containing(p); + + for (int i = 0; i < count; ++i, p += sizeof(struct mali_attr_meta)) { + attr_meta = pandecode_fetch_gpu_mem(attr_mem, p, + sizeof(*attr_mem)); + + pandecode_log("{\n"); + pandecode_indent++; + pandecode_prop("index = %d", attr_meta->index); + + if (attr_meta->index > max_index) + max_index = attr_meta->index; + pandecode_replay_swizzle(attr_meta->swizzle); + pandecode_prop("format = %s", pandecode_format_name(attr_meta->format)); + + pandecode_prop("unknown1 = 0x%" PRIx64, (u64) attr_meta->unknown1); + pandecode_prop("unknown3 = 0x%" PRIx64, (u64) attr_meta->unknown3); + pandecode_prop("src_offset = %d", attr_meta->src_offset); + pandecode_indent--; + pandecode_log("},\n"); + + } + + pandecode_indent--; + pandecode_log("};\n"); + + return max_index; +} + +static void +pandecode_replay_indices(uintptr_t pindices, uint32_t index_count, int job_no) +{ + struct pandecode_mapped_memory *imem = pandecode_find_mapped_gpu_mem_containing(pindices); + + if (imem) { + /* Indices are literally just a u32 array :) */ + + uint32_t *PANDECODE_PTR_VAR(indices, imem, pindices); + + pandecode_log("uint32_t indices_%d[] = {\n", job_no); + pandecode_indent++; + + for (unsigned i = 0; i < (index_count + 1); i += 3) + pandecode_log("%d, %d, %d,\n", + indices[i], + indices[i + 1], + indices[i + 2]); + + pandecode_indent--; + pandecode_log("};\n"); + } +} + +/* return bits [lo, hi) of word */ +static u32 +bits(u32 word, u32 lo, u32 hi) +{ + if (hi - lo >= 32) + return word; // avoid undefined behavior with the shift + + return (word >> lo) & ((1 << (hi - lo)) - 1); +} + +static void +pandecode_replay_vertex_tiler_prefix(struct mali_vertex_tiler_prefix *p, int job_no) +{ + pandecode_log_cont("{\n"); + pandecode_indent++; + + pandecode_prop("invocation_count = 0x%" PRIx32, p->invocation_count); + pandecode_prop("size_y_shift = %d", p->size_y_shift); + pandecode_prop("size_z_shift = %d", p->size_z_shift); + pandecode_prop("workgroups_x_shift = %d", p->workgroups_x_shift); + pandecode_prop("workgroups_y_shift = %d", p->workgroups_y_shift); + pandecode_prop("workgroups_z_shift = %d", p->workgroups_z_shift); + pandecode_prop("workgroups_x_shift_2 = 0x%" PRIx32, p->workgroups_x_shift_2); + + /* Decode invocation_count. See the comment before the definition of + * invocation_count for an explanation. 
+ */ + pandecode_msg("size: (%d, %d, %d)\n", + bits(p->invocation_count, 0, p->size_y_shift) + 1, + bits(p->invocation_count, p->size_y_shift, p->size_z_shift) + 1, + bits(p->invocation_count, p->size_z_shift, + p->workgroups_x_shift) + 1); + pandecode_msg("workgroups: (%d, %d, %d)\n", + bits(p->invocation_count, p->workgroups_x_shift, + p->workgroups_y_shift) + 1, + bits(p->invocation_count, p->workgroups_y_shift, + p->workgroups_z_shift) + 1, + bits(p->invocation_count, p->workgroups_z_shift, + 32) + 1); + + /* TODO: Decode */ + if (p->unknown_draw) + pandecode_prop("unknown_draw = 0x%" PRIx32, p->unknown_draw); + + pandecode_prop("workgroups_x_shift_3 = 0x%" PRIx32, p->workgroups_x_shift_3); + + pandecode_prop("draw_mode = %s", pandecode_draw_mode_name(p->draw_mode)); + + /* Index count only exists for tiler jobs anyway */ + + if (p->index_count) + pandecode_prop("index_count = MALI_POSITIVE(%" PRId32 ")", p->index_count + 1); + + if (p->negative_start) + pandecode_prop("negative_start = %d", p->negative_start); + + DYN_MEMORY_PROP(p, job_no, indices); + + if (p->zero1) { + pandecode_msg("Zero tripped\n"); + pandecode_prop("zero1 = 0x%" PRIx32, p->zero1); + } + + pandecode_indent--; + pandecode_log("},\n"); +} + +static void +pandecode_replay_uniform_buffers(mali_ptr pubufs, int ubufs_count, int job_no) +{ + struct pandecode_mapped_memory *umem = pandecode_find_mapped_gpu_mem_containing(pubufs); + + struct mali_uniform_buffer_meta *PANDECODE_PTR_VAR(ubufs, umem, pubufs); + + for (int i = 0; i < ubufs_count; i++) { + mali_ptr ptr = ubufs[i].ptr << 2; + struct pandecode_mapped_memory *umem2 = pandecode_find_mapped_gpu_mem_containing(ptr); + uint32_t *PANDECODE_PTR_VAR(ubuf, umem2, ptr); + char name[50]; + snprintf(name, sizeof(name), "ubuf_%d", i); + /* The blob uses ubuf 0 to upload internal stuff and + * uniforms that won't fit/are accessed indirectly, so + * it puts it in the batchbuffer. 
+ */ + pandecode_log("uint32_t %s_%d[] = {\n", name, job_no); + pandecode_indent++; + + for (int j = 0; j <= ubufs[i].size; j++) { + for (int k = 0; k < 4; k++) { + if (k == 0) + pandecode_log("0x%"PRIx32", ", ubuf[4 * j + k]); + else + pandecode_log_cont("0x%"PRIx32", ", ubuf[4 * j + k]); + + } + + pandecode_log_cont("\n"); + } + + pandecode_indent--; + pandecode_log("};\n"); + } + + pandecode_log("struct mali_uniform_buffer_meta uniform_buffers_%d[] = {\n", + job_no); + pandecode_indent++; + + for (int i = 0; i < ubufs_count; i++) { + pandecode_log("{\n"); + pandecode_indent++; + pandecode_prop("size = MALI_POSITIVE(%d)", ubufs[i].size + 1); + pandecode_prop("ptr = ubuf_%d_%d_p >> 2", i, job_no); + pandecode_indent--; + pandecode_log("},\n"); + } + + pandecode_indent--; + pandecode_log("};\n"); +} + +static void +pandecode_replay_scratchpad(uintptr_t pscratchpad, int job_no, char *suffix) +{ + + struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(pscratchpad); + + struct bifrost_scratchpad *PANDECODE_PTR_VAR(scratchpad, mem, pscratchpad); + + if (scratchpad->zero) + pandecode_msg("XXX scratchpad zero tripped"); + + pandecode_log("struct bifrost_scratchpad scratchpad_%"PRIx64"_%d%s = {\n", pscratchpad, job_no, suffix); + pandecode_indent++; + + pandecode_prop("flags = 0x%x", scratchpad->flags); + MEMORY_PROP(scratchpad, gpu_scratchpad); + + pandecode_indent--; + pandecode_log("};\n"); +} + +static void +pandecode_shader_disassemble(mali_ptr shader_ptr, int shader_no, int type, + bool is_bifrost) +{ + struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(shader_ptr); + uint8_t *PANDECODE_PTR_VAR(code, mem, shader_ptr); + + /* Compute maximum possible size */ + size_t sz = mem->length - (shader_ptr - mem->gpu_va); + + /* Print some boilerplate to clearly denote the assembly (which doesn't + * obey indentation rules), and actually do the disassembly! */ + + printf("\n\n"); + + if (is_bifrost) { + disassemble_bifrost(code, sz, false); + } else { + disassemble_midgard(code, sz); + } + + printf("\n\n"); +} + +static void +pandecode_replay_vertex_tiler_postfix_pre(const struct mali_vertex_tiler_postfix *p, + int job_no, enum mali_job_type job_type, + char *suffix, bool is_bifrost) +{ + mali_ptr shader_meta_ptr = (u64) (uintptr_t) (p->_shader_upper << 4); + struct pandecode_mapped_memory *attr_mem; + + unsigned rt_count = 1; + + /* On Bifrost, since the tiler heap (for tiler jobs) and the scratchpad + * are the only things actually needed from the FBD, vertex/tiler jobs + * no longer reference the FBD -- instead, this field points to some + * info about the scratchpad. 
+ */ + if (is_bifrost) + pandecode_replay_scratchpad(p->framebuffer & ~FBD_TYPE, job_no, suffix); + else if (p->framebuffer & MALI_MFBD) + rt_count = pandecode_replay_mfbd_bfr((u64) ((uintptr_t) p->framebuffer) & FBD_MASK, job_no, false); + else if (job_type == JOB_TYPE_COMPUTE) + pandecode_compute_fbd((u64) (uintptr_t) p->framebuffer, job_no); + else + pandecode_replay_sfbd((u64) (uintptr_t) p->framebuffer, job_no); + + int varying_count = 0, attribute_count = 0, uniform_count = 0, uniform_buffer_count = 0; + int texture_count = 0, sampler_count = 0; + + if (shader_meta_ptr) { + struct pandecode_mapped_memory *smem = pandecode_find_mapped_gpu_mem_containing(shader_meta_ptr); + struct mali_shader_meta *PANDECODE_PTR_VAR(s, smem, shader_meta_ptr); + + pandecode_log("struct mali_shader_meta shader_meta_%"PRIx64"_%d%s = {\n", shader_meta_ptr, job_no, suffix); + pandecode_indent++; + + /* Save for dumps */ + attribute_count = s->attribute_count; + varying_count = s->varying_count; + texture_count = s->texture_count; + sampler_count = s->sampler_count; + + if (is_bifrost) { + uniform_count = s->bifrost2.uniform_count; + uniform_buffer_count = s->bifrost1.uniform_buffer_count; + } else { + uniform_count = s->midgard1.uniform_count; + uniform_buffer_count = s->midgard1.uniform_buffer_count; + } + + mali_ptr shader_ptr = pandecode_replay_shader_address("shader", s->shader); + + pandecode_prop("texture_count = %" PRId16, s->texture_count); + pandecode_prop("sampler_count = %" PRId16, s->sampler_count); + pandecode_prop("attribute_count = %" PRId16, s->attribute_count); + pandecode_prop("varying_count = %" PRId16, s->varying_count); + + if (is_bifrost) { + pandecode_log(".bifrost1 = {\n"); + pandecode_indent++; + + pandecode_prop("uniform_buffer_count = %" PRId32, s->bifrost1.uniform_buffer_count); + pandecode_prop("unk1 = 0x%" PRIx32, s->bifrost1.unk1); + + pandecode_indent--; + pandecode_log("},\n"); + } else { + pandecode_log(".midgard1 = {\n"); + pandecode_indent++; + + pandecode_prop("uniform_count = %" PRId16, s->midgard1.uniform_count); + pandecode_prop("uniform_buffer_count = %" PRId16, s->midgard1.uniform_buffer_count); + pandecode_prop("work_count = %" PRId16, s->midgard1.work_count); + + pandecode_log(".flags = "); + pandecode_log_decoded_flags(shader_midgard1_flag_info, s->midgard1.flags); + pandecode_log_cont(",\n"); + + pandecode_prop("unknown2 = 0x%" PRIx32, s->midgard1.unknown2); + + pandecode_indent--; + pandecode_log("},\n"); + } + + if (s->depth_units || s->depth_factor) { + if (is_bifrost) + pandecode_prop("depth_units = %f", s->depth_units); + else + pandecode_prop("depth_units = MALI_NEGATIVE(%f)", s->depth_units - 1.0f); + + pandecode_prop("depth_factor = %f", s->depth_factor); + } + + if (s->alpha_coverage) { + bool invert_alpha_coverage = s->alpha_coverage & 0xFFF0; + uint16_t inverted_coverage = invert_alpha_coverage ? ~s->alpha_coverage : s->alpha_coverage; + + pandecode_prop("alpha_coverage = %sMALI_ALPHA_COVERAGE(%f)", + invert_alpha_coverage ? 
"~" : "", + MALI_GET_ALPHA_COVERAGE(inverted_coverage)); + } + + if (s->unknown2_3 || s->unknown2_4) { + pandecode_log(".unknown2_3 = "); + + int unknown2_3 = s->unknown2_3; + int unknown2_4 = s->unknown2_4; + + /* We're not quite sure what these flags mean without the depth test, if anything */ + + if (unknown2_3 & (MALI_DEPTH_TEST | MALI_DEPTH_FUNC_MASK)) { + const char *func = pandecode_func_name(MALI_GET_DEPTH_FUNC(unknown2_3)); + unknown2_3 &= ~MALI_DEPTH_FUNC_MASK; + + pandecode_log_cont("MALI_DEPTH_FUNC(%s) | ", func); + } + + pandecode_log_decoded_flags(u3_flag_info, unknown2_3); + pandecode_log_cont(",\n"); + + pandecode_log(".unknown2_4 = "); + pandecode_log_decoded_flags(u4_flag_info, unknown2_4); + pandecode_log_cont(",\n"); + } + + if (s->stencil_mask_front || s->stencil_mask_back) { + pandecode_prop("stencil_mask_front = 0x%02X", s->stencil_mask_front); + pandecode_prop("stencil_mask_back = 0x%02X", s->stencil_mask_back); + } + + pandecode_replay_stencil("front", &s->stencil_front); + pandecode_replay_stencil("back", &s->stencil_back); + + if (is_bifrost) { + pandecode_log(".bifrost2 = {\n"); + pandecode_indent++; + + pandecode_prop("unk3 = 0x%" PRIx32, s->bifrost2.unk3); + pandecode_prop("preload_regs = 0x%" PRIx32, s->bifrost2.preload_regs); + pandecode_prop("uniform_count = %" PRId32, s->bifrost2.uniform_count); + pandecode_prop("unk4 = 0x%" PRIx32, s->bifrost2.unk4); + + pandecode_indent--; + pandecode_log("},\n"); + } else if (s->midgard2.unknown2_7) { + pandecode_log(".midgard2 = {\n"); + pandecode_indent++; + + pandecode_prop("unknown2_7 = 0x%" PRIx32, s->midgard2.unknown2_7); + pandecode_indent--; + pandecode_log("},\n"); + } + + if (s->unknown2_8) + pandecode_prop("unknown2_8 = 0x%" PRIx32, s->unknown2_8); + + if (!is_bifrost) { + /* TODO: Blend shaders routing/disasm */ + + pandecode_midgard_blend(&s->blend, false); + } + + pandecode_indent--; + pandecode_log("};\n"); + + /* MRT blend fields are used whenever MFBD is used, with + * per-RT descriptors */ + + if (job_type == JOB_TYPE_TILER) { + void* blend_base = (void *) (s + 1); + + for (unsigned i = 0; i < rt_count; i++) { + mali_ptr shader = 0; + + if (is_bifrost) + shader = pandecode_bifrost_blend(blend_base, job_no, i); + else + shader = pandecode_midgard_blend_mrt(blend_base, job_no, i); + + if (shader) + pandecode_shader_disassemble(shader, job_no, job_type, false); + } + } + + pandecode_shader_disassemble(shader_ptr, job_no, job_type, is_bifrost); + } else + pandecode_msg("<no shader>\n"); + + if (p->viewport) { + struct pandecode_mapped_memory *fmem = pandecode_find_mapped_gpu_mem_containing(p->viewport); + struct mali_viewport *PANDECODE_PTR_VAR(f, fmem, p->viewport); + + pandecode_log("struct mali_viewport viewport_%d%s = {\n", job_no, suffix); + pandecode_indent++; + + pandecode_prop("clip_minx = %f", f->clip_minx); + pandecode_prop("clip_miny = %f", f->clip_miny); + pandecode_prop("clip_minz = %f", f->clip_minz); + pandecode_prop("clip_maxx = %f", f->clip_maxx); + pandecode_prop("clip_maxy = %f", f->clip_maxy); + pandecode_prop("clip_maxz = %f", f->clip_maxz); + + /* Only the higher coordinates are MALI_POSITIVE scaled */ + + pandecode_prop("viewport0 = { %d, %d }", + f->viewport0[0], f->viewport0[1]); + + pandecode_prop("viewport1 = { MALI_POSITIVE(%d), MALI_POSITIVE(%d) }", + f->viewport1[0] + 1, f->viewport1[1] + 1); + + pandecode_indent--; + pandecode_log("};\n"); + } + + if (p->attribute_meta) { + unsigned max_attr_index = pandecode_replay_attribute_meta(job_no, attribute_count, p, false, suffix); + 
+ attr_mem = pandecode_find_mapped_gpu_mem_containing(p->attributes); + pandecode_replay_attributes(attr_mem, p->attributes, job_no, suffix, max_attr_index + 1, false); + } + + /* Varyings are encoded like attributes but not actually sent; we just + * pass a zero buffer with the right stride/size set, (or whatever) + * since the GPU will write to it itself */ + + if (p->varyings) { + attr_mem = pandecode_find_mapped_gpu_mem_containing(p->varyings); + + /* Number of descriptors depends on whether there are + * non-internal varyings */ + + pandecode_replay_attributes(attr_mem, p->varyings, job_no, suffix, varying_count > 1 ? 4 : 1, true); + } + + if (p->varying_meta) { + pandecode_replay_attribute_meta(job_no, varying_count, p, true, suffix); + } + + bool is_compute = job_type == JOB_TYPE_COMPUTE; + + if (p->uniforms && !is_compute) { + int rows = uniform_count, width = 4; + size_t sz = rows * width * sizeof(float); + + struct pandecode_mapped_memory *uniform_mem = pandecode_find_mapped_gpu_mem_containing(p->uniforms); + pandecode_fetch_gpu_mem(uniform_mem, p->uniforms, sz); + u32 *PANDECODE_PTR_VAR(uniforms, uniform_mem, p->uniforms); + + pandecode_log("u32 uniforms_%d%s[] = {\n", job_no, suffix); + + pandecode_indent++; + + for (int row = 0; row < rows; row++) { + for (int i = 0; i < width; i++) { + u32 v = uniforms[i]; + float f; + memcpy(&f, &v, sizeof(v)); + pandecode_log_cont("%X /* %f */, ", v, f); + } + + pandecode_log_cont("\n"); + + uniforms += width; + } + + pandecode_indent--; + pandecode_log("};\n"); + } else if (p->uniforms) { + int rows = uniform_count * 2; + size_t sz = rows * sizeof(mali_ptr); + + struct pandecode_mapped_memory *uniform_mem = pandecode_find_mapped_gpu_mem_containing(p->uniforms); + pandecode_fetch_gpu_mem(uniform_mem, p->uniforms, sz); + mali_ptr *PANDECODE_PTR_VAR(uniforms, uniform_mem, p->uniforms); + + pandecode_log("mali_ptr uniforms_%d%s[] = {\n", job_no, suffix); + + pandecode_indent++; + + for (int row = 0; row < rows; row++) { + char *a = pointer_as_memory_reference(uniforms[row]); + pandecode_log("%s,\n", a); + free(a); + } + + pandecode_indent--; + pandecode_log("};\n"); + + } + + if (p->uniform_buffers) { + pandecode_replay_uniform_buffers(p->uniform_buffers, uniform_buffer_count, job_no); + } + + if (p->texture_trampoline) { + struct pandecode_mapped_memory *mmem = pandecode_find_mapped_gpu_mem_containing(p->texture_trampoline); + + if (mmem) { + mali_ptr *PANDECODE_PTR_VAR(u, mmem, p->texture_trampoline); + + pandecode_log("uint64_t texture_trampoline_%d[] = {\n", job_no); + pandecode_indent++; + + for (int tex = 0; tex < texture_count; ++tex) { + mali_ptr *PANDECODE_PTR_VAR(u, mmem, p->texture_trampoline + tex * sizeof(mali_ptr)); + char *a = pointer_as_memory_reference(*u); + pandecode_log("%s,\n", a); + free(a); + } + + pandecode_indent--; + pandecode_log("};\n"); + + /* Now, finally, descend down into the texture descriptor */ + for (int tex = 0; tex < texture_count; ++tex) { + mali_ptr *PANDECODE_PTR_VAR(u, mmem, p->texture_trampoline + tex * sizeof(mali_ptr)); + struct pandecode_mapped_memory *tmem = pandecode_find_mapped_gpu_mem_containing(*u); + + if (tmem) { + struct mali_texture_descriptor *PANDECODE_PTR_VAR(t, tmem, *u); + + pandecode_log("struct mali_texture_descriptor texture_descriptor_%"PRIx64"_%d_%d = {\n", *u, job_no, tex); + pandecode_indent++; + + pandecode_prop("width = MALI_POSITIVE(%" PRId16 ")", t->width + 1); + pandecode_prop("height = MALI_POSITIVE(%" PRId16 ")", t->height + 1); + pandecode_prop("depth = MALI_POSITIVE(%" 
PRId16 ")", t->depth + 1); + pandecode_prop("array_size = MALI_POSITIVE(%" PRId16 ")", t->array_size + 1); + pandecode_prop("unknown3 = %" PRId16, t->unknown3); + pandecode_prop("unknown3A = %" PRId8, t->unknown3A); + pandecode_prop("nr_mipmap_levels = %" PRId8, t->nr_mipmap_levels); + + struct mali_texture_format f = t->format; + + pandecode_log(".format = {\n"); + pandecode_indent++; + + pandecode_replay_swizzle(f.swizzle); + pandecode_prop("format = %s", pandecode_format_name(f.format)); + pandecode_prop("type = %s", pandecode_texture_type(f.type)); + pandecode_prop("srgb = %" PRId32, f.srgb); + pandecode_prop("unknown1 = %" PRId32, f.unknown1); + pandecode_prop("usage2 = 0x%" PRIx32, f.usage2); + + pandecode_indent--; + pandecode_log("},\n"); + + pandecode_replay_swizzle(t->swizzle); + + if (t->swizzle_zero) { + /* Shouldn't happen */ + pandecode_msg("Swizzle zero tripped but replay will be fine anyway"); + pandecode_prop("swizzle_zero = %d", t->swizzle_zero); + } + + pandecode_prop("unknown3 = 0x%" PRIx32, t->unknown3); + + pandecode_prop("unknown5 = 0x%" PRIx32, t->unknown5); + pandecode_prop("unknown6 = 0x%" PRIx32, t->unknown6); + pandecode_prop("unknown7 = 0x%" PRIx32, t->unknown7); + + pandecode_log(".payload = {\n"); + pandecode_indent++; + + /* A bunch of bitmap pointers follow. + * We work out the correct number, + * based on the mipmap/cubemap + * properties, but dump extra + * possibilities to futureproof */ + + int bitmap_count = MALI_NEGATIVE(t->nr_mipmap_levels); + bool manual_stride = f.usage2 & MALI_TEX_MANUAL_STRIDE; + + /* Miptree for each face */ + if (f.type == MALI_TEX_CUBE) + bitmap_count *= 6; + + /* Array of textures */ + bitmap_count *= MALI_NEGATIVE(t->array_size); + + /* Stride for each element */ + if (manual_stride) + bitmap_count *= 2; + + /* Sanity check the size */ + int max_count = sizeof(t->payload) / sizeof(t->payload[0]); + assert (bitmap_count <= max_count); + + /* Dump more to be safe, but not _that_ much more */ + int safe_count = MIN2(bitmap_count * 2, max_count); + + for (int i = 0; i < safe_count; ++i) { + char *prefix = (i >= bitmap_count) ? 
"// " : ""; + + /* How we dump depends if this is a stride or a pointer */ + + if ((f.usage2 & MALI_TEX_MANUAL_STRIDE) && (i & 1)) { + /* signed 32-bit snuck in as a 64-bit pointer */ + uint64_t stride_set = t->payload[i]; + uint32_t clamped_stride = stride_set; + int32_t stride = clamped_stride; + assert(stride_set == clamped_stride); + pandecode_log("%s(mali_ptr) %d /* stride */, \n", prefix, stride); + } else { + char *a = pointer_as_memory_reference(t->payload[i]); + pandecode_log("%s%s, \n", prefix, a); + free(a); + } + } + + pandecode_indent--; + pandecode_log("},\n"); + + pandecode_indent--; + pandecode_log("};\n"); + } + } + } + } + + if (p->sampler_descriptor) { + struct pandecode_mapped_memory *smem = pandecode_find_mapped_gpu_mem_containing(p->sampler_descriptor); + + if (smem) { + struct mali_sampler_descriptor *s; + + mali_ptr d = p->sampler_descriptor; + + for (int i = 0; i < sampler_count; ++i) { + s = pandecode_fetch_gpu_mem(smem, d + sizeof(*s) * i, sizeof(*s)); + + pandecode_log("struct mali_sampler_descriptor sampler_descriptor_%d_%d = {\n", job_no, i); + pandecode_indent++; + + /* Only the lower two bits are understood right now; the rest we display as hex */ + pandecode_log(".filter_mode = MALI_TEX_MIN(%s) | MALI_TEX_MAG(%s) | 0x%" PRIx32",\n", + MALI_FILTER_NAME(s->filter_mode & MALI_TEX_MIN_MASK), + MALI_FILTER_NAME(s->filter_mode & MALI_TEX_MAG_MASK), + s->filter_mode & ~3); + + pandecode_prop("min_lod = FIXED_16(%f)", DECODE_FIXED_16(s->min_lod)); + pandecode_prop("max_lod = FIXED_16(%f)", DECODE_FIXED_16(s->max_lod)); + + pandecode_prop("wrap_s = %s", pandecode_wrap_mode_name(s->wrap_s)); + pandecode_prop("wrap_t = %s", pandecode_wrap_mode_name(s->wrap_t)); + pandecode_prop("wrap_r = %s", pandecode_wrap_mode_name(s->wrap_r)); + + pandecode_prop("compare_func = %s", pandecode_alt_func_name(s->compare_func)); + + if (s->zero || s->zero2) { + pandecode_msg("Zero tripped\n"); + pandecode_prop("zero = 0x%X, 0x%X\n", s->zero, s->zero2); + } + + pandecode_prop("seamless_cube_map = %d", s->seamless_cube_map); + + pandecode_prop("border_color = { %f, %f, %f, %f }", + s->border_color[0], + s->border_color[1], + s->border_color[2], + s->border_color[3]); + + pandecode_indent--; + pandecode_log("};\n"); + } + } + } +} + +static void +pandecode_replay_vertex_tiler_postfix(const struct mali_vertex_tiler_postfix *p, int job_no, bool is_bifrost) +{ + pandecode_log_cont("{\n"); + pandecode_indent++; + + MEMORY_PROP(p, position_varying); + DYN_MEMORY_PROP(p, job_no, uniform_buffers); + DYN_MEMORY_PROP(p, job_no, texture_trampoline); + DYN_MEMORY_PROP(p, job_no, sampler_descriptor); + DYN_MEMORY_PROP(p, job_no, uniforms); + DYN_MEMORY_PROP(p, job_no, attributes); + DYN_MEMORY_PROP(p, job_no, attribute_meta); + DYN_MEMORY_PROP(p, job_no, varyings); + DYN_MEMORY_PROP(p, job_no, varying_meta); + DYN_MEMORY_PROP(p, job_no, viewport); + DYN_MEMORY_PROP(p, job_no, occlusion_counter); + + if (is_bifrost) + pandecode_prop("framebuffer = scratchpad_%d_p", job_no); + else + pandecode_prop("framebuffer = framebuffer_%d_p | %s", job_no, p->framebuffer & MALI_MFBD ? 
"MALI_MFBD" : "0"); + + pandecode_prop("_shader_upper = (shader_meta_%d_p) >> 4", job_no); + pandecode_prop("flags = %d", p->flags); + + pandecode_indent--; + pandecode_log("},\n"); +} + +static void +pandecode_replay_vertex_only_bfr(struct bifrost_vertex_only *v) +{ + pandecode_log_cont("{\n"); + pandecode_indent++; + + pandecode_prop("unk2 = 0x%x", v->unk2); + + if (v->zero0 || v->zero1) { + pandecode_msg("vertex only zero tripped"); + pandecode_prop("zero0 = 0x%" PRIx32, v->zero0); + pandecode_prop("zero1 = 0x%" PRIx64, v->zero1); + } + + pandecode_indent--; + pandecode_log("}\n"); +} + +static void +pandecode_replay_tiler_heap_meta(mali_ptr gpu_va, int job_no) +{ + + struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va); + const struct bifrost_tiler_heap_meta *PANDECODE_PTR_VAR(h, mem, gpu_va); + + pandecode_log("struct mali_tiler_heap_meta tiler_heap_meta_%d = {\n", job_no); + pandecode_indent++; + + if (h->zero) { + pandecode_msg("tiler heap zero tripped\n"); + pandecode_prop("zero = 0x%x", h->zero); + } + + for (int i = 0; i < 12; i++) { + if (h->zeros[i] != 0) { + pandecode_msg("tiler heap zero %d tripped, value %x\n", + i, h->zeros[i]); + } + } + + pandecode_prop("heap_size = 0x%x", h->heap_size); + MEMORY_PROP(h, tiler_heap_start); + MEMORY_PROP(h, tiler_heap_free); + + /* this might point to the beginning of another buffer, when it's + * really the end of the tiler heap buffer, so we have to be careful + * here. + */ + char *a = pointer_as_memory_reference(h->tiler_heap_end - 1); + pandecode_prop("tiler_heap_end = %s + 1", a); + free(a); + + pandecode_indent--; + pandecode_log("};\n"); +} + +static void +pandecode_replay_tiler_meta(mali_ptr gpu_va, int job_no) +{ + struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va); + const struct bifrost_tiler_meta *PANDECODE_PTR_VAR(t, mem, gpu_va); + + pandecode_replay_tiler_heap_meta(t->tiler_heap_meta, job_no); + + pandecode_log("struct bifrost_tiler_meta tiler_meta_%d = {\n", job_no); + pandecode_indent++; + + if (t->zero0 || t->zero1) { + pandecode_msg("tiler meta zero tripped"); + pandecode_prop("zero0 = 0x%" PRIx64, t->zero0); + pandecode_prop("zero1 = 0x%" PRIx64, t->zero1); + } + + pandecode_prop("hierarchy_mask = 0x%" PRIx16, t->hierarchy_mask); + pandecode_prop("flags = 0x%" PRIx16, t->flags); + + pandecode_prop("width = MALI_POSITIVE(%d)", t->width + 1); + pandecode_prop("height = MALI_POSITIVE(%d)", t->height + 1); + DYN_MEMORY_PROP(t, job_no, tiler_heap_meta); + + for (int i = 0; i < 12; i++) { + if (t->zeros[i] != 0) { + pandecode_msg("tiler heap zero %d tripped, value %" PRIx64 "\n", + i, t->zeros[i]); + } + } + + pandecode_indent--; + pandecode_log("};\n"); +} + +static void +pandecode_replay_gl_enables(uint32_t gl_enables, int job_type) +{ + pandecode_log(".gl_enables = "); + + pandecode_log_decoded_flags(gl_enable_flag_info, gl_enables); + + pandecode_log_cont(",\n"); +} + +static void +pandecode_replay_primitive_size(union midgard_primitive_size u, bool constant) +{ + if (u.pointer == 0x0) + return; + + pandecode_log(".primitive_size = {\n"); + pandecode_indent++; + + if (constant) { + pandecode_prop("constant = %f", u.constant); + } else { + MEMORY_PROP((&u), pointer); + } + + pandecode_indent--; + pandecode_log("},\n"); +} + +static void +pandecode_replay_tiler_only_bfr(const struct bifrost_tiler_only *t, int job_no) +{ + pandecode_log_cont("{\n"); + pandecode_indent++; + + /* TODO: gl_PointSize on Bifrost */ + 
pandecode_replay_primitive_size(t->primitive_size, true); + + DYN_MEMORY_PROP(t, job_no, tiler_meta); + pandecode_replay_gl_enables(t->gl_enables, JOB_TYPE_TILER); + + if (t->zero1 || t->zero2 || t->zero3 || t->zero4 || t->zero5 + || t->zero6 || t->zero7 || t->zero8) { + pandecode_msg("tiler only zero tripped"); + pandecode_prop("zero1 = 0x%" PRIx64, t->zero1); + pandecode_prop("zero2 = 0x%" PRIx64, t->zero2); + pandecode_prop("zero3 = 0x%" PRIx64, t->zero3); + pandecode_prop("zero4 = 0x%" PRIx64, t->zero4); + pandecode_prop("zero5 = 0x%" PRIx64, t->zero5); + pandecode_prop("zero6 = 0x%" PRIx64, t->zero6); + pandecode_prop("zero7 = 0x%" PRIx32, t->zero7); + pandecode_prop("zero8 = 0x%" PRIx64, t->zero8); + } + + pandecode_indent--; + pandecode_log("},\n"); +} + +static int +pandecode_replay_vertex_job_bfr(const struct mali_job_descriptor_header *h, + const struct pandecode_mapped_memory *mem, + mali_ptr payload, int job_no) +{ + struct bifrost_payload_vertex *PANDECODE_PTR_VAR(v, mem, payload); + + pandecode_replay_vertex_tiler_postfix_pre(&v->postfix, job_no, h->job_type, "", true); + + pandecode_log("struct bifrost_payload_vertex payload_%d = {\n", job_no); + pandecode_indent++; + + pandecode_log(".prefix = "); + pandecode_replay_vertex_tiler_prefix(&v->prefix, job_no); + + pandecode_log(".vertex = "); + pandecode_replay_vertex_only_bfr(&v->vertex); + + pandecode_log(".postfix = "); + pandecode_replay_vertex_tiler_postfix(&v->postfix, job_no, true); + + pandecode_indent--; + pandecode_log("};\n"); + + return sizeof(*v); +} + +static int +pandecode_replay_tiler_job_bfr(const struct mali_job_descriptor_header *h, + const struct pandecode_mapped_memory *mem, + mali_ptr payload, int job_no) +{ + struct bifrost_payload_tiler *PANDECODE_PTR_VAR(t, mem, payload); + + pandecode_replay_vertex_tiler_postfix_pre(&t->postfix, job_no, h->job_type, "", true); + + pandecode_replay_indices(t->prefix.indices, t->prefix.index_count, job_no); + pandecode_replay_tiler_meta(t->tiler.tiler_meta, job_no); + + pandecode_log("struct bifrost_payload_tiler payload_%d = {\n", job_no); + pandecode_indent++; + + pandecode_log(".prefix = "); + pandecode_replay_vertex_tiler_prefix(&t->prefix, job_no); + + pandecode_log(".tiler = "); + pandecode_replay_tiler_only_bfr(&t->tiler, job_no); + + pandecode_log(".postfix = "); + pandecode_replay_vertex_tiler_postfix(&t->postfix, job_no, true); + + pandecode_indent--; + pandecode_log("};\n"); + + return sizeof(*t); +} + +static int +pandecode_replay_vertex_or_tiler_job_mdg(const struct mali_job_descriptor_header *h, + const struct pandecode_mapped_memory *mem, + mali_ptr payload, int job_no) +{ + struct midgard_payload_vertex_tiler *PANDECODE_PTR_VAR(v, mem, payload); + + pandecode_replay_vertex_tiler_postfix_pre(&v->postfix, job_no, h->job_type, "", false); + + pandecode_replay_indices(v->prefix.indices, v->prefix.index_count, job_no); + + pandecode_log("struct midgard_payload_vertex_tiler payload_%d = {\n", job_no); + pandecode_indent++; + + bool has_primitive_pointer = v->prefix.unknown_draw & MALI_DRAW_VARYING_SIZE; + pandecode_replay_primitive_size(v->primitive_size, !has_primitive_pointer); + + pandecode_log(".prefix = "); + pandecode_replay_vertex_tiler_prefix(&v->prefix, job_no); + + pandecode_replay_gl_enables(v->gl_enables, h->job_type); + + if (v->instance_shift || v->instance_odd) { + pandecode_prop("instance_shift = 0x%d /* %d */", + v->instance_shift, 1 << v->instance_shift); + pandecode_prop("instance_odd = 0x%X /* %d */", + v->instance_odd, (2 * v->instance_odd) 
+ 1);
+
+                pandecode_padded_vertices(v->instance_shift, v->instance_odd);
+        }
+
+        if (v->draw_start)
+                pandecode_prop("draw_start = %d", v->draw_start);
+
+#ifndef __LP64__
+
+        if (v->zero3) {
+                pandecode_msg("Zero tripped\n");
+                pandecode_prop("zero3 = 0x%" PRIx32, v->zero3);
+        }
+
+#endif
+
+        if (v->zero5) {
+                pandecode_msg("Zero tripped\n");
+                pandecode_prop("zero5 = 0x%" PRIx64, v->zero5);
+        }
+
+        pandecode_log(".postfix = ");
+        pandecode_replay_vertex_tiler_postfix(&v->postfix, job_no, false);
+
+        pandecode_indent--;
+        pandecode_log("};\n");
+
+        return sizeof(*v);
+}
+
+static int
+pandecode_replay_fragment_job(const struct pandecode_mapped_memory *mem,
+                              mali_ptr payload, int job_no,
+                              bool is_bifrost)
+{
+        const struct mali_payload_fragment *PANDECODE_PTR_VAR(s, mem, payload);
+
+        bool fbd_dumped = false;
+
+        if (!is_bifrost && (s->framebuffer & FBD_TYPE) == MALI_SFBD) {
+                /* Only SFBDs are understood, not MFBDs. We're speculating,
+                 * based on the versioning, kernel code, etc., that the
+                 * difference is between Single FrameBuffer Descriptor and
+                 * Multiple FrameBuffer Descriptor; the change apparently lines
+                 * up with multi-framebuffer support being added (T7xx onwards,
+                 * including Gxx). In any event, there's some field shuffling
+                 * that we haven't looked into yet. */
+
+                pandecode_replay_sfbd(s->framebuffer & FBD_MASK, job_no);
+                fbd_dumped = true;
+        } else if ((s->framebuffer & FBD_TYPE) == MALI_MFBD) {
+                /* We don't know if Bifrost supports SFBDs at all, since the
+                 * driver never uses them. And the format is different from
+                 * Midgard anyway, due to the tiler heap and scratchpad being
+                 * moved out into separate structures, so it's not clear what a
+                 * Bifrost SFBD would even look like without getting an actual
+                 * trace, which appears impossible.
+                 */
+
+                pandecode_replay_mfbd_bfr(s->framebuffer & FBD_MASK, job_no, true);
+                fbd_dumped = true;
+        }
+
+        uintptr_t p = (uintptr_t) s->framebuffer & FBD_MASK;
+        pandecode_log("struct mali_payload_fragment payload_%"PRIx64"_%d = {\n", payload, job_no);
+        pandecode_indent++;
+
+        /* See the comments by the macro definitions for mathematical context
+         * on why this is so weird */
+
+        if (MALI_TILE_COORD_FLAGS(s->max_tile_coord) || MALI_TILE_COORD_FLAGS(s->min_tile_coord))
+                pandecode_msg("Tile coordinate flag missed, replay wrong\n");
+
+        pandecode_prop("min_tile_coord = MALI_COORDINATE_TO_TILE_MIN(%d, %d)",
+                       MALI_TILE_COORD_X(s->min_tile_coord) << MALI_TILE_SHIFT,
+                       MALI_TILE_COORD_Y(s->min_tile_coord) << MALI_TILE_SHIFT);
+
+        pandecode_prop("max_tile_coord = MALI_COORDINATE_TO_TILE_MAX(%d, %d)",
+                       (MALI_TILE_COORD_X(s->max_tile_coord) + 1) << MALI_TILE_SHIFT,
+                       (MALI_TILE_COORD_Y(s->max_tile_coord) + 1) << MALI_TILE_SHIFT);
+
+        /* If the FBD was just decoded, we can refer to it by pointer. If not,
+         * we have to fall back on offsets. */
+
+        const char *fbd_type = s->framebuffer & MALI_MFBD ?
"MALI_MFBD" : "MALI_SFBD"; + + if (fbd_dumped) + pandecode_prop("framebuffer = framebuffer_%d_p | %s", job_no, fbd_type); + else + pandecode_prop("framebuffer = %s | %s", pointer_as_memory_reference(p), fbd_type); + + pandecode_indent--; + pandecode_log("};\n"); + + return sizeof(*s); +} + +static int job_descriptor_number = 0; + +int +pandecode_replay_jc(mali_ptr jc_gpu_va, bool bifrost) +{ + struct mali_job_descriptor_header *h; + + int start_number = 0; + + bool first = true; + bool last_size; + + do { + struct pandecode_mapped_memory *mem = + pandecode_find_mapped_gpu_mem_containing(jc_gpu_va); + + void *payload; + + h = PANDECODE_PTR(mem, jc_gpu_va, struct mali_job_descriptor_header); + + /* On Midgard, for 32-bit jobs except for fragment jobs, the + * high 32-bits of the 64-bit pointer are reused to store + * something else. + */ + int offset = h->job_descriptor_size == MALI_JOB_32 && + h->job_type != JOB_TYPE_FRAGMENT ? 4 : 0; + mali_ptr payload_ptr = jc_gpu_va + sizeof(*h) - offset; + + payload = pandecode_fetch_gpu_mem(mem, payload_ptr, + MALI_PAYLOAD_SIZE); + + int job_no = job_descriptor_number++; + + if (first) + start_number = job_no; + + pandecode_log("struct mali_job_descriptor_header job_%"PRIx64"_%d = {\n", jc_gpu_va, job_no); + pandecode_indent++; + + pandecode_prop("job_type = %s", pandecode_job_type_name(h->job_type)); + + /* Save for next job fixing */ + last_size = h->job_descriptor_size; + + if (h->job_descriptor_size) + pandecode_prop("job_descriptor_size = %d", h->job_descriptor_size); + + if (h->exception_status != 0x1) + pandecode_prop("exception_status = %x (source ID: 0x%x access: 0x%x exception: 0x%x)", + h->exception_status, + (h->exception_status >> 16) & 0xFFFF, + (h->exception_status >> 8) & 0x3, + h->exception_status & 0xFF); + + if (h->first_incomplete_task) + pandecode_prop("first_incomplete_task = %d", h->first_incomplete_task); + + if (h->fault_pointer) + pandecode_prop("fault_pointer = 0x%" PRIx64, h->fault_pointer); + + if (h->job_barrier) + pandecode_prop("job_barrier = %d", h->job_barrier); + + pandecode_prop("job_index = %d", h->job_index); + + if (h->unknown_flags) + pandecode_prop("unknown_flags = %d", h->unknown_flags); + + if (h->job_dependency_index_1) + pandecode_prop("job_dependency_index_1 = %d", h->job_dependency_index_1); + + if (h->job_dependency_index_2) + pandecode_prop("job_dependency_index_2 = %d", h->job_dependency_index_2); + + pandecode_indent--; + pandecode_log("};\n"); + + /* Do not touch the field yet -- decode the payload first, and + * don't touch that either. This is essential for the uploads + * to occur in sequence and therefore be dynamically allocated + * correctly. Do note the size, however, for that related + * reason. 
*/ + + switch (h->job_type) { + case JOB_TYPE_SET_VALUE: { + struct mali_payload_set_value *s = payload; + pandecode_log("struct mali_payload_set_value payload_%"PRIx64"_%d = {\n", payload_ptr, job_no); + pandecode_indent++; + MEMORY_PROP(s, out); + pandecode_prop("unknown = 0x%" PRIX64, s->unknown); + pandecode_indent--; + pandecode_log("};\n"); + + break; + } + + case JOB_TYPE_TILER: + case JOB_TYPE_VERTEX: + case JOB_TYPE_COMPUTE: + if (bifrost) { + if (h->job_type == JOB_TYPE_TILER) + pandecode_replay_tiler_job_bfr(h, mem, payload_ptr, job_no); + else + pandecode_replay_vertex_job_bfr(h, mem, payload_ptr, job_no); + } else + pandecode_replay_vertex_or_tiler_job_mdg(h, mem, payload_ptr, job_no); + + break; + + case JOB_TYPE_FRAGMENT: + pandecode_replay_fragment_job(mem, payload_ptr, job_no, bifrost); + break; + + default: + break; + } + + /* Handle linkage */ + + if (!first) { + pandecode_log("((struct mali_job_descriptor_header *) (uintptr_t) job_%d_p)->", job_no - 1); + + if (last_size) + pandecode_log_cont("next_job_64 = job_%d_p;\n\n", job_no); + else + pandecode_log_cont("next_job_32 = (u32) (uintptr_t) job_%d_p;\n\n", job_no); + } + + first = false; + + } while ((jc_gpu_va = h->job_descriptor_size ? h->next_job_64 : h->next_job_32)); + + return start_number; +} diff --git a/src/panfrost/pandecode/decode.h b/src/panfrost/pandecode/decode.h new file mode 100644 index 00000000000..9c788bbabcc --- /dev/null +++ b/src/panfrost/pandecode/decode.h @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2017-2019 Lyude Paul + * Copyright (C) 2017-2019 Alyssa Rosenzweig + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ */
+
+#ifndef __PAN_DECODE_H__
+#define __PAN_DECODE_H__
+
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <inttypes.h>
+#include <assert.h>
+#include <panfrost-job.h>
+#include "util/list.h"
+
+struct pandecode_mapped_memory {
+        struct list_head node;
+
+        size_t length;
+
+        void *addr;
+        mali_ptr gpu_va;
+
+        char name[32];
+};
+
+void pandecode_initialize(void);
+
+char *pointer_as_memory_reference(mali_ptr ptr);
+
+struct pandecode_mapped_memory *pandecode_find_mapped_gpu_mem_containing(mali_ptr addr);
+
+void
+pandecode_inject_mmap(mali_ptr gpu_va, void *cpu, unsigned sz, const char *name);
+
+static inline void *
+__pandecode_fetch_gpu_mem(const struct pandecode_mapped_memory *mem,
+                          mali_ptr gpu_va, size_t size,
+                          int line, const char *filename)
+{
+        if (!mem)
+                mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
+
+        if (!mem) {
+                fprintf(stderr, "Access to unknown memory %" PRIx64 " in %s:%d\n",
+                        gpu_va, filename, line);
+                assert(0);
+        }
+
+        assert(mem);
+        assert(size + (gpu_va - mem->gpu_va) <= mem->length);
+
+        return mem->addr + gpu_va - mem->gpu_va;
+}
+
+#define pandecode_fetch_gpu_mem(mem, gpu_va, size) \
+        __pandecode_fetch_gpu_mem(mem, gpu_va, size, __LINE__, __FILE__)
+
+/* Returns a validated pointer to mapped GPU memory with the given pointer type,
+ * size automatically determined from the pointer type
+ */
+#define PANDECODE_PTR(mem, gpu_va, type) \
+        ((type*)(__pandecode_fetch_gpu_mem(mem, gpu_va, sizeof(type), \
+                                __LINE__, __FILE__)))
+
+/* Usage: <variable type> PANDECODE_PTR_VAR(name, mem, gpu_va) */
+#define PANDECODE_PTR_VAR(name, mem, gpu_va) \
+        name = __pandecode_fetch_gpu_mem(mem, gpu_va, sizeof(*name), \
+                                __LINE__, __FILE__)
+
+/* Common entrypoint */
+int pandecode_replay_jc(mali_ptr jc_gpu_va, bool bifrost);
+
+#endif /* __PAN_DECODE_H__ */
diff --git a/src/panfrost/pandecode/meson.build b/src/panfrost/pandecode/meson.build
new file mode 100644
index 00000000000..2c341a58cc4
--- /dev/null
+++ b/src/panfrost/pandecode/meson.build
@@ -0,0 +1,35 @@
+# Copyright © 2018 Rob Clark
+# Copyright © 2019 Collabora
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
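+
+# Note: this is a static convenience library with build_by_default set to
+# false, so it is only compiled when a consumer (such as the Panfrost driver's
+# pandecode-based tracing) pulls it in as a dependency.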
+ +libpanfrost_decode_files = files( + 'pan_pretty_print.c', + 'common.c', + 'decode.c', +) + +libpanfrost_decode = static_library( + 'panfrost_decode', + [libpanfrost_decode_files], + include_directories : [inc_common, inc_panfrost], + c_args : [c_vis_args, no_override_init_args], + cpp_args : [cpp_vis_args], + build_by_default : false, +) diff --git a/src/panfrost/pandecode/pan_pretty_print.c b/src/panfrost/pandecode/pan_pretty_print.c new file mode 100644 index 00000000000..c59bfa1da5a --- /dev/null +++ b/src/panfrost/pandecode/pan_pretty_print.c @@ -0,0 +1,226 @@ +/* + * © Copyright 2017-2098 The Panfrost Communiy + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "pan_pretty_print.h" + +#include <stdio.h> +#include <string.h> +#include <assert.h> + +/* Some self-contained prettyprinting functions shared between pandecode and + * the main driver */ + +#define DEFINE_CASE(name) case MALI_## name: return "MALI_" #name +char *pandecode_format_name(enum mali_format format) +{ + static char unk_format_str[5]; + + switch (format) { + DEFINE_CASE(RGB565); + DEFINE_CASE(RGB5_A1_UNORM); + DEFINE_CASE(RGB10_A2_UNORM); + DEFINE_CASE(RGB10_A2_SNORM); + DEFINE_CASE(RGB10_A2UI); + DEFINE_CASE(RGB10_A2I); + DEFINE_CASE(NV12); + DEFINE_CASE(Z32_UNORM); + DEFINE_CASE(R32_FIXED); + DEFINE_CASE(RG32_FIXED); + DEFINE_CASE(RGB32_FIXED); + DEFINE_CASE(RGBA32_FIXED); + DEFINE_CASE(R11F_G11F_B10F); + DEFINE_CASE(R9F_G9F_B9F_E5F); + DEFINE_CASE(VARYING_POS); + DEFINE_CASE(VARYING_DISCARD); + + DEFINE_CASE(R8_SNORM); + DEFINE_CASE(R16_SNORM); + DEFINE_CASE(R32_SNORM); + DEFINE_CASE(RG8_SNORM); + DEFINE_CASE(RG16_SNORM); + DEFINE_CASE(RG32_SNORM); + DEFINE_CASE(RGB8_SNORM); + DEFINE_CASE(RGB16_SNORM); + DEFINE_CASE(RGB32_SNORM); + DEFINE_CASE(RGBA8_SNORM); + DEFINE_CASE(RGBA16_SNORM); + DEFINE_CASE(RGBA32_SNORM); + + DEFINE_CASE(R8UI); + DEFINE_CASE(R16UI); + DEFINE_CASE(R32UI); + DEFINE_CASE(RG8UI); + DEFINE_CASE(RG16UI); + DEFINE_CASE(RG32UI); + DEFINE_CASE(RGB8UI); + DEFINE_CASE(RGB16UI); + DEFINE_CASE(RGB32UI); + DEFINE_CASE(RGBA8UI); + DEFINE_CASE(RGBA16UI); + DEFINE_CASE(RGBA32UI); + + DEFINE_CASE(R8_UNORM); + DEFINE_CASE(R16_UNORM); + DEFINE_CASE(R32_UNORM); + DEFINE_CASE(R32F); + DEFINE_CASE(RG8_UNORM); + DEFINE_CASE(RG16_UNORM); + DEFINE_CASE(RG32_UNORM); + DEFINE_CASE(RG32F); + DEFINE_CASE(RGB8_UNORM); + DEFINE_CASE(RGB16_UNORM); + DEFINE_CASE(RGB32_UNORM); + DEFINE_CASE(RGB32F); + DEFINE_CASE(RGBA4_UNORM); + 
DEFINE_CASE(RGBA8_UNORM); + DEFINE_CASE(RGBA16_UNORM); + DEFINE_CASE(RGBA32_UNORM); + DEFINE_CASE(RGBA32F); + + DEFINE_CASE(R8I); + DEFINE_CASE(R16I); + DEFINE_CASE(R32I); + DEFINE_CASE(RG8I); + DEFINE_CASE(R16F); + DEFINE_CASE(RG16I); + DEFINE_CASE(RG32I); + DEFINE_CASE(RG16F); + DEFINE_CASE(RGB8I); + DEFINE_CASE(RGB16I); + DEFINE_CASE(RGB32I); + DEFINE_CASE(RGB16F); + DEFINE_CASE(RGBA8I); + DEFINE_CASE(RGBA16I); + DEFINE_CASE(RGBA32I); + DEFINE_CASE(RGBA16F); + + DEFINE_CASE(RGBA4); + DEFINE_CASE(RGBA8_2); + DEFINE_CASE(RGB10_A2_2); + default: + snprintf(unk_format_str, sizeof(unk_format_str), "0x%02x", format); + return unk_format_str; + } +} + +#undef DEFINE_CASE + +/* Helper to dump fixed-function blend part for debugging */ + +static const char * +panfrost_factor_name(enum mali_dominant_factor factor) +{ + switch (factor) { + case MALI_DOMINANT_UNK0: + return "unk0"; + + case MALI_DOMINANT_ZERO: + return "zero"; + + case MALI_DOMINANT_SRC_COLOR: + return "source color"; + + case MALI_DOMINANT_DST_COLOR: + return "dest color"; + + case MALI_DOMINANT_UNK4: + return "unk4"; + + case MALI_DOMINANT_SRC_ALPHA: + return "source alpha"; + + case MALI_DOMINANT_DST_ALPHA: + return "dest alpha"; + + case MALI_DOMINANT_CONSTANT: + return "constant"; + } + + return "unreachable"; +} + +static const char * +panfrost_modifier_name(enum mali_blend_modifier mod) +{ + switch (mod) { + case MALI_BLEND_MOD_UNK0: + return "unk0"; + + case MALI_BLEND_MOD_NORMAL: + return "normal"; + + case MALI_BLEND_MOD_SOURCE_ONE: + return "source one"; + + case MALI_BLEND_MOD_DEST_ONE: + return "dest one"; + } + + return "unreachable"; +} + +static void +panfrost_print_fixed_part(const char *name, unsigned u) +{ + struct mali_blend_mode part; + memcpy(&part, &u, sizeof(part)); + + printf("%s blend mode (%X):\n", name, u); + + printf(" %s dominant:\n", + (part.dominant == MALI_BLEND_DOM_SOURCE) ? "source" : "destination"); + + printf(" %s\n", panfrost_factor_name(part.dominant_factor)); + + if (part.complement_dominant) + printf(" complement\n"); + + + printf(" nondominant %s\n", + (part.nondominant_mode == MALI_BLEND_NON_MIRROR) ? "mirror" : "zero"); + + + printf(" mode: %s\n", panfrost_modifier_name(part.clip_modifier)); + + if (part.negate_source) printf(" negate source\n"); + + if (part.negate_dest) printf(" negate dest\n"); + + assert(!(part.unused_0 || part.unused_1)); +} + +void +panfrost_print_blend_equation(struct mali_blend_equation eq) +{ + printf("\n"); + panfrost_print_fixed_part("RGB", eq.rgb_mode); + panfrost_print_fixed_part("Alpha", eq.alpha_mode); + + assert(!eq.zero1); + + printf("Mask: %s%s%s%s\n", + (eq.color_mask & MALI_MASK_R) ? "R" : "", + (eq.color_mask & MALI_MASK_G) ? "G" : "", + (eq.color_mask & MALI_MASK_B) ? "B" : "", + (eq.color_mask & MALI_MASK_A) ? 
"A" : ""); +} diff --git a/src/panfrost/pandecode/pan_pretty_print.h b/src/panfrost/pandecode/pan_pretty_print.h new file mode 100644 index 00000000000..22dca4abbf6 --- /dev/null +++ b/src/panfrost/pandecode/pan_pretty_print.h @@ -0,0 +1,32 @@ +/* + * © Copyright 2017-2098 The Panfrost Communiy + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __PAN_PRETTY_H +#define __PAN_PRETTY_H + +#include "panfrost-job.h" + +char *pandecode_format_name(enum mali_format format); +void panfrost_print_blend_equation(struct mali_blend_equation eq); + +#endif |