author    | Alyssa Rosenzweig <[email protected]> | 2019-07-10 10:33:24 -0700
committer | Alyssa Rosenzweig <[email protected]> | 2019-07-10 10:43:23 -0700
commit    | ec2a59cd7aa42652645e76e29a72335370c80e50 (patch)
tree      | 08e75a12d073cc627307bc59ab3a8d057244b68e /src/panfrost/midgard
parent    | a2d0ea92ba752c62e59aa681acda7b97fc86d100 (diff)
panfrost: Move non-Gallium files outside of Gallium
In preparation for a Panfrost-based non-Gallium driver (maybe
Vulkan...?), hoist everything except for the Gallium driver into a
shared src/panfrost. Practically, that means the compilers, the headers,
and pandecode.
Signed-off-by: Alyssa Rosenzweig <[email protected]>
Diffstat (limited to 'src/panfrost/midgard')
21 files changed, 7988 insertions, 0 deletions
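Before diving into the diff itself, a note on orientation: compiler.h (the first file below) defines the in-memory MIR (Midgard IR) used by the whole backend, together with list-based iterators for walking blocks and instructions. As a minimal sketch only, a pass over the IR built from those iterators and the helpers in helpers.h looks roughly like this; `count_moves` is a hypothetical example, not part of the commit, and it assumes the in-tree include paths:

```c
/* Hypothetical example, not part of this commit: count fmov/imov
 * instructions by walking the MIR with the iterators from compiler.h.
 * Assumes the in-tree include paths used by the Midgard backend. */

#include "compiler.h"   /* compiler_context, midgard_instruction, mir_foreach_* */
#include "helpers.h"    /* TAG_ALU_4, OP_IS_MOVE */

static unsigned
count_moves(compiler_context *ctx)
{
        unsigned moves = 0;

        /* Outer loop: every midgard_block in the shader */
        mir_foreach_block(ctx, block) {
                /* Inner loop: every instruction emitted into that block */
                mir_foreach_instr_in_block(block, ins) {
                        /* Only ALU words carry fmov/imov opcodes */
                        if (ins->type == TAG_ALU_4 && OP_IS_MOVE(ins->alu.op))
                                moves++;
                }
        }

        return moves;
}
```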
diff --git a/src/panfrost/midgard/compiler.h b/src/panfrost/midgard/compiler.h new file mode 100644 index 00000000000..79fe7dfc78a --- /dev/null +++ b/src/panfrost/midgard/compiler.h @@ -0,0 +1,456 @@ +/* + * Copyright (C) 2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MDG_COMPILER_H +#define _MDG_COMPILER_H + +#include "midgard.h" +#include "helpers.h" +#include "midgard_compile.h" + +#include "util/hash_table.h" +#include "util/u_dynarray.h" +#include "util/set.h" +#include "util/list.h" + +#include "main/mtypes.h" +#include "compiler/nir_types.h" +#include "compiler/nir/nir.h" + +/* Forward declare */ +struct midgard_block; + +/* Target types. Defaults to TARGET_GOTO (the type corresponding directly to + * the hardware), hence why that must be zero. TARGET_DISCARD signals this + * instruction is actually a discard op. */ + +#define TARGET_GOTO 0 +#define TARGET_BREAK 1 +#define TARGET_CONTINUE 2 +#define TARGET_DISCARD 3 + +typedef struct midgard_branch { + /* If conditional, the condition is specified in r31.w */ + bool conditional; + + /* For conditionals, if this is true, we branch on FALSE. If false, we branch on TRUE. */ + bool invert_conditional; + + /* Branch targets: the start of a block, the start of a loop (continue), the end of a loop (break). Value is one of TARGET_ */ + unsigned target_type; + + /* The actual target */ + union { + int target_block; + int target_break; + int target_continue; + }; +} midgard_branch; + +/* Instruction arguments represented as block-local SSA indices, rather than + * registers. Negative values mean unused. */ + +typedef struct { + int src0; + int src1; + int dest; + + /* src1 is -not- SSA but instead a 16-bit inline constant to be smudged + * in. Only valid for ALU ops. */ + bool inline_constant; +} ssa_args; + +/* Generic in-memory data type repesenting a single logical instruction, rather + * than a single instruction group. This is the preferred form for code gen. + * Multiple midgard_insturctions will later be combined during scheduling, + * though this is not represented in this structure. Its format bridges + * the low-level binary representation with the higher level semantic meaning. + * + * Notably, it allows registers to be specified as block local SSA, for code + * emitted before the register allocation pass. 
+ */ + +typedef struct midgard_instruction { + /* Must be first for casting */ + struct list_head link; + + unsigned type; /* ALU, load/store, texture */ + + /* If the register allocator has not run yet... */ + ssa_args ssa_args; + + /* Special fields for an ALU instruction */ + midgard_reg_info registers; + + /* I.e. (1 << alu_bit) */ + int unit; + + /* When emitting bundle, should this instruction have a break forced + * before it? Used for r31 writes which are valid only within a single + * bundle and *need* to happen as early as possible... this is a hack, + * TODO remove when we have a scheduler */ + bool precede_break; + + bool has_constants; + float constants[4]; + uint16_t inline_constant; + bool has_blend_constant; + + bool compact_branch; + bool writeout; + bool prepacked_branch; + + /* Masks in a saneish format. One bit per channel, not packed fancy. + * Use this instead of the op specific ones, and switch over at emit + * time */ + uint16_t mask; + + union { + midgard_load_store_word load_store; + midgard_vector_alu alu; + midgard_texture_word texture; + midgard_branch_extended branch_extended; + uint16_t br_compact; + + /* General branch, rather than packed br_compact. Higher level + * than the other components */ + midgard_branch branch; + }; +} midgard_instruction; + +typedef struct midgard_block { + /* Link to next block. Must be first for mir_get_block */ + struct list_head link; + + /* List of midgard_instructions emitted for the current block */ + struct list_head instructions; + + bool is_scheduled; + + /* List of midgard_bundles emitted (after the scheduler has run) */ + struct util_dynarray bundles; + + /* Number of quadwords _actually_ emitted, as determined after scheduling */ + unsigned quadword_count; + + /* Successors: always one forward (the block after us), maybe + * one backwards (for a backward branch). No need for a second + * forward, since graph traversal would get there eventually + * anyway */ + struct midgard_block *successors[2]; + unsigned nr_successors; + + /* The successors pointer form a graph, and in the case of + * complex control flow, this graph has a cycles. To aid + * traversal during liveness analysis, we have a visited? + * boolean for passes to use as they see fit, provided they + * clean up later */ + bool visited; +} midgard_block; + +typedef struct midgard_bundle { + /* Tag for the overall bundle */ + int tag; + + /* Instructions contained by the bundle */ + int instruction_count; + midgard_instruction *instructions[5]; + + /* Bundle-wide ALU configuration */ + int padding; + int control; + bool has_embedded_constants; + float constants[4]; + bool has_blend_constant; +} midgard_bundle; + +typedef struct compiler_context { + nir_shader *nir; + gl_shader_stage stage; + + /* Is internally a blend shader? 
Depends on stage == FRAGMENT */ + bool is_blend; + + /* Tracking for blend constant patching */ + int blend_constant_offset; + + /* Current NIR function */ + nir_function *func; + + /* Unordered list of midgard_blocks */ + int block_count; + struct list_head blocks; + + midgard_block *initial_block; + midgard_block *previous_source_block; + midgard_block *final_block; + + /* List of midgard_instructions emitted for the current block */ + midgard_block *current_block; + + /* The current "depth" of the loop, for disambiguating breaks/continues + * when using nested loops */ + int current_loop_depth; + + /* Total number of loops for shader-db */ + unsigned loop_count; + + /* Constants which have been loaded, for later inlining */ + struct hash_table_u64 *ssa_constants; + + /* SSA values / registers which have been aliased. Naively, these + * demand a fmov output; instead, we alias them in a later pass to + * avoid the wasted op. + * + * A note on encoding: to avoid dynamic memory management here, rather + * than ampping to a pointer, we map to the source index; the key + * itself is just the destination index. */ + + struct hash_table_u64 *ssa_to_alias; + struct set *leftover_ssa_to_alias; + + /* Actual SSA-to-register for RA */ + struct hash_table_u64 *ssa_to_register; + + /* Mapping of hashes computed from NIR indices to the sequential temp indices ultimately used in MIR */ + struct hash_table_u64 *hash_to_temp; + int temp_count; + int max_hash; + + /* Just the count of the max register used. Higher count => higher + * register pressure */ + int work_registers; + + /* Used for cont/last hinting. Increase when a tex op is added. + * Decrease when a tex op is removed. */ + int texture_op_count; + + /* Mapping of texture register -> SSA index for unaliasing */ + int texture_index[2]; + + /* If any path hits a discard instruction */ + bool can_discard; + + /* The number of uniforms allowable for the fast path */ + int uniform_cutoff; + + /* Count of instructions emitted from NIR overall, across all blocks */ + int instruction_count; + + /* Alpha ref value passed in */ + float alpha_ref; + + /* The index corresponding to the fragment output */ + unsigned fragment_output; + + /* The mapping of sysvals to uniforms, the count, and the off-by-one inverse */ + unsigned sysvals[MAX_SYSVAL_COUNT]; + unsigned sysval_count; + struct hash_table_u64 *sysval_to_id; +} compiler_context; + +/* Helpers for manipulating the above structures (forming the driver IR) */ + +/* Append instruction to end of current block */ + +static inline midgard_instruction * +mir_upload_ins(struct midgard_instruction ins) +{ + midgard_instruction *heap = malloc(sizeof(ins)); + memcpy(heap, &ins, sizeof(ins)); + return heap; +} + +static inline void +emit_mir_instruction(struct compiler_context *ctx, struct midgard_instruction ins) +{ + list_addtail(&(mir_upload_ins(ins))->link, &ctx->current_block->instructions); +} + +static inline void +mir_insert_instruction_before(struct midgard_instruction *tag, struct midgard_instruction ins) +{ + list_addtail(&(mir_upload_ins(ins))->link, &tag->link); +} + +static inline void +mir_remove_instruction(struct midgard_instruction *ins) +{ + list_del(&ins->link); +} + +static inline midgard_instruction* +mir_prev_op(struct midgard_instruction *ins) +{ + return list_last_entry(&(ins->link), midgard_instruction, link); +} + +static inline midgard_instruction* +mir_next_op(struct midgard_instruction *ins) +{ + return list_first_entry(&(ins->link), midgard_instruction, link); +} + +#define 
mir_foreach_block(ctx, v) \ + list_for_each_entry(struct midgard_block, v, &ctx->blocks, link) + +#define mir_foreach_block_from(ctx, from, v) \ + list_for_each_entry_from(struct midgard_block, v, from, &ctx->blocks, link) + +#define mir_foreach_instr(ctx, v) \ + list_for_each_entry(struct midgard_instruction, v, &ctx->current_block->instructions, link) + +#define mir_foreach_instr_safe(ctx, v) \ + list_for_each_entry_safe(struct midgard_instruction, v, &ctx->current_block->instructions, link) + +#define mir_foreach_instr_in_block(block, v) \ + list_for_each_entry(struct midgard_instruction, v, &block->instructions, link) + +#define mir_foreach_instr_in_block_safe(block, v) \ + list_for_each_entry_safe(struct midgard_instruction, v, &block->instructions, link) + +#define mir_foreach_instr_in_block_safe_rev(block, v) \ + list_for_each_entry_safe_rev(struct midgard_instruction, v, &block->instructions, link) + +#define mir_foreach_instr_in_block_from(block, v, from) \ + list_for_each_entry_from(struct midgard_instruction, v, from, &block->instructions, link) + +#define mir_foreach_instr_in_block_from_rev(block, v, from) \ + list_for_each_entry_from_rev(struct midgard_instruction, v, from, &block->instructions, link) + +#define mir_foreach_bundle_in_block(block, v) \ + util_dynarray_foreach(&block->bundles, midgard_bundle, v) + +#define mir_foreach_instr_global(ctx, v) \ + mir_foreach_block(ctx, v_block) \ + mir_foreach_instr_in_block(v_block, v) + + +static inline midgard_instruction * +mir_last_in_block(struct midgard_block *block) +{ + return list_last_entry(&block->instructions, struct midgard_instruction, link); +} + +static inline midgard_block * +mir_get_block(compiler_context *ctx, int idx) +{ + struct list_head *lst = &ctx->blocks; + + while ((idx--) + 1) + lst = lst->next; + + return (struct midgard_block *) lst; +} + +static inline bool +mir_is_alu_bundle(midgard_bundle *bundle) +{ + return IS_ALU(bundle->tag); +} + +/* MIR manipulation */ + +void mir_rewrite_index(compiler_context *ctx, unsigned old, unsigned new); +void mir_rewrite_index_src(compiler_context *ctx, unsigned old, unsigned new); +void mir_rewrite_index_dst(compiler_context *ctx, unsigned old, unsigned new); + +/* MIR printing */ + +void mir_print_instruction(midgard_instruction *ins); +void mir_print_bundle(midgard_bundle *ctx); +void mir_print_block(midgard_block *block); +void mir_print_shader(compiler_context *ctx); + +/* MIR goodies */ + +static const midgard_vector_alu_src blank_alu_src = { + .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), +}; + +static const midgard_vector_alu_src blank_alu_src_xxxx = { + .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X), +}; + +static const midgard_scalar_alu_src blank_scalar_alu_src = { + .full = true +}; + +/* Used for encoding the unused source of 1-op instructions */ +static const midgard_vector_alu_src zero_alu_src = { 0 }; + +/* 'Intrinsic' move for aliasing */ + +static inline midgard_instruction +v_mov(unsigned src, midgard_vector_alu_src mod, unsigned dest) +{ + midgard_instruction ins = { + .type = TAG_ALU_4, + .mask = 0xF, + .ssa_args = { + .src0 = SSA_UNUSED_1, + .src1 = src, + .dest = dest, + }, + .alu = { + .op = midgard_alu_op_imov, + .reg_mode = midgard_reg_mode_32, + .dest_override = midgard_dest_override_none, + .outmod = midgard_outmod_int_wrap, + .src1 = vector_alu_srco_unsigned(zero_alu_src), + .src2 = vector_alu_srco_unsigned(mod) + }, + }; + + return ins; +} + +/* Scheduling */ + +void 
schedule_program(compiler_context *ctx); + +/* Register allocation */ + +struct ra_graph; + +struct ra_graph* allocate_registers(compiler_context *ctx); +void install_registers(compiler_context *ctx, struct ra_graph *g); +bool mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src); +bool mir_has_multiple_writes(compiler_context *ctx, int src); + +void mir_create_pipeline_registers(compiler_context *ctx); + +/* Final emission */ + +void emit_binary_bundle( + compiler_context *ctx, + midgard_bundle *bundle, + struct util_dynarray *emission, + int next_tag); + +/* NIR stuff */ + +bool +nir_undef_to_zero(nir_shader *shader); + +#endif diff --git a/src/panfrost/midgard/cppwrap.cpp b/src/panfrost/midgard/cppwrap.cpp new file mode 100644 index 00000000000..cf2ca3b7a11 --- /dev/null +++ b/src/panfrost/midgard/cppwrap.cpp @@ -0,0 +1,9 @@ +struct exec_list; + +bool do_mat_op_to_vec(struct exec_list *instructions); + +extern "C" { + bool c_do_mat_op_to_vec(struct exec_list *instructions) { + return do_mat_op_to_vec(instructions); + } +}; diff --git a/src/panfrost/midgard/disassemble.c b/src/panfrost/midgard/disassemble.c new file mode 100644 index 00000000000..bed803162f3 --- /dev/null +++ b/src/panfrost/midgard/disassemble.c @@ -0,0 +1,1317 @@ +/* Author(s): + * Connor Abbott + * Alyssa Rosenzweig + * + * Copyright (c) 2013 Connor Abbott ([email protected]) + * Copyright (c) 2018 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <stdio.h> +#include <stdint.h> +#include <assert.h> +#include <inttypes.h> +#include <ctype.h> +#include <string.h> +#include "midgard.h" +#include "midgard-parse.h" +#include "midgard_ops.h" +#include "disassemble.h" +#include "helpers.h" +#include "util/half_float.h" +#include "util/u_math.h" + +#define DEFINE_CASE(define, str) case define: { printf(str); break; } + +static bool is_instruction_int = false; + +/* Prints a short form of the tag for branching, the minimum needed to be + * legible and unambiguous */ + +static void +print_tag_short(unsigned tag) +{ + switch (midgard_word_types[tag]) { + case midgard_word_type_texture: + printf("tex/%X", tag); + break; + + case midgard_word_type_load_store: + printf("ldst"); + break; + + case midgard_word_type_alu: + printf("alu%d/%X", midgard_word_size[tag], tag); + break; + + default: + printf("%s%X", (tag > 0) ? 
"" : "unk", tag); + break; + } +} + +static void +print_alu_opcode(midgard_alu_op op) +{ + bool int_op = false; + + if (alu_opcode_props[op].name) { + printf("%s", alu_opcode_props[op].name); + + int_op = midgard_is_integer_op(op); + } else + printf("alu_op_%02X", op); + + /* For constant analysis */ + is_instruction_int = int_op; +} + +static void +print_ld_st_opcode(midgard_load_store_op op) +{ + if (load_store_opcode_names[op]) + printf("%s", load_store_opcode_names[op]); + else + printf("ldst_op_%02X", op); +} + +static bool is_embedded_constant_half = false; +static bool is_embedded_constant_int = false; + +static char +prefix_for_bits(unsigned bits) +{ + switch (bits) { + case 8: + return 'q'; + case 16: + return 'h'; + case 64: + return 'd'; + default: + return 0; + } +} + +static void +print_reg(unsigned reg, unsigned bits) +{ + /* Perform basic static analysis for expanding constants correctly */ + + if (reg == 26) { + is_embedded_constant_int = is_instruction_int; + is_embedded_constant_half = (bits < 32); + } + + char prefix = prefix_for_bits(bits); + + if (prefix) + putchar(prefix); + + printf("r%u", reg); +} + +static char *outmod_names_float[4] = { + "", + ".pos", + ".unk2", + ".sat" +}; + +static char *outmod_names_int[4] = { + ".isat", + ".usat", + "", + ".hi" +}; + +static char *srcmod_names_int[4] = { + "sext(", + "zext(", + "", + "(" +}; + +static void +print_outmod(unsigned outmod, bool is_int) +{ + printf("%s", is_int ? outmod_names_int[outmod] : + outmod_names_float[outmod]); +} + +static void +print_quad_word(uint32_t *words, unsigned tabs) +{ + unsigned i; + + for (i = 0; i < 4; i++) + printf("0x%08X%s ", words[i], i == 3 ? "" : ","); + + printf("\n"); +} + +static const char components[16] = "xyzwefghijklmnop"; + +/* Helper to print 4 chars of a swizzle */ +static void +print_swizzle_helper(unsigned swizzle, bool upper) +{ + for (unsigned i = 0; i < 4; ++i) { + unsigned c = (swizzle >> (i * 2)) & 3; + c += upper*4; + printf("%c", components[c]); + } +} + +/* Helper to print 8 chars of a swizzle, duplicating over */ +static void +print_swizzle_helper_8(unsigned swizzle, bool upper) +{ + for (unsigned i = 0; i < 4; ++i) { + unsigned c = (swizzle >> (i * 2)) & 3; + c *= 2; + c += upper*8; + printf("%c%c", components[c], components[c+1]); + } +} + +static void +print_swizzle_vec16(unsigned swizzle, bool rep_high, bool rep_low, + midgard_dest_override override) +{ + printf("."); + + if (override == midgard_dest_override_upper) { + if (rep_high) + printf(" /* rep_high */ "); + if (rep_low) + printf(" /* rep_low */ "); + + if (!rep_high && rep_low) + print_swizzle_helper_8(swizzle, true); + else + print_swizzle_helper_8(swizzle, false); + } else { + print_swizzle_helper_8(swizzle, rep_high & 1); + print_swizzle_helper_8(swizzle, !rep_low & 1); + } +} + +static void +print_swizzle_vec8(unsigned swizzle, bool rep_high, bool rep_low) +{ + printf("."); + + print_swizzle_helper(swizzle, rep_high & 1); + print_swizzle_helper(swizzle, !rep_low & 1); +} + +static void +print_swizzle_vec4(unsigned swizzle, bool rep_high, bool rep_low) +{ + if (rep_high) + printf(" /* rep_high */ "); + if (rep_low) + printf(" /* rep_low */ "); + + if (swizzle == 0xE4) return; /* xyzw */ + + printf("."); + print_swizzle_helper(swizzle, 0); +} +static void +print_swizzle_vec2(unsigned swizzle, bool rep_high, bool rep_low) +{ + if (rep_high) + printf(" /* rep_high */ "); + if (rep_low) + printf(" /* rep_low */ "); + + if (swizzle == 0xE4) return; /* XY */ + + printf("."); + + for (unsigned i = 0; i < 
4; i += 2) { + unsigned a = (swizzle >> (i * 2)) & 3; + unsigned b = (swizzle >> ((i+1) * 2)) & 3; + + /* Normally we're adjacent, but if there's an issue, don't make + * it ambiguous */ + + if (a & 0x1) + printf("[%c%c]", components[a], components[b]); + else if (a == b) + printf("%c", components[a >> 1]); + else if (b == (a + 1)) + printf("%c", "XY"[a >> 1]); + else + printf("[%c%c]", components[a], components[b]); + } +} + +static int +bits_for_mode(midgard_reg_mode mode) +{ + switch (mode) { + case midgard_reg_mode_8: + return 8; + case midgard_reg_mode_16: + return 16; + case midgard_reg_mode_32: + return 32; + case midgard_reg_mode_64: + return 64; + default: + return 0; + } +} + +static int +bits_for_mode_halved(midgard_reg_mode mode, bool half) +{ + unsigned bits = bits_for_mode(mode); + + if (half) + bits >>= 1; + + return bits; +} + +static void +print_vector_src(unsigned src_binary, + midgard_reg_mode mode, unsigned reg, + midgard_dest_override override, bool is_int) +{ + midgard_vector_alu_src *src = (midgard_vector_alu_src *)&src_binary; + + /* Modifiers change meaning depending on the op's context */ + + midgard_int_mod int_mod = src->mod; + + if (is_int) { + printf("%s", srcmod_names_int[int_mod]); + } else { + if (src->mod & MIDGARD_FLOAT_MOD_NEG) + printf("-"); + + if (src->mod & MIDGARD_FLOAT_MOD_ABS) + printf("abs("); + } + + //register + unsigned bits = bits_for_mode_halved(mode, src->half); + print_reg(reg, bits); + + //swizzle + if (bits == 16) + print_swizzle_vec8(src->swizzle, src->rep_high, src->rep_low); + else if (bits == 8) + print_swizzle_vec16(src->swizzle, src->rep_high, src->rep_low, override); + else if (bits == 32) + print_swizzle_vec4(src->swizzle, src->rep_high, src->rep_low); + else if (bits == 64) + print_swizzle_vec2(src->swizzle, src->rep_high, src->rep_low); + + /* Since we wrapped with a function-looking thing */ + + if (is_int && int_mod == midgard_int_shift) + printf(") << %d", bits); + else if ((is_int && (int_mod != midgard_int_normal)) + || (!is_int && src->mod & MIDGARD_FLOAT_MOD_ABS)) + printf(")"); +} + +static uint16_t +decode_vector_imm(unsigned src2_reg, unsigned imm) +{ + uint16_t ret; + ret = src2_reg << 11; + ret |= (imm & 0x7) << 8; + ret |= (imm >> 3) & 0xFF; + return ret; +} + +static void +print_immediate(uint16_t imm) +{ + if (is_instruction_int) + printf("#%d", imm); + else + printf("#%g", _mesa_half_to_float(imm)); +} + +static unsigned +print_dest(unsigned reg, midgard_reg_mode mode, midgard_dest_override override) +{ + /* Depending on the mode and override, we determine the type of + * destination addressed. Absent an override, we address just the + * type of the operation itself */ + + unsigned bits = bits_for_mode(mode); + + if (override != midgard_dest_override_none) + bits /= 2; + + print_reg(reg, bits); + + return bits; +} + +static void +print_mask_vec16(uint8_t mask, midgard_dest_override override) +{ + printf("."); + + if (override == midgard_dest_override_none) { + for (unsigned i = 0; i < 8; i++) { + if (mask & (1 << i)) + printf("%c%c", + components[i*2 + 0], + components[i*2 + 1]); + } + } else { + bool upper = (override == midgard_dest_override_upper); + + for (unsigned i = 0; i < 8; i++) { + if (mask & (1 << i)) + printf("%c", components[i + (upper ? 8 : 0)]); + } + } +} + +/* For 16-bit+ masks, we read off from the 8-bit mask field. For 16-bit (vec8), + * it's just one bit per channel, easy peasy. For 32-bit (vec4), it's one bit + * per channel with one duplicate bit in the middle. 
For 64-bit (vec2), it's + * one-bit per channel with _3_ duplicate bits in the middle. Basically, just + * subdividing the 128-bit word in 16-bit increments. For 64-bit, we uppercase + * the mask to make it obvious what happened */ + +static void +print_mask(uint8_t mask, unsigned bits, midgard_dest_override override) +{ + if (bits == 8) { + print_mask_vec16(mask, override); + return; + } + + /* Skip 'complete' masks */ + + if (bits >= 32 && mask == 0xFF) return; + + if (bits == 16) { + if (mask == 0x0F) + return; + else if (mask == 0xF0) { + printf("'"); + return; + } + } + + printf("."); + + unsigned skip = (bits / 16); + bool uppercase = bits > 32; + bool tripped = false; + + for (unsigned i = 0; i < 8; i += skip) { + bool a = (mask & (1 << i)) != 0; + + for (unsigned j = 1; j < skip; ++j) { + bool dupe = (mask & (1 << (i + j))) != 0; + tripped |= (dupe != a); + } + + if (a) { + char c = components[i / skip]; + + if (uppercase) + c = toupper(c); + + printf("%c", c); + } + } + + if (tripped) + printf(" /* %X */", mask); +} + +/* Prints the 4-bit masks found in texture and load/store ops, as opposed to + * the 8-bit masks found in (vector) ALU ops */ + +static void +print_mask_4(unsigned mask) +{ + if (mask == 0xF) return; + + printf("."); + + for (unsigned i = 0; i < 4; ++i) { + bool a = (mask & (1 << i)) != 0; + if (a) + printf("%c", components[i]); + } +} + +static void +print_vector_field(const char *name, uint16_t *words, uint16_t reg_word, + unsigned tabs) +{ + midgard_reg_info *reg_info = (midgard_reg_info *)®_word; + midgard_vector_alu *alu_field = (midgard_vector_alu *) words; + midgard_reg_mode mode = alu_field->reg_mode; + unsigned override = alu_field->dest_override; + + /* For now, prefix instruction names with their unit, until we + * understand how this works on a deeper level */ + printf("%s.", name); + + print_alu_opcode(alu_field->op); + + /* Postfix with the size to disambiguate if necessary */ + char postfix = prefix_for_bits(bits_for_mode(mode)); + bool size_ambiguous = override != midgard_dest_override_none; + + if (size_ambiguous) + printf("%c", postfix ? 
postfix : 'r'); + + /* Print the outmod, if there is one */ + print_outmod(alu_field->outmod, + midgard_is_integer_out_op(alu_field->op)); + + printf(" "); + + /* Mask denoting status of 8-lanes */ + uint8_t mask = alu_field->mask; + + /* First, print the destination */ + unsigned dest_size = + print_dest(reg_info->out_reg, mode, alu_field->dest_override); + + /* Apply the destination override to the mask */ + + if (mode == midgard_reg_mode_32 || mode == midgard_reg_mode_64) { + if (override == midgard_dest_override_lower) + mask &= 0x0F; + else if (override == midgard_dest_override_upper) + mask &= 0xF0; + } else if (mode == midgard_reg_mode_16 + && override == midgard_dest_override_lower) { + /* stub */ + } + + if (override != midgard_dest_override_none) { + bool modeable = (mode != midgard_reg_mode_8); + bool known = override != 0x3; /* Unused value */ + + if (!(modeable && known)) + printf("/* do%d */ ", override); + } + + print_mask(mask, dest_size, override); + + printf(", "); + + bool is_int = midgard_is_integer_op(alu_field->op); + print_vector_src(alu_field->src1, mode, reg_info->src1_reg, override, is_int); + + printf(", "); + + if (reg_info->src2_imm) { + uint16_t imm = decode_vector_imm(reg_info->src2_reg, alu_field->src2 >> 2); + print_immediate(imm); + } else { + print_vector_src(alu_field->src2, mode, + reg_info->src2_reg, override, is_int); + } + + printf("\n"); +} + +static void +print_scalar_src(unsigned src_binary, unsigned reg) +{ + midgard_scalar_alu_src *src = (midgard_scalar_alu_src *)&src_binary; + + if (src->negate) + printf("-"); + + if (src->abs) + printf("abs("); + + print_reg(reg, src->full ? 32 : 16); + + unsigned c = src->component; + + if (src->full) { + assert((c & 1) == 0); + c >>= 1; + } + + printf(".%c", components[c]); + + if (src->abs) + printf(")"); + +} + +static uint16_t +decode_scalar_imm(unsigned src2_reg, unsigned imm) +{ + uint16_t ret; + ret = src2_reg << 11; + ret |= (imm & 3) << 9; + ret |= (imm & 4) << 6; + ret |= (imm & 0x38) << 2; + ret |= imm >> 6; + return ret; +} + +static void +print_scalar_field(const char *name, uint16_t *words, uint16_t reg_word, + unsigned tabs) +{ + midgard_reg_info *reg_info = (midgard_reg_info *)®_word; + midgard_scalar_alu *alu_field = (midgard_scalar_alu *) words; + + if (alu_field->unknown) + printf("scalar ALU unknown bit set\n"); + + printf("%s.", name); + print_alu_opcode(alu_field->op); + print_outmod(alu_field->outmod, + midgard_is_integer_out_op(alu_field->op)); + printf(" "); + + bool full = alu_field->output_full; + print_reg(reg_info->out_reg, full ? 
32 : 16); + unsigned c = alu_field->output_component; + + if (full) { + assert((c & 1) == 0); + c >>= 1; + } + + printf(".%c, ", components[c]); + + print_scalar_src(alu_field->src1, reg_info->src1_reg); + + printf(", "); + + if (reg_info->src2_imm) { + uint16_t imm = decode_scalar_imm(reg_info->src2_reg, + alu_field->src2); + print_immediate(imm); + } else + print_scalar_src(alu_field->src2, reg_info->src2_reg); + + printf("\n"); +} + +static void +print_branch_op(int op) +{ + switch (op) { + case midgard_jmp_writeout_op_branch_uncond: + printf("uncond."); + break; + + case midgard_jmp_writeout_op_branch_cond: + printf("cond."); + break; + + case midgard_jmp_writeout_op_writeout: + printf("write."); + break; + + case midgard_jmp_writeout_op_tilebuffer_pending: + printf("tilebuffer."); + break; + + case midgard_jmp_writeout_op_discard: + printf("discard."); + break; + + default: + printf("unk%d.", op); + break; + } +} + +static void +print_branch_cond(int cond) +{ + switch (cond) { + case midgard_condition_write0: + printf("write0"); + break; + + case midgard_condition_false: + printf("false"); + break; + + case midgard_condition_true: + printf("true"); + break; + + case midgard_condition_always: + printf("always"); + break; + + default: + printf("unk%X", cond); + break; + } +} + +static void +print_compact_branch_writeout_field(uint16_t word) +{ + midgard_jmp_writeout_op op = word & 0x7; + + switch (op) { + case midgard_jmp_writeout_op_branch_uncond: { + midgard_branch_uncond br_uncond; + memcpy((char *) &br_uncond, (char *) &word, sizeof(br_uncond)); + printf("br.uncond "); + + if (br_uncond.unknown != 1) + printf("unknown:%d, ", br_uncond.unknown); + + if (br_uncond.offset >= 0) + printf("+"); + + printf("%d -> ", br_uncond.offset); + print_tag_short(br_uncond.dest_tag); + printf("\n"); + + break; + } + + case midgard_jmp_writeout_op_branch_cond: + case midgard_jmp_writeout_op_writeout: + case midgard_jmp_writeout_op_discard: + default: { + midgard_branch_cond br_cond; + memcpy((char *) &br_cond, (char *) &word, sizeof(br_cond)); + + printf("br."); + + print_branch_op(br_cond.op); + print_branch_cond(br_cond.cond); + + printf(" "); + + if (br_cond.offset >= 0) + printf("+"); + + printf("%d -> ", br_cond.offset); + print_tag_short(br_cond.dest_tag); + printf("\n"); + + break; + } + } +} + +static void +print_extended_branch_writeout_field(uint8_t *words) +{ + midgard_branch_extended br; + memcpy((char *) &br, (char *) words, sizeof(br)); + + printf("brx."); + + print_branch_op(br.op); + + /* Condition repeated 8 times in all known cases. Check this. 
*/ + + unsigned cond = br.cond & 0x3; + + for (unsigned i = 0; i < 16; i += 2) { + assert(((br.cond >> i) & 0x3) == cond); + } + + print_branch_cond(cond); + + if (br.unknown) + printf(".unknown%d", br.unknown); + + printf(" "); + + if (br.offset >= 0) + printf("+"); + + printf("%d -> ", br.offset); + print_tag_short(br.dest_tag); + printf("\n"); +} + +static unsigned +num_alu_fields_enabled(uint32_t control_word) +{ + unsigned ret = 0; + + if ((control_word >> 17) & 1) + ret++; + + if ((control_word >> 19) & 1) + ret++; + + if ((control_word >> 21) & 1) + ret++; + + if ((control_word >> 23) & 1) + ret++; + + if ((control_word >> 25) & 1) + ret++; + + return ret; +} + +static float +float_bitcast(uint32_t integer) +{ + union { + uint32_t i; + float f; + } v; + + v.i = integer; + return v.f; +} + +static void +print_alu_word(uint32_t *words, unsigned num_quad_words, + unsigned tabs) +{ + uint32_t control_word = words[0]; + uint16_t *beginning_ptr = (uint16_t *)(words + 1); + unsigned num_fields = num_alu_fields_enabled(control_word); + uint16_t *word_ptr = beginning_ptr + num_fields; + unsigned num_words = 2 + num_fields; + + if ((control_word >> 16) & 1) + printf("unknown bit 16 enabled\n"); + + if ((control_word >> 17) & 1) { + print_vector_field("vmul", word_ptr, *beginning_ptr, tabs); + beginning_ptr += 1; + word_ptr += 3; + num_words += 3; + } + + if ((control_word >> 18) & 1) + printf("unknown bit 18 enabled\n"); + + if ((control_word >> 19) & 1) { + print_scalar_field("sadd", word_ptr, *beginning_ptr, tabs); + beginning_ptr += 1; + word_ptr += 2; + num_words += 2; + } + + if ((control_word >> 20) & 1) + printf("unknown bit 20 enabled\n"); + + if ((control_word >> 21) & 1) { + print_vector_field("vadd", word_ptr, *beginning_ptr, tabs); + beginning_ptr += 1; + word_ptr += 3; + num_words += 3; + } + + if ((control_word >> 22) & 1) + printf("unknown bit 22 enabled\n"); + + if ((control_word >> 23) & 1) { + print_scalar_field("smul", word_ptr, *beginning_ptr, tabs); + beginning_ptr += 1; + word_ptr += 2; + num_words += 2; + } + + if ((control_word >> 24) & 1) + printf("unknown bit 24 enabled\n"); + + if ((control_word >> 25) & 1) { + print_vector_field("lut", word_ptr, *beginning_ptr, tabs); + beginning_ptr += 1; + word_ptr += 3; + num_words += 3; + } + + if ((control_word >> 26) & 1) { + print_compact_branch_writeout_field(*word_ptr); + word_ptr += 1; + num_words += 1; + } + + if ((control_word >> 27) & 1) { + print_extended_branch_writeout_field((uint8_t *) word_ptr); + word_ptr += 3; + num_words += 3; + } + + if (num_quad_words > (num_words + 7) / 8) { + assert(num_quad_words == (num_words + 15) / 8); + //Assume that the extra quadword is constants + void *consts = words + (4 * num_quad_words - 4); + + if (is_embedded_constant_int) { + if (is_embedded_constant_half) { + int16_t *sconsts = (int16_t *) consts; + printf("sconstants %d, %d, %d, %d\n", + sconsts[0], + sconsts[1], + sconsts[2], + sconsts[3]); + } else { + int32_t *iconsts = (int32_t *) consts; + printf("iconstants %d, %d, %d, %d\n", + iconsts[0], + iconsts[1], + iconsts[2], + iconsts[3]); + } + } else { + if (is_embedded_constant_half) { + uint16_t *hconsts = (uint16_t *) consts; + printf("hconstants %g, %g, %g, %g\n", + _mesa_half_to_float(hconsts[0]), + _mesa_half_to_float(hconsts[1]), + _mesa_half_to_float(hconsts[2]), + _mesa_half_to_float(hconsts[3])); + } else { + uint32_t *fconsts = (uint32_t *) consts; + printf("fconstants %g, %g, %g, %g\n", + float_bitcast(fconsts[0]), + float_bitcast(fconsts[1]), + 
float_bitcast(fconsts[2]), + float_bitcast(fconsts[3])); + } + + } + } +} + +static void +print_varying_parameters(midgard_load_store_word *word) +{ + midgard_varying_parameter param; + unsigned v = word->varying_parameters; + memcpy(¶m, &v, sizeof(param)); + + if (param.is_varying) { + /* If a varying, there are qualifiers */ + if (param.flat) + printf(".flat"); + + if (param.interpolation != midgard_interp_default) { + if (param.interpolation == midgard_interp_centroid) + printf(".centroid"); + else + printf(".interp%d", param.interpolation); + } + + if (param.modifier != midgard_varying_mod_none) { + if (param.modifier == midgard_varying_mod_perspective_w) + printf(".perspectivew"); + else if (param.modifier == midgard_varying_mod_perspective_z) + printf(".perspectivez"); + else + printf(".mod%d", param.modifier); + } + } else if (param.flat || param.interpolation || param.modifier) { + printf(" /* is_varying not set but varying metadata attached */"); + } + + if (param.zero0 || param.zero1 || param.zero2) + printf(" /* zero tripped, %d %d %d */ ", param.zero0, param.zero1, param.zero2); +} + +static bool +is_op_varying(unsigned op) +{ + switch (op) { + case midgard_op_st_vary_16: + case midgard_op_st_vary_32: + case midgard_op_ld_vary_16: + case midgard_op_ld_vary_32: + return true; + } + + return false; +} + +static void +print_load_store_instr(uint64_t data, + unsigned tabs) +{ + midgard_load_store_word *word = (midgard_load_store_word *) &data; + + print_ld_st_opcode(word->op); + + if (is_op_varying(word->op)) + print_varying_parameters(word); + + printf(" r%d", word->reg); + print_mask_4(word->mask); + + int address = word->address; + + if (word->op == midgard_op_ld_uniform_32) { + /* Uniforms use their own addressing scheme */ + + int lo = word->varying_parameters >> 7; + int hi = word->address; + + /* TODO: Combine fields logically */ + address = (hi << 3) | lo; + } + + printf(", %d", address); + + print_swizzle_vec4(word->swizzle, false, false); + + printf(", 0x%X /* %X */\n", word->unknown, word->varying_parameters); +} + +static void +print_load_store_word(uint32_t *word, unsigned tabs) +{ + midgard_load_store *load_store = (midgard_load_store *) word; + + if (load_store->word1 != 3) { + print_load_store_instr(load_store->word1, tabs); + } + + if (load_store->word2 != 3) { + print_load_store_instr(load_store->word2, tabs); + } +} + +static void +print_texture_reg(bool full, bool select, bool upper) +{ + if (full) + printf("r%d", REG_TEX_BASE + select); + else + printf("hr%d", (REG_TEX_BASE + select) * 2 + upper); + + if (full && upper) + printf("// error: out full / upper mutually exclusive\n"); + +} + +static void +print_texture_reg_triple(unsigned triple) +{ + bool full = triple & 1; + bool select = triple & 2; + bool upper = triple & 4; + + print_texture_reg(full, select, upper); +} + +static void +print_texture_format(int format) +{ + /* Act like a modifier */ + printf("."); + + switch (format) { + DEFINE_CASE(MALI_TEX_1D, "1d"); + DEFINE_CASE(MALI_TEX_2D, "2d"); + DEFINE_CASE(MALI_TEX_3D, "3d"); + DEFINE_CASE(MALI_TEX_CUBE, "cube"); + + default: + unreachable("Bad format"); + } +} + +static void +print_texture_op(unsigned op, bool gather) +{ + /* Act like a bare name, like ESSL functions */ + + if (gather) { + printf("textureGather"); + + unsigned component = op >> 4; + unsigned bottom = op & 0xF; + + if (bottom != 0x2) + printf("_unk%d", bottom); + + printf(".%c", components[component]); + return; + } + + switch (op) { + DEFINE_CASE(TEXTURE_OP_NORMAL, "texture"); + 
DEFINE_CASE(TEXTURE_OP_LOD, "textureLod"); + DEFINE_CASE(TEXTURE_OP_TEXEL_FETCH, "texelFetch"); + + default: + printf("tex_%d", op); + break; + } +} + +static bool +texture_op_takes_bias(unsigned op) +{ + return op == TEXTURE_OP_NORMAL; +} + +static char +sampler_type_name(enum mali_sampler_type t) +{ + switch (t) { + case MALI_SAMPLER_FLOAT: + return 'f'; + case MALI_SAMPLER_UNSIGNED: + return 'u'; + case MALI_SAMPLER_SIGNED: + return 'i'; + default: + return '?'; + } + +} + +#undef DEFINE_CASE + +static void +print_texture_word(uint32_t *word, unsigned tabs) +{ + midgard_texture_word *texture = (midgard_texture_word *) word; + + /* Broad category of texture operation in question */ + print_texture_op(texture->op, texture->is_gather); + + /* Specific format in question */ + print_texture_format(texture->format); + + assert(texture->zero == 0); + + /* Instruction "modifiers" parallel the ALU instructions. */ + + if (texture->shadow) + printf(".shadow"); + + if (texture->cont) + printf(".cont"); + + if (texture->last) + printf(".last"); + + printf(" "); + + print_texture_reg(texture->out_full, texture->out_reg_select, texture->out_upper); + print_mask_4(texture->mask); + printf(", "); + + printf("texture%d, ", texture->texture_handle); + + /* Print the type, GL style */ + printf("%c", sampler_type_name(texture->sampler_type)); + printf("sampler%d", texture->sampler_handle); + print_swizzle_vec4(texture->swizzle, false, false); + printf(", "); + + print_texture_reg(texture->in_reg_full, texture->in_reg_select, texture->in_reg_upper); + print_swizzle_vec4(texture->in_reg_swizzle, false, false); + + /* There is *always* an offset attached. Of + * course, that offset is just immediate #0 for a + * GLES call that doesn't take an offset. If there + * is a non-negative non-zero offset, this is + * specified in immediate offset mode, with the + * values in the offset_* fields as immediates. If + * this is a negative offset, we instead switch to + * a register offset mode, where the offset_* + * fields become register triplets */ + + if (texture->offset_register) { + printf(" + "); + print_texture_reg_triple(texture->offset_x); + + /* The less questions you ask, the better. */ + + unsigned swizzle_lo, swizzle_hi; + unsigned orig_y = texture->offset_y; + unsigned orig_z = texture->offset_z; + + memcpy(&swizzle_lo, &orig_y, sizeof(unsigned)); + memcpy(&swizzle_hi, &orig_z, sizeof(unsigned)); + + /* Duplicate hi swizzle over */ + assert(swizzle_hi < 4); + swizzle_hi = (swizzle_hi << 2) | swizzle_hi; + + unsigned swiz = (swizzle_lo << 4) | swizzle_hi; + unsigned reversed = util_bitreverse(swiz) >> 24; + print_swizzle_vec4(reversed, false, false); + + printf(", "); + } else if (texture->offset_x || texture->offset_y || texture->offset_z) { + /* Only select ops allow negative immediate offsets, verify */ + + bool neg_x = texture->offset_x < 0; + bool neg_y = texture->offset_y < 0; + bool neg_z = texture->offset_z < 0; + bool any_neg = neg_x || neg_y || neg_z; + + if (any_neg && texture->op != TEXTURE_OP_TEXEL_FETCH) + printf("/* invalid negative */ "); + + /* Regardless, just print the immediate offset */ + + printf(" + <%d, %d, %d>, ", + texture->offset_x, + texture->offset_y, + texture->offset_z); + } else { + printf(", "); + } + + char lod_operand = texture_op_takes_bias(texture->op) ? 
'+' : '='; + + if (texture->lod_register) { + midgard_tex_register_select sel; + uint8_t raw = texture->bias; + memcpy(&sel, &raw, sizeof(raw)); + + unsigned c = (sel.component_hi << 1) | sel.component_lo; + + printf("lod %c ", lod_operand); + print_texture_reg(sel.full, sel.select, sel.upper); + printf(".%c, ", components[c]); + + if (!sel.component_hi) + printf(" /* gradient? */"); + + if (texture->bias_int) + printf(" /* bias_int = 0x%X */", texture->bias_int); + + if (sel.zero) + printf(" /* sel.zero = 0x%X */", sel.zero); + } else if (texture->op == TEXTURE_OP_TEXEL_FETCH) { + /* For texel fetch, the int LOD is in the fractional place and + * there is no fraction / possibility of bias. We *always* have + * an explicit LOD, even if it's zero. */ + + if (texture->bias_int) + printf(" /* bias_int = 0x%X */ ", texture->bias_int); + + printf("lod = %d, ", texture->bias); + } else if (texture->bias || texture->bias_int) { + signed bias_int = texture->bias_int; + float bias_frac = texture->bias / 256.0f; + float bias = bias_int + bias_frac; + + bool is_bias = texture_op_takes_bias(texture->op); + char sign = (bias >= 0.0) ? '+' : '-'; + char operand = is_bias ? sign : '='; + + printf("lod %c %f, ", operand, fabsf(bias)); + } + + printf("\n"); + + /* While not zero in general, for these simple instructions the + * following unknowns are zero, so we don't include them */ + + if (texture->unknown2 || + texture->unknown4 || + texture->unknownA || + texture->unknown8) { + printf("// unknown2 = 0x%x\n", texture->unknown2); + printf("// unknown4 = 0x%x\n", texture->unknown4); + printf("// unknownA = 0x%x\n", texture->unknownA); + printf("// unknown8 = 0x%x\n", texture->unknown8); + } +} + +void +disassemble_midgard(uint8_t *code, size_t size) +{ + uint32_t *words = (uint32_t *) code; + unsigned num_words = size / 4; + int tabs = 0; + + bool prefetch_flag = false; + + unsigned i = 0; + + while (i < num_words) { + unsigned tag = words[i] & 0xF; + unsigned num_quad_words = midgard_word_size[tag]; + + switch (midgard_word_types[tag]) { + case midgard_word_type_texture: + print_texture_word(&words[i], tabs); + break; + + case midgard_word_type_load_store: + print_load_store_word(&words[i], tabs); + break; + + case midgard_word_type_alu: + print_alu_word(&words[i], num_quad_words, tabs); + + if (prefetch_flag) + return; + + /* Reset word static analysis state */ + is_embedded_constant_half = false; + is_embedded_constant_int = false; + + break; + + default: + printf("Unknown word type %u:\n", words[i] & 0xF); + num_quad_words = 1; + print_quad_word(&words[i], tabs); + printf("\n"); + break; + } + + printf("\n"); + + unsigned next = (words[i] & 0xF0) >> 4; + + i += 4 * num_quad_words; + + /* Break based on instruction prefetch flag */ + + if (i < num_words && next == 1) { + prefetch_flag = true; + + if (midgard_word_types[words[i] & 0xF] != midgard_word_type_alu) + return; + } + } + + return; +} diff --git a/src/panfrost/midgard/disassemble.h b/src/panfrost/midgard/disassemble.h new file mode 100644 index 00000000000..ab1837c201e --- /dev/null +++ b/src/panfrost/midgard/disassemble.h @@ -0,0 +1,2 @@ +#include <stddef.h> +void disassemble_midgard(uint8_t *code, size_t size); diff --git a/src/panfrost/midgard/helpers.h b/src/panfrost/midgard/helpers.h new file mode 100644 index 00000000000..ef854dc60c1 --- /dev/null +++ b/src/panfrost/midgard/helpers.h @@ -0,0 +1,282 @@ +/* Copyright (c) 2018-2019 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person 
obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef __MDG_HELPERS_H +#define __MDG_HELPERS_H + +#include "util/macros.h" +#include <string.h> + +#define OP_IS_STORE_VARY(op) (\ + op == midgard_op_st_vary_16 || \ + op == midgard_op_st_vary_32 \ + ) + +#define OP_IS_STORE(op) (\ + OP_IS_STORE_VARY(op) || \ + op == midgard_op_st_cubemap_coords \ + ) + +#define OP_IS_MOVE(op) ( \ + op == midgard_alu_op_fmov || \ + op == midgard_alu_op_imov \ + ) + +/* ALU control words are single bit fields with a lot of space */ + +#define ALU_ENAB_VEC_MUL (1 << 17) +#define ALU_ENAB_SCAL_ADD (1 << 19) +#define ALU_ENAB_VEC_ADD (1 << 21) +#define ALU_ENAB_SCAL_MUL (1 << 23) +#define ALU_ENAB_VEC_LUT (1 << 25) +#define ALU_ENAB_BR_COMPACT (1 << 26) +#define ALU_ENAB_BRANCH (1 << 27) + +/* Other opcode properties that don't conflict with the ALU_ENABs, non-ISA */ + +/* Denotes an opcode that takes a vector input with a fixed-number of + * channels, but outputs to only a single output channel, like dot products. + * For these, to determine the effective mask, this quirk can be set. We have + * an intentional off-by-one (a la MALI_POSITIVE), since 0-channel makes no + * sense but we need to fit 4 channels in 2-bits. Similarly, 1-channel doesn't + * make sense (since then why are we quirked?), so that corresponds to "no + * count set" */ + +#define OP_CHANNEL_COUNT(c) ((c - 1) << 0) +#define GET_CHANNEL_COUNT(c) ((c & (0x3 << 0)) ? ((c & (0x3 << 0)) + 1) : 0) + +/* For instructions that take a single argument, normally the first argument + * slot is used for the argument and the second slot is a dummy #0 constant. + * However, there are exceptions: instructions like fmov store their argument + * in the _second_ slot and store a dummy r24 in the first slot, designated by + * QUIRK_FLIPPED_R24 */ + +#define QUIRK_FLIPPED_R24 (1 << 2) + +/* Is the op commutative? */ +#define OP_COMMUTES (1 << 3) + +/* Does the op convert types between int- and float- space (i2f/f2u/etc) */ +#define OP_TYPE_CONVERT (1 << 4) + +/* Vector-independant shorthands for the above; these numbers are arbitrary and + * not from the ISA. 
Convert to the above with unit_enum_to_midgard */ + +#define UNIT_MUL 0 +#define UNIT_ADD 1 +#define UNIT_LUT 2 + +/* 4-bit type tags */ + +#define TAG_TEXTURE_4_VTX 0x2 +#define TAG_TEXTURE_4 0x3 +#define TAG_LOAD_STORE_4 0x5 +#define TAG_ALU_4 0x8 +#define TAG_ALU_8 0x9 +#define TAG_ALU_12 0xA +#define TAG_ALU_16 0xB + +static inline int +quadword_size(int tag) +{ + switch (tag) { + case TAG_ALU_4: + case TAG_LOAD_STORE_4: + case TAG_TEXTURE_4: + case TAG_TEXTURE_4_VTX: + return 1; + case TAG_ALU_8: + return 2; + case TAG_ALU_12: + return 3; + case TAG_ALU_16: + return 4; + default: + unreachable("Unknown tag"); + } +} + +#define IS_ALU(tag) (tag == TAG_ALU_4 || tag == TAG_ALU_8 || \ + tag == TAG_ALU_12 || tag == TAG_ALU_16) + +/* Special register aliases */ + +#define MAX_WORK_REGISTERS 16 + +/* Uniforms are begin at (REGISTER_UNIFORMS - uniform_count) */ +#define REGISTER_UNIFORMS 24 + +#define REGISTER_UNUSED 24 +#define REGISTER_CONSTANT 26 +#define REGISTER_VARYING_BASE 26 +#define REGISTER_OFFSET 27 +#define REGISTER_TEXTURE_BASE 28 +#define REGISTER_SELECT 31 + +/* SSA helper aliases to mimic the registers. UNUSED_0 encoded as an inline + * constant. UNUSED_1 encoded as REGISTER_UNUSED */ + +#define SSA_UNUSED_0 0 +#define SSA_UNUSED_1 -2 + +#define SSA_FIXED_SHIFT 24 +#define SSA_FIXED_REGISTER(reg) ((1 + reg) << SSA_FIXED_SHIFT) +#define SSA_REG_FROM_FIXED(reg) ((reg >> SSA_FIXED_SHIFT) - 1) +#define SSA_FIXED_MINIMUM SSA_FIXED_REGISTER(0) + +/* Swizzle support */ + +#define SWIZZLE(A, B, C, D) ((D << 6) | (C << 4) | (B << 2) | (A << 0)) +#define SWIZZLE_FROM_ARRAY(r) SWIZZLE(r[0], r[1], r[2], r[3]) +#define COMPONENT_X 0x0 +#define COMPONENT_Y 0x1 +#define COMPONENT_Z 0x2 +#define COMPONENT_W 0x3 + +#define SWIZZLE_XXXX SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X) +#define SWIZZLE_XYXX SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_X) +#define SWIZZLE_XYZX SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_X) +#define SWIZZLE_XYZW SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W) +#define SWIZZLE_XYXZ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_Z) +#define SWIZZLE_XYZZ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_Z) +#define SWIZZLE_WWWW SWIZZLE(COMPONENT_W, COMPONENT_W, COMPONENT_W, COMPONENT_W) + +static inline unsigned +swizzle_of(unsigned comp) +{ + switch (comp) { + case 1: + return SWIZZLE_XXXX; + case 2: + return SWIZZLE_XYXX; + case 3: + return SWIZZLE_XYZX; + case 4: + return SWIZZLE_XYZW; + default: + unreachable("Invalid component count"); + } +} + +static inline unsigned +mask_of(unsigned nr_comp) +{ + return (1 << nr_comp) - 1; +} + + +/* See ISA notes */ + +#define LDST_NOP (3) + +/* There are five ALU units: VMUL, VADD, SMUL, SADD, LUT. A given opcode is + * implemented on some subset of these units (or occassionally all of them). + * This table encodes a bit mask of valid units for each opcode, so the + * scheduler can figure where to plonk the instruction. 
*/ + +/* Shorthands for each unit */ +#define UNIT_VMUL ALU_ENAB_VEC_MUL +#define UNIT_SADD ALU_ENAB_SCAL_ADD +#define UNIT_VADD ALU_ENAB_VEC_ADD +#define UNIT_SMUL ALU_ENAB_SCAL_MUL +#define UNIT_VLUT ALU_ENAB_VEC_LUT + +/* Shorthands for usual combinations of units */ + +#define UNITS_MUL (UNIT_VMUL | UNIT_SMUL) +#define UNITS_ADD (UNIT_VADD | UNIT_SADD) +#define UNITS_MOST (UNITS_MUL | UNITS_ADD) +#define UNITS_ALL (UNITS_MOST | UNIT_VLUT) +#define UNITS_SCALAR (UNIT_SADD | UNIT_SMUL) +#define UNITS_VECTOR (UNIT_VMUL | UNIT_VADD) +#define UNITS_ANY_VECTOR (UNITS_VECTOR | UNIT_VLUT) + +struct mir_op_props { + const char *name; + unsigned props; +}; + +/* This file is common, so don't define the tables themselves. #include + * midgard_op.h if you need that, or edit midgard_ops.c directly */ + +/* Duplicate bits to convert a 4-bit writemask to duplicated 8-bit format, + * which is used for 32-bit vector units */ + +static inline unsigned +expand_writemask_32(unsigned mask) +{ + unsigned o = 0; + + for (int i = 0; i < 4; ++i) + if (mask & (1 << i)) + o |= (3 << (2 * i)); + + return o; +} + +/* Coerce structs to integer */ + +static inline unsigned +vector_alu_srco_unsigned(midgard_vector_alu_src src) +{ + unsigned u; + memcpy(&u, &src, sizeof(src)); + return u; +} + +static inline midgard_vector_alu_src +vector_alu_from_unsigned(unsigned u) +{ + midgard_vector_alu_src s; + memcpy(&s, &u, sizeof(s)); + return s; +} + +/* Composes two swizzles */ +static inline unsigned +pan_compose_swizzle(unsigned left, unsigned right) +{ + unsigned out = 0; + + for (unsigned c = 0; c < 4; ++c) { + unsigned s = (left >> (2*c)) & 0x3; + unsigned q = (right >> (2*s)) & 0x3; + + out |= (q << (2*c)); + } + + return out; +} + +/* Applies a swizzle to an ALU source */ + +static inline unsigned +vector_alu_apply_swizzle(unsigned src, unsigned swizzle) +{ + midgard_vector_alu_src s = + vector_alu_from_unsigned(src); + + s.swizzle = pan_compose_swizzle(s.swizzle, swizzle); + + return vector_alu_srco_unsigned(s); +} + +#endif diff --git a/src/panfrost/midgard/meson.build b/src/panfrost/midgard/meson.build new file mode 100644 index 00000000000..cbe26004e2d --- /dev/null +++ b/src/panfrost/midgard/meson.build @@ -0,0 +1,63 @@ +# Copyright © 2018 Rob Clark +# Copyright © 2019 Collabora + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +libpanfrost_midgard_files = files( + 'midgard_compile.c', + 'mir.c', + 'midgard_print.c', + 'midgard_schedule.c', + 'midgard_emit.c', + 'midgard_ra.c', + 'midgard_ra_pipeline.c', + 'midgard_liveness.c', + 'midgard_ops.c', + 'cppwrap.cpp', + 'disassemble.c', +) + +midgard_nir_algebraic_c = custom_target( + 'midgard_nir_algebraic.c', + input : 'midgard_nir_algebraic.py', + output : 'midgard_nir_algebraic.c', + command : [ + prog_python, '@INPUT@', + '-p', join_paths(meson.source_root(), 'src/compiler/nir/'), + ], + capture : true, + depend_files : nir_algebraic_py, +) + +libpanfrost_midgard = static_library( + 'panfrost_midgard', + [libpanfrost_midgard_files, midgard_nir_algebraic_c], + include_directories : [ + inc_common, + inc_include, + inc_src, + inc_panfrost_hw, + ], + dependencies: [ + idep_nir + ], + c_args : [c_vis_args, no_override_init_args], + cpp_args : [cpp_vis_args], + build_by_default : false, +) diff --git a/src/panfrost/midgard/midgard-parse.h b/src/panfrost/midgard/midgard-parse.h new file mode 100644 index 00000000000..5d134839406 --- /dev/null +++ b/src/panfrost/midgard/midgard-parse.h @@ -0,0 +1,70 @@ +/* Author(s): + * Connor Abbott + * Alyssa Rosenzweig + * + * Copyright (c) 2013 Connor Abbott ([email protected]) + * Copyright (c) 2018 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef __midgard_parse_h__ +#define __midgard_parse_h__ + +/* Additional metadata for parsing Midgard binaries, not needed for compilation */ + +static midgard_word_type midgard_word_types[16] = { + midgard_word_type_unknown, /* 0x0 */ + midgard_word_type_unknown, /* 0x1 */ + midgard_word_type_texture, /* 0x2 */ + midgard_word_type_texture, /* 0x3 */ + midgard_word_type_unknown, /* 0x4 */ + midgard_word_type_load_store, /* 0x5 */ + midgard_word_type_unknown, /* 0x6 */ + midgard_word_type_unknown, /* 0x7 */ + midgard_word_type_alu, /* 0x8 */ + midgard_word_type_alu, /* 0x9 */ + midgard_word_type_alu, /* 0xA */ + midgard_word_type_alu, /* 0xB */ + midgard_word_type_alu, /* 0xC */ + midgard_word_type_alu, /* 0xD */ + midgard_word_type_alu, /* 0xE */ + midgard_word_type_alu, /* 0xF */ +}; + +static unsigned midgard_word_size[16] = { + 0, /* 0x0 */ + 0, /* 0x1 */ + 1, /* 0x2 */ + 1, /* 0x3 */ + 0, /* 0x4 */ + 1, /* 0x5 */ + 0, /* 0x6 */ + 0, /* 0x7 */ + 1, /* 0x8 */ + 2, /* 0x9 */ + 3, /* 0xA */ + 4, /* 0xB */ + 1, /* 0xC */ + 2, /* 0xD */ + 3, /* 0xE */ + 4, /* 0xF */ +}; + +#endif diff --git a/src/panfrost/midgard/midgard.h b/src/panfrost/midgard/midgard.h new file mode 100644 index 00000000000..5953214c599 --- /dev/null +++ b/src/panfrost/midgard/midgard.h @@ -0,0 +1,646 @@ +/* Author(s): + * Connor Abbott + * Alyssa Rosenzweig + * + * Copyright (c) 2013 Connor Abbott ([email protected]) + * Copyright (c) 2018 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef __midgard_h__ +#define __midgard_h__ + +#include <stdint.h> +#include <stdbool.h> +#include "panfrost-job.h" + +#define MIDGARD_DBG_MSGS 0x0001 +#define MIDGARD_DBG_SHADERS 0x0002 +#define MIDGARD_DBG_SHADERDB 0x0004 + +extern int midgard_debug; + +typedef enum { + midgard_word_type_alu, + midgard_word_type_load_store, + midgard_word_type_texture, + midgard_word_type_unknown +} midgard_word_type; + +typedef enum { + midgard_alu_vmul, + midgard_alu_sadd, + midgard_alu_smul, + midgard_alu_vadd, + midgard_alu_lut +} midgard_alu; + +/* + * ALU words + */ + +typedef enum { + midgard_alu_op_fadd = 0x10, + midgard_alu_op_fmul = 0x14, + + midgard_alu_op_fmin = 0x28, + midgard_alu_op_fmax = 0x2C, + + midgard_alu_op_fmov = 0x30, /* fmov_rte */ + midgard_alu_op_fmov_rtz = 0x31, + midgard_alu_op_fmov_rtn = 0x32, + midgard_alu_op_fmov_rtp = 0x33, + midgard_alu_op_froundeven = 0x34, + midgard_alu_op_ftrunc = 0x35, + midgard_alu_op_ffloor = 0x36, + midgard_alu_op_fceil = 0x37, + midgard_alu_op_ffma = 0x38, + midgard_alu_op_fdot3 = 0x3C, + midgard_alu_op_fdot3r = 0x3D, + midgard_alu_op_fdot4 = 0x3E, + midgard_alu_op_freduce = 0x3F, + + midgard_alu_op_iadd = 0x40, + midgard_alu_op_ishladd = 0x41, + midgard_alu_op_isub = 0x46, + midgard_alu_op_iaddsat = 0x48, + midgard_alu_op_uaddsat = 0x49, + midgard_alu_op_isubsat = 0x4E, + midgard_alu_op_usubsat = 0x4F, + + midgard_alu_op_imul = 0x58, + + midgard_alu_op_imin = 0x60, + midgard_alu_op_umin = 0x61, + midgard_alu_op_imax = 0x62, + midgard_alu_op_umax = 0x63, + midgard_alu_op_ihadd = 0x64, + midgard_alu_op_uhadd = 0x65, + midgard_alu_op_irhadd = 0x66, + midgard_alu_op_urhadd = 0x67, + midgard_alu_op_iasr = 0x68, + midgard_alu_op_ilsr = 0x69, + midgard_alu_op_ishl = 0x6E, + + midgard_alu_op_iand = 0x70, + midgard_alu_op_ior = 0x71, + midgard_alu_op_inand = 0x72, /* ~(a & b), for inot let a = b */ + midgard_alu_op_inor = 0x73, /* ~(a | b) */ + midgard_alu_op_iandnot = 0x74, /* (a & ~b), used for not/b2f */ + midgard_alu_op_iornot = 0x75, /* (a | ~b) */ + midgard_alu_op_ixor = 0x76, + midgard_alu_op_inxor = 0x77, /* ~(a & b) */ + midgard_alu_op_iclz = 0x78, /* Number of zeroes on left */ + midgard_alu_op_ibitcount8 = 0x7A, /* Counts bits in 8-bit increments */ + midgard_alu_op_imov = 0x7B, + midgard_alu_op_iabsdiff = 0x7C, + midgard_alu_op_uabsdiff = 0x7D, + midgard_alu_op_ichoose = 0x7E, /* vector, component number - dupe for shuffle() */ + + midgard_alu_op_feq = 0x80, + midgard_alu_op_fne = 0x81, + midgard_alu_op_flt = 0x82, + midgard_alu_op_fle = 0x83, + midgard_alu_op_fball_eq = 0x88, + midgard_alu_op_bball_eq = 0x89, + midgard_alu_op_fball_lt = 0x8A, /* all(lessThan(.., ..)) */ + midgard_alu_op_fball_lte = 0x8B, /* all(lessThanEqual(.., ..)) */ + + midgard_alu_op_bbany_neq = 0x90, /* used for bvec4(1) */ + midgard_alu_op_fbany_neq = 0x91, /* bvec4(0) also */ + midgard_alu_op_fbany_lt = 0x92, /* any(lessThan(.., ..)) */ + midgard_alu_op_fbany_lte = 0x93, /* any(lessThanEqual(.., ..)) */ + + midgard_alu_op_f2i_rte = 0x98, + midgard_alu_op_f2i_rtz = 0x99, + midgard_alu_op_f2i_rtn = 0x9A, + midgard_alu_op_f2i_rtp = 0x9B, + midgard_alu_op_f2u_rte = 0x9C, + midgard_alu_op_f2u_rtz = 0x9D, + midgard_alu_op_f2u_rtn = 0x9E, + midgard_alu_op_f2u_rtp = 0x9F, + + midgard_alu_op_ieq = 0xA0, + midgard_alu_op_ine = 0xA1, + midgard_alu_op_ult = 0xA2, + midgard_alu_op_ule = 0xA3, + midgard_alu_op_ilt = 0xA4, + midgard_alu_op_ile = 0xA5, + midgard_alu_op_iball_eq = 0xA8, + midgard_alu_op_iball_neq = 0xA9, + midgard_alu_op_uball_lt = 0xAA, + 
midgard_alu_op_uball_lte = 0xAB, + midgard_alu_op_iball_lt = 0xAC, + midgard_alu_op_iball_lte = 0xAD, + + midgard_alu_op_ibany_eq = 0xB0, + midgard_alu_op_ibany_neq = 0xB1, + midgard_alu_op_ubany_lt = 0xB2, + midgard_alu_op_ubany_lte = 0xB3, + midgard_alu_op_ibany_lt = 0xB4, /* any(lessThan(.., ..)) */ + midgard_alu_op_ibany_lte = 0xB5, /* any(lessThanEqual(.., ..)) */ + midgard_alu_op_i2f_rte = 0xB8, + midgard_alu_op_i2f_rtz = 0xB9, + midgard_alu_op_i2f_rtn = 0xBA, + midgard_alu_op_i2f_rtp = 0xBB, + midgard_alu_op_u2f_rte = 0xBC, + midgard_alu_op_u2f_rtz = 0xBD, + midgard_alu_op_u2f_rtn = 0xBE, + midgard_alu_op_u2f_rtp = 0xBF, + + midgard_alu_op_icsel_v = 0xC0, /* condition code r31 */ + midgard_alu_op_icsel = 0xC1, /* condition code r31.w */ + midgard_alu_op_fcsel_v = 0xC4, + midgard_alu_op_fcsel = 0xC5, + midgard_alu_op_fround = 0xC6, + + midgard_alu_op_fatan_pt2 = 0xE8, + midgard_alu_op_fpow_pt1 = 0xEC, + midgard_alu_op_fpown_pt1 = 0xED, + midgard_alu_op_fpowr_pt1 = 0xEE, + + midgard_alu_op_frcp = 0xF0, + midgard_alu_op_frsqrt = 0xF2, + midgard_alu_op_fsqrt = 0xF3, + midgard_alu_op_fexp2 = 0xF4, + midgard_alu_op_flog2 = 0xF5, + midgard_alu_op_fsin = 0xF6, + midgard_alu_op_fcos = 0xF7, + midgard_alu_op_fatan2_pt1 = 0xF9, +} midgard_alu_op; + +typedef enum { + midgard_outmod_none = 0, + midgard_outmod_pos = 1, + /* 0x2 unknown */ + midgard_outmod_sat = 3 +} midgard_outmod_float; + +typedef enum { + midgard_outmod_int_saturate = 0, + midgard_outmod_uint_saturate = 1, + midgard_outmod_int_wrap = 2, + midgard_outmod_int_high = 3, /* Overflowed portion */ +} midgard_outmod_int; + +typedef enum { + midgard_reg_mode_8 = 0, + midgard_reg_mode_16 = 1, + midgard_reg_mode_32 = 2, + midgard_reg_mode_64 = 3 +} midgard_reg_mode; + +typedef enum { + midgard_dest_override_lower = 0, + midgard_dest_override_upper = 1, + midgard_dest_override_none = 2 +} midgard_dest_override; + +typedef enum { + midgard_int_sign_extend = 0, + midgard_int_zero_extend = 1, + midgard_int_normal = 2, + midgard_int_shift = 3 +} midgard_int_mod; + +#define MIDGARD_FLOAT_MOD_ABS (1 << 0) +#define MIDGARD_FLOAT_MOD_NEG (1 << 1) + +typedef struct +__attribute__((__packed__)) +{ + /* Either midgard_int_mod or from midgard_float_mod_*, depending on the + * type of op */ + unsigned mod : 2; + + /* replicate lower half if dest = half, or low/high half selection if + * dest = full + */ + bool rep_low : 1; + bool rep_high : 1; /* unused if dest = full */ + bool half : 1; /* only matters if dest = full */ + unsigned swizzle : 8; +} +midgard_vector_alu_src; + +typedef struct +__attribute__((__packed__)) +{ + midgard_alu_op op : 8; + midgard_reg_mode reg_mode : 2; + unsigned src1 : 13; + unsigned src2 : 13; + midgard_dest_override dest_override : 2; + midgard_outmod_float outmod : 2; + unsigned mask : 8; +} +midgard_vector_alu; + +typedef struct +__attribute__((__packed__)) +{ + bool abs : 1; + bool negate : 1; + bool full : 1; /* 0 = half, 1 = full */ + unsigned component : 3; +} +midgard_scalar_alu_src; + +typedef struct +__attribute__((__packed__)) +{ + midgard_alu_op op : 8; + unsigned src1 : 6; + unsigned src2 : 11; + unsigned unknown : 1; + unsigned outmod : 2; + bool output_full : 1; + unsigned output_component : 3; +} +midgard_scalar_alu; + +typedef struct +__attribute__((__packed__)) +{ + unsigned src1_reg : 5; + unsigned src2_reg : 5; + unsigned out_reg : 5; + bool src2_imm : 1; +} +midgard_reg_info; + +/* In addition to conditional branches and jumps (unconditional branches), + * Midgard implements a bit of fixed function 
functionality used in fragment + * shaders via specially crafted branches. These have special branch opcodes, + * which perform a fixed-function operation and/or use the results of a + * fixed-function operation as the branch condition. */ + +typedef enum { + /* Regular branches */ + midgard_jmp_writeout_op_branch_uncond = 1, + midgard_jmp_writeout_op_branch_cond = 2, + + /* In a fragment shader, execute a discard_if instruction, with the + * corresponding condition code. Terminates the shader, so generally + * set the branch target to out of the shader */ + midgard_jmp_writeout_op_discard = 4, + + /* Branch if the tilebuffer is not yet ready. At the beginning of a + * fragment shader that reads from the tile buffer, for instance via + * ARM_shader_framebuffer_fetch or EXT_pixel_local_storage, this branch + * operation should be used as a loop. An instruction like + * "br.tilebuffer.always -1" does the trick, corresponding to + * "while(!is_tilebuffer_ready) */ + midgard_jmp_writeout_op_tilebuffer_pending = 6, + + /* In a fragment shader, try to write out the value pushed to r0 to the + * tilebuffer, subject to unknown state in r1.z and r1.w. If this + * succeeds, the shader terminates. If it fails, it branches to the + * specified branch target. Generally, this should be used in a loop to + * itself, acting as "do { write(r0); } while(!write_successful);" */ + midgard_jmp_writeout_op_writeout = 7, +} midgard_jmp_writeout_op; + +typedef enum { + midgard_condition_write0 = 0, + + /* These condition codes denote a conditional branch on FALSE and on + * TRUE respectively */ + midgard_condition_false = 1, + midgard_condition_true = 2, + + /* This condition code always branches. For a pure branch, the + * unconditional branch coding should be used instead, but for + * fixed-function branch opcodes, this is still useful */ + midgard_condition_always = 3, +} midgard_condition; + +typedef struct +__attribute__((__packed__)) +{ + midgard_jmp_writeout_op op : 3; /* == branch_uncond */ + unsigned dest_tag : 4; /* tag of branch destination */ + unsigned unknown : 2; + int offset : 7; +} +midgard_branch_uncond; + +typedef struct +__attribute__((__packed__)) +{ + midgard_jmp_writeout_op op : 3; /* == branch_cond */ + unsigned dest_tag : 4; /* tag of branch destination */ + int offset : 7; + midgard_condition cond : 2; +} +midgard_branch_cond; + +typedef struct +__attribute__((__packed__)) +{ + midgard_jmp_writeout_op op : 3; /* == branch_cond */ + unsigned dest_tag : 4; /* tag of branch destination */ + unsigned unknown : 2; + signed offset : 23; + unsigned cond : 16; +} +midgard_branch_extended; + +typedef struct +__attribute__((__packed__)) +{ + midgard_jmp_writeout_op op : 3; /* == writeout */ + unsigned unknown : 13; +} +midgard_writeout; + +/* + * Load/store words + */ + +typedef enum { + midgard_op_ld_st_noop = 0x03, + + /* Unclear why this is on the L/S unit, but (with an address of 0, + * appropriate swizzle, magic constant 0x24, and xy mask?) moves fp32 cube + * map coordinates in r27 to its cube map texture coordinate + * destination (e.g r29). 0x4 magic for lding from fp16 instead */ + + midgard_op_st_cubemap_coords = 0x0E, + + /* Used in OpenCL. Probably can ld other things as well */ + midgard_op_ld_global_id = 0x10, + + /* The L/S unit can do perspective division a clock faster than the ALU + * if you're lucky. Put the vec4 in r27, and call with 0x24 as the + * unknown state; the output will be <x/w, y/w, z/w, 1>. 
Replace w with + * z for the z version */ + midgard_op_ldst_perspective_division_z = 0x12, + midgard_op_ldst_perspective_division_w = 0x13, + + /* val in r27.y, address embedded, outputs result to argument. Invert val for sub. Let val = +-1 for inc/dec. */ + midgard_op_atomic_add = 0x40, + midgard_op_atomic_and = 0x44, + midgard_op_atomic_or = 0x48, + midgard_op_atomic_xor = 0x4C, + + midgard_op_atomic_imin = 0x50, + midgard_op_atomic_umin = 0x54, + midgard_op_atomic_imax = 0x58, + midgard_op_atomic_umax = 0x5C, + + midgard_op_atomic_xchg = 0x60, + + /* Used for compute shader's __global arguments, __local variables (or + * for register spilling) */ + + midgard_op_ld_char = 0x81, + midgard_op_ld_char2 = 0x84, + midgard_op_ld_short = 0x85, + midgard_op_ld_char4 = 0x88, /* short2, int, float */ + midgard_op_ld_short4 = 0x8C, /* int2, float2, long */ + midgard_op_ld_int4 = 0x90, /* float4, long2 */ + + midgard_op_ld_attr_32 = 0x94, + midgard_op_ld_attr_16 = 0x95, + midgard_op_ld_attr_32u = 0x96, + midgard_op_ld_attr_32i = 0x97, + midgard_op_ld_vary_32 = 0x98, + midgard_op_ld_vary_16 = 0x99, + midgard_op_ld_vary_32u = 0x9A, + midgard_op_ld_vary_32i = 0x9B, + midgard_op_ld_color_buffer_16 = 0x9D, + + midgard_op_ld_uniform_16 = 0xAC, + midgard_op_ld_uniform_32i = 0xA8, + + midgard_op_ld_uniform_32 = 0xB0, + midgard_op_ld_color_buffer_8 = 0xBA, + + midgard_op_st_char = 0xC0, + midgard_op_st_char2 = 0xC4, /* short */ + midgard_op_st_char4 = 0xC8, /* short2, int, float */ + midgard_op_st_short4 = 0xCC, /* int2, float2, long */ + midgard_op_st_int4 = 0xD0, /* float4, long2 */ + + midgard_op_st_vary_32 = 0xD4, + midgard_op_st_vary_16 = 0xD5, + midgard_op_st_vary_32u = 0xD6, + midgard_op_st_vary_32i = 0xD7, + + /* Value to st in r27, location r26.w as short2 */ + midgard_op_st_image_f = 0xD8, + midgard_op_st_image_ui = 0xDA, + midgard_op_st_image_i = 0xDB, +} midgard_load_store_op; + +typedef enum { + midgard_interp_centroid = 1, + midgard_interp_default = 2 +} midgard_interpolation; + +typedef enum { + midgard_varying_mod_none = 0, + + /* Other values unknown */ + + /* Take the would-be result and divide all components by its z/w + * (perspective division baked in with the load) */ + midgard_varying_mod_perspective_z = 2, + midgard_varying_mod_perspective_w = 3, +} midgard_varying_modifier; + +typedef struct +__attribute__((__packed__)) +{ + unsigned zero0 : 1; /* Always zero */ + + midgard_varying_modifier modifier : 2; + + unsigned zero1: 1; /* Always zero */ + + /* Varying qualifiers, zero if not a varying */ + unsigned flat : 1; + unsigned is_varying : 1; /* Always one for varying, but maybe something else? 
*/ + midgard_interpolation interpolation : 2; + + unsigned zero2 : 2; /* Always zero */ +} +midgard_varying_parameter; + +typedef struct +__attribute__((__packed__)) +{ + midgard_load_store_op op : 8; + unsigned reg : 5; + unsigned mask : 4; + unsigned swizzle : 8; + unsigned unknown : 16; + + unsigned varying_parameters : 10; + + unsigned address : 9; +} +midgard_load_store_word; + +typedef struct +__attribute__((__packed__)) +{ + unsigned type : 4; + unsigned next_type : 4; + uint64_t word1 : 60; + uint64_t word2 : 60; +} +midgard_load_store; + +/* 8-bit register selector used in texture ops to select a bias/LOD/gradient + * register, shoved into the `bias` field */ + +typedef struct +__attribute__((__packed__)) +{ + /* Combines with component_hi to form 2-bit component select out of + * xyzw, as the component for bias/LOD and the starting component of a + * gradient vector */ + + unsigned component_lo : 1; + + /* Register select between r28/r29 */ + unsigned select : 1; + + /* For a half-register, selects the upper half */ + unsigned upper : 1; + + /* Specifies a full-register, clear for a half-register. Mutually + * exclusive with upper. */ + unsigned full : 1; + + /* Higher half of component_lo. Always seen to be set for LOD/bias + * and clear for processed gradients, but I'm not sure if that's a + * hardware requirement. */ + unsigned component_hi : 1; + + /* Padding to make this 8-bit */ + unsigned zero : 3; +} +midgard_tex_register_select; + +/* Texture pipeline results are in r28-r29 */ +#define REG_TEX_BASE 28 + +/* Texture opcodes... maybe? */ +#define TEXTURE_OP_NORMAL 0x11 /* texture */ +#define TEXTURE_OP_LOD 0x12 /* textureLod */ +#define TEXTURE_OP_TEXEL_FETCH 0x14 /* texelFetch */ + +enum mali_sampler_type { + MALI_SAMPLER_UNK = 0x0, + MALI_SAMPLER_FLOAT = 0x1, /* sampler */ + MALI_SAMPLER_UNSIGNED = 0x2, /* usampler */ + MALI_SAMPLER_SIGNED = 0x3, /* isampler */ +}; + +typedef struct +__attribute__((__packed__)) +{ + unsigned type : 4; + unsigned next_type : 4; + + unsigned op : 6; + unsigned shadow : 1; + unsigned is_gather : 1; + + /* A little obscure, but last is set for the last texture operation in + * a shader. cont appears to just be last's opposite (?). Yeah, I know, + * kind of funky.. BiOpen thinks it could do with memory hinting, or + * tile locking? */ + + unsigned cont : 1; + unsigned last : 1; + + enum mali_texture_type format : 2; + unsigned zero : 2; + + /* Is a register used to specify the + * LOD/bias/offset? If set, use the `bias` field as + * a register index. If clear, use the `bias` field + * as an immediate. */ + unsigned lod_register : 1; + + /* Is a register used to specify an offset? If set, use the + * offset_reg_* fields to encode this, duplicated for each of the + * components. If clear, there is implcitly always an immediate offst + * specificed in offset_imm_* */ + unsigned offset_register : 1; + + unsigned in_reg_full : 1; + unsigned in_reg_select : 1; + unsigned in_reg_upper : 1; + unsigned in_reg_swizzle : 8; + + unsigned unknown8 : 2; + + unsigned out_full : 1; + + enum mali_sampler_type sampler_type : 2; + + unsigned out_reg_select : 1; + unsigned out_upper : 1; + + unsigned mask : 4; + + unsigned unknown2 : 2; + + unsigned swizzle : 8; + unsigned unknown4 : 8; + + unsigned unknownA : 4; + + /* In immediate mode, each offset field is an immediate range [0, 7]. + * + * In register mode, offset_x becomes a register full / select / upper + * triplet and a vec3 swizzle is splattered across offset_y/offset_z in + * a genuinely bizarre way. 
+ * + * For texel fetches in immediate mode, the range is the full [-8, 7], + * but for normal texturing the top bit must be zero and a register + * used instead. It's not clear where this limitation is from. */ + + signed offset_x : 4; + signed offset_y : 4; + signed offset_z : 4; + + /* In immediate bias mode, for a normal texture op, this is + * texture bias, computed as int(2^8 * frac(biasf)), with + * bias_int = floor(bias). For a textureLod, it's that, but + * s/bias/lod. For a texel fetch, this is the LOD as-is. + * + * In register mode, this is a midgard_tex_register_select + * structure and bias_int is zero */ + + unsigned bias : 8; + signed bias_int : 8; + + unsigned texture_handle : 16; + unsigned sampler_handle : 16; +} +midgard_texture_word; + +#endif diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c new file mode 100644 index 00000000000..9c1349094bd --- /dev/null +++ b/src/panfrost/midgard/midgard_compile.c @@ -0,0 +1,2901 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <err.h> + +#include "main/mtypes.h" +#include "compiler/glsl/glsl_to_nir.h" +#include "compiler/nir_types.h" +#include "main/imports.h" +#include "compiler/nir/nir_builder.h" +#include "util/half_float.h" +#include "util/u_math.h" +#include "util/u_debug.h" +#include "util/u_dynarray.h" +#include "util/list.h" +#include "main/mtypes.h" + +#include "midgard.h" +#include "midgard_nir.h" +#include "midgard_compile.h" +#include "midgard_ops.h" +#include "helpers.h" +#include "compiler.h" + +#include "disassemble.h" + +static const struct debug_named_value debug_options[] = { + {"msgs", MIDGARD_DBG_MSGS, "Print debug messages"}, + {"shaders", MIDGARD_DBG_SHADERS, "Dump shaders in NIR and MIR"}, + {"shaderdb", MIDGARD_DBG_SHADERDB, "Prints shader-db statistics"}, + DEBUG_NAMED_VALUE_END +}; + +DEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", debug_options, 0) + +unsigned SHADER_DB_COUNT = 0; + +int midgard_debug = 0; + +#define DBG(fmt, ...) 
\ + do { if (midgard_debug & MIDGARD_DBG_MSGS) \ + fprintf(stderr, "%s:%d: "fmt, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) + +static bool +midgard_is_branch_unit(unsigned unit) +{ + return (unit == ALU_ENAB_BRANCH) || (unit == ALU_ENAB_BR_COMPACT); +} + +static void +midgard_block_add_successor(midgard_block *block, midgard_block *successor) +{ + block->successors[block->nr_successors++] = successor; + assert(block->nr_successors <= ARRAY_SIZE(block->successors)); +} + +/* Helpers to generate midgard_instruction's using macro magic, since every + * driver seems to do it that way */ + +#define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__)); + +#define M_LOAD_STORE(name, rname, uname) \ + static midgard_instruction m_##name(unsigned ssa, unsigned address) { \ + midgard_instruction i = { \ + .type = TAG_LOAD_STORE_4, \ + .mask = 0xF, \ + .ssa_args = { \ + .rname = ssa, \ + .uname = -1, \ + .src1 = -1 \ + }, \ + .load_store = { \ + .op = midgard_op_##name, \ + .swizzle = SWIZZLE_XYZW, \ + .address = address \ + } \ + }; \ + \ + return i; \ + } + +#define M_LOAD(name) M_LOAD_STORE(name, dest, src0) +#define M_STORE(name) M_LOAD_STORE(name, src0, dest) + +/* Inputs a NIR ALU source, with modifiers attached if necessary, and outputs + * the corresponding Midgard source */ + +static midgard_vector_alu_src +vector_alu_modifiers(nir_alu_src *src, bool is_int, unsigned broadcast_count, + bool half, bool sext) +{ + if (!src) return blank_alu_src; + + /* Figure out how many components there are so we can adjust the + * swizzle. Specifically we want to broadcast the last channel so + * things like ball2/3 work + */ + + if (broadcast_count) { + uint8_t last_component = src->swizzle[broadcast_count - 1]; + + for (unsigned c = broadcast_count; c < NIR_MAX_VEC_COMPONENTS; ++c) { + src->swizzle[c] = last_component; + } + } + + midgard_vector_alu_src alu_src = { + .rep_low = 0, + .rep_high = 0, + .half = half, + .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle) + }; + + if (is_int) { + alu_src.mod = midgard_int_normal; + + /* Sign/zero-extend if needed */ + + if (half) { + alu_src.mod = sext ? + midgard_int_sign_extend + : midgard_int_zero_extend; + } + + /* These should have been lowered away */ + assert(!(src->abs || src->negate)); + } else { + alu_src.mod = (src->abs << 0) | (src->negate << 1); + } + + return alu_src; +} + +/* load/store instructions have both 32-bit and 16-bit variants, depending on + * whether we are using vectors composed of highp or mediump. At the moment, we + * don't support half-floats -- this requires changes in other parts of the + * compiler -- therefore the 16-bit versions are commented out. 
*/ + +//M_LOAD(ld_attr_16); +M_LOAD(ld_attr_32); +//M_LOAD(ld_vary_16); +M_LOAD(ld_vary_32); +//M_LOAD(ld_uniform_16); +M_LOAD(ld_uniform_32); +M_LOAD(ld_color_buffer_8); +//M_STORE(st_vary_16); +M_STORE(st_vary_32); +M_STORE(st_cubemap_coords); + +static midgard_instruction +v_alu_br_compact_cond(midgard_jmp_writeout_op op, unsigned tag, signed offset, unsigned cond) +{ + midgard_branch_cond branch = { + .op = op, + .dest_tag = tag, + .offset = offset, + .cond = cond + }; + + uint16_t compact; + memcpy(&compact, &branch, sizeof(branch)); + + midgard_instruction ins = { + .type = TAG_ALU_4, + .unit = ALU_ENAB_BR_COMPACT, + .prepacked_branch = true, + .compact_branch = true, + .br_compact = compact + }; + + if (op == midgard_jmp_writeout_op_writeout) + ins.writeout = true; + + return ins; +} + +static midgard_instruction +v_branch(bool conditional, bool invert) +{ + midgard_instruction ins = { + .type = TAG_ALU_4, + .unit = ALU_ENAB_BRANCH, + .compact_branch = true, + .branch = { + .conditional = conditional, + .invert_conditional = invert + } + }; + + return ins; +} + +static midgard_branch_extended +midgard_create_branch_extended( midgard_condition cond, + midgard_jmp_writeout_op op, + unsigned dest_tag, + signed quadword_offset) +{ + /* For unclear reasons, the condition code is repeated 8 times */ + uint16_t duplicated_cond = + (cond << 14) | + (cond << 12) | + (cond << 10) | + (cond << 8) | + (cond << 6) | + (cond << 4) | + (cond << 2) | + (cond << 0); + + midgard_branch_extended branch = { + .op = op, + .dest_tag = dest_tag, + .offset = quadword_offset, + .cond = duplicated_cond + }; + + return branch; +} + +static void +attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name) +{ + ins->has_constants = true; + memcpy(&ins->constants, constants, 16); +} + +static int +glsl_type_size(const struct glsl_type *type, bool bindless) +{ + return glsl_count_attribute_slots(type, false); +} + +/* Lower fdot2 to a vector multiplication followed by channel addition */ +static void +midgard_nir_lower_fdot2_body(nir_builder *b, nir_alu_instr *alu) +{ + if (alu->op != nir_op_fdot2) + return; + + b->cursor = nir_before_instr(&alu->instr); + + nir_ssa_def *src0 = nir_ssa_for_alu_src(b, alu, 0); + nir_ssa_def *src1 = nir_ssa_for_alu_src(b, alu, 1); + + nir_ssa_def *product = nir_fmul(b, src0, src1); + + nir_ssa_def *sum = nir_fadd(b, + nir_channel(b, product, 0), + nir_channel(b, product, 1)); + + /* Replace the fdot2 with this sum */ + nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(sum)); +} + +static int +midgard_nir_sysval_for_intrinsic(nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_load_viewport_scale: + return PAN_SYSVAL_VIEWPORT_SCALE; + case nir_intrinsic_load_viewport_offset: + return PAN_SYSVAL_VIEWPORT_OFFSET; + default: + return -1; + } +} + +static unsigned +nir_dest_index(compiler_context *ctx, nir_dest *dst) +{ + if (dst->is_ssa) + return dst->ssa.index; + else { + assert(!dst->reg.indirect); + return ctx->func->impl->ssa_alloc + dst->reg.reg->index; + } +} + +static int sysval_for_instr(compiler_context *ctx, nir_instr *instr, + unsigned *dest) +{ + nir_intrinsic_instr *intr; + nir_dest *dst = NULL; + nir_tex_instr *tex; + int sysval = -1; + + switch (instr->type) { + case nir_instr_type_intrinsic: + intr = nir_instr_as_intrinsic(instr); + sysval = midgard_nir_sysval_for_intrinsic(intr); + dst = &intr->dest; + break; + case nir_instr_type_tex: + tex = nir_instr_as_tex(instr); + if (tex->op != 
nir_texop_txs) + break; + + sysval = PAN_SYSVAL(TEXTURE_SIZE, + PAN_TXS_SYSVAL_ID(tex->texture_index, + nir_tex_instr_dest_size(tex) - + (tex->is_array ? 1 : 0), + tex->is_array)); + dst = &tex->dest; + break; + default: + break; + } + + if (dest && dst) + *dest = nir_dest_index(ctx, dst); + + return sysval; +} + +static void +midgard_nir_assign_sysval_body(compiler_context *ctx, nir_instr *instr) +{ + int sysval; + + sysval = sysval_for_instr(ctx, instr, NULL); + if (sysval < 0) + return; + + /* We have a sysval load; check if it's already been assigned */ + + if (_mesa_hash_table_u64_search(ctx->sysval_to_id, sysval)) + return; + + /* It hasn't -- so assign it now! */ + + unsigned id = ctx->sysval_count++; + _mesa_hash_table_u64_insert(ctx->sysval_to_id, sysval, (void *) ((uintptr_t) id + 1)); + ctx->sysvals[id] = sysval; +} + +static void +midgard_nir_assign_sysvals(compiler_context *ctx, nir_shader *shader) +{ + ctx->sysval_count = 0; + + nir_foreach_function(function, shader) { + if (!function->impl) continue; + + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + midgard_nir_assign_sysval_body(ctx, instr); + } + } + } +} + +static bool +midgard_nir_lower_fdot2(nir_shader *shader) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (!function->impl) continue; + + nir_builder _b; + nir_builder *b = &_b; + nir_builder_init(b, function->impl); + + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_alu) continue; + + nir_alu_instr *alu = nir_instr_as_alu(instr); + midgard_nir_lower_fdot2_body(b, alu); + + progress |= true; + } + } + + nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance); + + } + + return progress; +} + +/* Flushes undefined values to zero */ + +static void +optimise_nir(nir_shader *nir) +{ + bool progress; + unsigned lower_flrp = + (nir->options->lower_flrp16 ? 16 : 0) | + (nir->options->lower_flrp32 ? 32 : 0) | + (nir->options->lower_flrp64 ? 64 : 0); + + NIR_PASS(progress, nir, nir_lower_regs_to_ssa); + NIR_PASS(progress, nir, midgard_nir_lower_fdot2); + NIR_PASS(progress, nir, nir_lower_idiv); + + nir_lower_tex_options lower_tex_1st_pass_options = { + .lower_rect = true, + .lower_txp = ~0 + }; + + nir_lower_tex_options lower_tex_2nd_pass_options = { + .lower_txs_lod = true, + }; + + NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_1st_pass_options); + NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_2nd_pass_options); + + do { + progress = false; + + NIR_PASS(progress, nir, nir_lower_var_copies); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_dead_cf); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); + + if (lower_flrp != 0) { + bool lower_flrp_progress = false; + NIR_PASS(lower_flrp_progress, + nir, + nir_lower_flrp, + lower_flrp, + false /* always_precise */, + nir->options->lower_ffma); + if (lower_flrp_progress) { + NIR_PASS(progress, nir, + nir_opt_constant_folding); + progress = true; + } + + /* Nothing should rematerialize any flrps, so we only + * need to do this lowering once. 
+ */ + lower_flrp = 0; + } + + NIR_PASS(progress, nir, nir_opt_undef); + NIR_PASS(progress, nir, nir_undef_to_zero); + + NIR_PASS(progress, nir, nir_opt_loop_unroll, + nir_var_shader_in | + nir_var_shader_out | + nir_var_function_temp); + + NIR_PASS(progress, nir, nir_opt_vectorize); + } while (progress); + + /* Must be run at the end to prevent creation of fsin/fcos ops */ + NIR_PASS(progress, nir, midgard_nir_scale_trig); + + do { + progress = false; + + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_copy_prop); + } while (progress); + + NIR_PASS(progress, nir, nir_opt_algebraic_late); + + /* We implement booleans as 32-bit 0/~0 */ + NIR_PASS(progress, nir, nir_lower_bool_to_int32); + + /* Now that booleans are lowered, we can run out late opts */ + NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late); + + /* Lower mods for float ops only. Integer ops don't support modifiers + * (saturate doesn't make sense on integers, neg/abs require dedicated + * instructions) */ + + NIR_PASS(progress, nir, nir_lower_to_source_mods, nir_lower_float_source_mods); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + + /* Take us out of SSA */ + NIR_PASS(progress, nir, nir_lower_locals_to_regs); + NIR_PASS(progress, nir, nir_convert_from_ssa, true); + + /* We are a vector architecture; write combine where possible */ + NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest); + NIR_PASS(progress, nir, nir_lower_vec_to_movs); + + NIR_PASS(progress, nir, nir_opt_dce); +} + +/* Front-half of aliasing the SSA slots, merely by inserting the flag in the + * appropriate hash table. Intentional off-by-one to avoid confusing NULL with + * r0. See the comments in compiler_context */ + +static void +alias_ssa(compiler_context *ctx, int dest, int src) +{ + _mesa_hash_table_u64_insert(ctx->ssa_to_alias, dest + 1, (void *) ((uintptr_t) src + 1)); + _mesa_set_add(ctx->leftover_ssa_to_alias, (void *) (uintptr_t) (dest + 1)); +} + +/* ...or undo it, after which the original index will be used (dummy move should be emitted alongside this) */ + +static void +unalias_ssa(compiler_context *ctx, int dest) +{ + _mesa_hash_table_u64_remove(ctx->ssa_to_alias, dest + 1); + /* TODO: Remove from leftover or no? 
*/ +} + +/* Do not actually emit a load; instead, cache the constant for inlining */ + +static void +emit_load_const(compiler_context *ctx, nir_load_const_instr *instr) +{ + nir_ssa_def def = instr->def; + + float *v = rzalloc_array(NULL, float, 4); + nir_const_load_to_arr(v, instr, f32); + _mesa_hash_table_u64_insert(ctx->ssa_constants, def.index + 1, v); +} + +static unsigned +nir_src_index(compiler_context *ctx, nir_src *src) +{ + if (src->is_ssa) + return src->ssa->index; + else { + assert(!src->reg.indirect); + return ctx->func->impl->ssa_alloc + src->reg.reg->index; + } +} + +static unsigned +nir_alu_src_index(compiler_context *ctx, nir_alu_src *src) +{ + return nir_src_index(ctx, &src->src); +} + +static bool +nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components) +{ + unsigned comp = src->swizzle[0]; + + for (unsigned c = 1; c < nr_components; ++c) { + if (src->swizzle[c] != comp) + return true; + } + + return false; +} + +/* Midgard puts scalar conditionals in r31.w; move an arbitrary source (the + * output of a conditional test) into that register */ + +static void +emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned component) +{ + int condition = nir_src_index(ctx, src); + + /* Source to swizzle the desired component into w */ + + const midgard_vector_alu_src alu_src = { + .swizzle = SWIZZLE(component, component, component, component), + }; + + /* There is no boolean move instruction. Instead, we simulate a move by + * ANDing the condition with itself to get it into r31.w */ + + midgard_instruction ins = { + .type = TAG_ALU_4, + + /* We need to set the conditional as close as possible */ + .precede_break = true, + .unit = for_branch ? UNIT_SMUL : UNIT_SADD, + .mask = 1 << COMPONENT_W, + + .ssa_args = { + .src0 = condition, + .src1 = condition, + .dest = SSA_FIXED_REGISTER(31), + }, + + .alu = { + .op = midgard_alu_op_iand, + .outmod = midgard_outmod_int_wrap, + .reg_mode = midgard_reg_mode_32, + .dest_override = midgard_dest_override_none, + .src1 = vector_alu_srco_unsigned(alu_src), + .src2 = vector_alu_srco_unsigned(alu_src) + }, + }; + + emit_mir_instruction(ctx, ins); +} + +/* Or, for mixed conditions (with csel_v), here's a vector version using all of + * r31 instead */ + +static void +emit_condition_mixed(compiler_context *ctx, nir_alu_src *src, unsigned nr_comp) +{ + int condition = nir_src_index(ctx, &src->src); + + /* Source to swizzle the desired component into w */ + + const midgard_vector_alu_src alu_src = { + .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle), + }; + + /* There is no boolean move instruction. Instead, we simulate a move by + * ANDing the condition with itself to get it into r31.w */ + + midgard_instruction ins = { + .type = TAG_ALU_4, + .precede_break = true, + .mask = mask_of(nr_comp), + .ssa_args = { + .src0 = condition, + .src1 = condition, + .dest = SSA_FIXED_REGISTER(31), + }, + .alu = { + .op = midgard_alu_op_iand, + .outmod = midgard_outmod_int_wrap, + .reg_mode = midgard_reg_mode_32, + .dest_override = midgard_dest_override_none, + .src1 = vector_alu_srco_unsigned(alu_src), + .src2 = vector_alu_srco_unsigned(alu_src) + }, + }; + + emit_mir_instruction(ctx, ins); +} + + + +/* Likewise, indirect offsets are put in r27.w. 
TODO: Allow componentwise + * pinning to eliminate this move in all known cases */ + +static void +emit_indirect_offset(compiler_context *ctx, nir_src *src) +{ + int offset = nir_src_index(ctx, src); + + midgard_instruction ins = { + .type = TAG_ALU_4, + .mask = 1 << COMPONENT_W, + .ssa_args = { + .src0 = SSA_UNUSED_1, + .src1 = offset, + .dest = SSA_FIXED_REGISTER(REGISTER_OFFSET), + }, + .alu = { + .op = midgard_alu_op_imov, + .outmod = midgard_outmod_int_wrap, + .reg_mode = midgard_reg_mode_32, + .dest_override = midgard_dest_override_none, + .src1 = vector_alu_srco_unsigned(zero_alu_src), + .src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx) + }, + }; + + emit_mir_instruction(ctx, ins); +} + +#define ALU_CASE(nir, _op) \ + case nir_op_##nir: \ + op = midgard_alu_op_##_op; \ + assert(src_bitsize == dst_bitsize); \ + break; + +#define ALU_CASE_BCAST(nir, _op, count) \ + case nir_op_##nir: \ + op = midgard_alu_op_##_op; \ + broadcast_swizzle = count; \ + assert(src_bitsize == dst_bitsize); \ + break; +static bool +nir_is_fzero_constant(nir_src src) +{ + if (!nir_src_is_const(src)) + return false; + + for (unsigned c = 0; c < nir_src_num_components(src); ++c) { + if (nir_src_comp_as_float(src, c) != 0.0) + return false; + } + + return true; +} + +/* Analyze the sizes of the inputs to determine which reg mode. Ops needed + * special treatment override this anyway. */ + +static midgard_reg_mode +reg_mode_for_nir(nir_alu_instr *instr) +{ + unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); + + switch (src_bitsize) { + case 8: + return midgard_reg_mode_8; + case 16: + return midgard_reg_mode_16; + case 32: + return midgard_reg_mode_32; + case 64: + return midgard_reg_mode_64; + default: + unreachable("Invalid bit size"); + } +} + +static void +emit_alu(compiler_context *ctx, nir_alu_instr *instr) +{ + bool is_ssa = instr->dest.dest.is_ssa; + + unsigned dest = nir_dest_index(ctx, &instr->dest.dest); + unsigned nr_components = nir_dest_num_components(instr->dest.dest); + unsigned nr_inputs = nir_op_infos[instr->op].num_inputs; + + /* Most Midgard ALU ops have a 1:1 correspondance to NIR ops; these are + * supported. A few do not and are commented for now. Also, there are a + * number of NIR ops which Midgard does not support and need to be + * lowered, also TODO. This switch block emits the opcode and calling + * convention of the Midgard instruction; actual packing is done in + * emit_alu below */ + + unsigned op; + + /* Number of components valid to check for the instruction (the rest + * will be forced to the last), or 0 to use as-is. Relevant as + * ball-type instructions have a channel count in NIR but are all vec4 + * in Midgard */ + + unsigned broadcast_swizzle = 0; + + /* What register mode should we operate in? */ + midgard_reg_mode reg_mode = + reg_mode_for_nir(instr); + + /* Do we need a destination override? Used for inline + * type conversion */ + + midgard_dest_override dest_override = + midgard_dest_override_none; + + /* Should we use a smaller respective source and sign-extend? 
*/ + + bool half_1 = false, sext_1 = false; + bool half_2 = false, sext_2 = false; + + unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); + unsigned dst_bitsize = nir_dest_bit_size(instr->dest.dest); + + switch (instr->op) { + ALU_CASE(fadd, fadd); + ALU_CASE(fmul, fmul); + ALU_CASE(fmin, fmin); + ALU_CASE(fmax, fmax); + ALU_CASE(imin, imin); + ALU_CASE(imax, imax); + ALU_CASE(umin, umin); + ALU_CASE(umax, umax); + ALU_CASE(ffloor, ffloor); + ALU_CASE(fround_even, froundeven); + ALU_CASE(ftrunc, ftrunc); + ALU_CASE(fceil, fceil); + ALU_CASE(fdot3, fdot3); + ALU_CASE(fdot4, fdot4); + ALU_CASE(iadd, iadd); + ALU_CASE(isub, isub); + ALU_CASE(imul, imul); + + /* Zero shoved as second-arg */ + ALU_CASE(iabs, iabsdiff); + + ALU_CASE(mov, imov); + + ALU_CASE(feq32, feq); + ALU_CASE(fne32, fne); + ALU_CASE(flt32, flt); + ALU_CASE(ieq32, ieq); + ALU_CASE(ine32, ine); + ALU_CASE(ilt32, ilt); + ALU_CASE(ult32, ult); + + /* We don't have a native b2f32 instruction. Instead, like many + * GPUs, we exploit booleans as 0/~0 for false/true, and + * correspondingly AND + * by 1.0 to do the type conversion. For the moment, prime us + * to emit: + * + * iand [whatever], #0 + * + * At the end of emit_alu (as MIR), we'll fix-up the constant + */ + + ALU_CASE(b2f32, iand); + ALU_CASE(b2i32, iand); + + /* Likewise, we don't have a dedicated f2b32 instruction, but + * we can do a "not equal to 0.0" test. */ + + ALU_CASE(f2b32, fne); + ALU_CASE(i2b32, ine); + + ALU_CASE(frcp, frcp); + ALU_CASE(frsq, frsqrt); + ALU_CASE(fsqrt, fsqrt); + ALU_CASE(fexp2, fexp2); + ALU_CASE(flog2, flog2); + + ALU_CASE(f2i32, f2i_rtz); + ALU_CASE(f2u32, f2u_rtz); + ALU_CASE(i2f32, i2f_rtz); + ALU_CASE(u2f32, u2f_rtz); + + ALU_CASE(f2i16, f2i_rtz); + ALU_CASE(f2u16, f2u_rtz); + ALU_CASE(i2f16, i2f_rtz); + ALU_CASE(u2f16, u2f_rtz); + + ALU_CASE(fsin, fsin); + ALU_CASE(fcos, fcos); + + /* Second op implicit #0 */ + ALU_CASE(inot, inor); + ALU_CASE(iand, iand); + ALU_CASE(ior, ior); + ALU_CASE(ixor, ixor); + ALU_CASE(ishl, ishl); + ALU_CASE(ishr, iasr); + ALU_CASE(ushr, ilsr); + + ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2); + ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3); + ALU_CASE(b32all_fequal4, fball_eq); + + ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2); + ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3); + ALU_CASE(b32any_fnequal4, fbany_neq); + + ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2); + ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3); + ALU_CASE(b32all_iequal4, iball_eq); + + ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2); + ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3); + ALU_CASE(b32any_inequal4, ibany_neq); + + /* Source mods will be shoved in later */ + ALU_CASE(fabs, fmov); + ALU_CASE(fneg, fmov); + ALU_CASE(fsat, fmov); + + /* For size conversion, we use a move. Ideally though we would squash + * these ops together; maybe that has to happen after in NIR as part of + * propagation...? An earlier algebraic pass ensured we step down by + * only / exactly one size. 
If stepping down, we use a dest override to + * reduce the size; if stepping up, we use a larger-sized move with a + * half source and a sign/zero-extension modifier */ + + case nir_op_i2i8: + case nir_op_i2i16: + case nir_op_i2i32: + /* If we end up upscale, we'll need a sign-extend on the + * operand (the second argument) */ + + sext_2 = true; + case nir_op_u2u8: + case nir_op_u2u16: + case nir_op_u2u32: { + op = midgard_alu_op_imov; + + if (dst_bitsize == (src_bitsize * 2)) { + /* Converting up */ + half_2 = true; + + /* Use a greater register mode */ + reg_mode++; + } else if (src_bitsize == (dst_bitsize * 2)) { + /* Converting down */ + dest_override = midgard_dest_override_lower; + } + + break; + } + + case nir_op_f2f16: { + assert(src_bitsize == 32); + + op = midgard_alu_op_fmov; + dest_override = midgard_dest_override_lower; + break; + } + + case nir_op_f2f32: { + assert(src_bitsize == 16); + + op = midgard_alu_op_fmov; + half_2 = true; + reg_mode++; + break; + } + + + /* For greater-or-equal, we lower to less-or-equal and flip the + * arguments */ + + case nir_op_fge: + case nir_op_fge32: + case nir_op_ige32: + case nir_op_uge32: { + op = + instr->op == nir_op_fge ? midgard_alu_op_fle : + instr->op == nir_op_fge32 ? midgard_alu_op_fle : + instr->op == nir_op_ige32 ? midgard_alu_op_ile : + instr->op == nir_op_uge32 ? midgard_alu_op_ule : + 0; + + /* Swap via temporary */ + nir_alu_src temp = instr->src[1]; + instr->src[1] = instr->src[0]; + instr->src[0] = temp; + + break; + } + + case nir_op_b32csel: { + /* Midgard features both fcsel and icsel, depending on + * the type of the arguments/output. However, as long + * as we're careful we can _always_ use icsel and + * _never_ need fcsel, since the latter does additional + * floating-point-specific processing whereas the + * former just moves bits on the wire. It's not obvious + * why these are separate opcodes, save for the ability + * to do things like sat/pos/abs/neg for free */ + + bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components); + op = mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel; + + /* csel works as a two-arg in Midgard, since the condition is hardcoded in r31.w */ + nr_inputs = 2; + + /* Emit the condition into r31 */ + + if (mixed) + emit_condition_mixed(ctx, &instr->src[0], nr_components); + else + emit_condition(ctx, &instr->src[0].src, false, instr->src[0].swizzle[0]); + + /* The condition is the first argument; move the other + * arguments up one to be a binary instruction for + * Midgard */ + + memmove(instr->src, instr->src + 1, 2 * sizeof(nir_alu_src)); + break; + } + + default: + DBG("Unhandled ALU op %s\n", nir_op_infos[instr->op].name); + assert(0); + return; + } + + /* Midgard can perform certain modifiers on output of an ALU op */ + unsigned outmod; + + if (midgard_is_integer_out_op(op)) { + outmod = midgard_outmod_int_wrap; + } else { + bool sat = instr->dest.saturate || instr->op == nir_op_fsat; + outmod = sat ? 
midgard_outmod_sat : midgard_outmod_none; + } + + /* fmax(a, 0.0) can turn into a .pos modifier as an optimization */ + + if (instr->op == nir_op_fmax) { + if (nir_is_fzero_constant(instr->src[0].src)) { + op = midgard_alu_op_fmov; + nr_inputs = 1; + outmod = midgard_outmod_pos; + instr->src[0] = instr->src[1]; + } else if (nir_is_fzero_constant(instr->src[1].src)) { + op = midgard_alu_op_fmov; + nr_inputs = 1; + outmod = midgard_outmod_pos; + } + } + + /* Fetch unit, quirks, etc information */ + unsigned opcode_props = alu_opcode_props[op].props; + bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24; + + /* src0 will always exist afaik, but src1 will not for 1-argument + * instructions. The latter can only be fetched if the instruction + * needs it, or else we may segfault. */ + + unsigned src0 = nir_alu_src_index(ctx, &instr->src[0]); + unsigned src1 = nr_inputs == 2 ? nir_alu_src_index(ctx, &instr->src[1]) : SSA_UNUSED_0; + + /* Rather than use the instruction generation helpers, we do it + * ourselves here to avoid the mess */ + + midgard_instruction ins = { + .type = TAG_ALU_4, + .ssa_args = { + .src0 = quirk_flipped_r24 ? SSA_UNUSED_1 : src0, + .src1 = quirk_flipped_r24 ? src0 : src1, + .dest = dest, + } + }; + + nir_alu_src *nirmods[2] = { NULL }; + + if (nr_inputs == 2) { + nirmods[0] = &instr->src[0]; + nirmods[1] = &instr->src[1]; + } else if (nr_inputs == 1) { + nirmods[quirk_flipped_r24] = &instr->src[0]; + } else { + assert(0); + } + + /* These were lowered to a move, so apply the corresponding mod */ + + if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) { + nir_alu_src *s = nirmods[quirk_flipped_r24]; + + if (instr->op == nir_op_fneg) + s->negate = !s->negate; + + if (instr->op == nir_op_fabs) + s->abs = !s->abs; + } + + bool is_int = midgard_is_integer_op(op); + + ins.mask = mask_of(nr_components); + + midgard_vector_alu alu = { + .op = op, + .reg_mode = reg_mode, + .dest_override = dest_override, + .outmod = outmod, + + .src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int, broadcast_swizzle, half_1, sext_1)), + .src2 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[1], is_int, broadcast_swizzle, half_2, sext_2)), + }; + + /* Apply writemask if non-SSA, keeping in mind that we can't write to components that don't exist */ + + if (!is_ssa) + ins.mask &= instr->dest.write_mask; + + ins.alu = alu; + + /* Late fixup for emulated instructions */ + + if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) { + /* Presently, our second argument is an inline #0 constant. 
+ * Switch over to an embedded 1.0 constant (that can't fit + * inline, since we're 32-bit, not 16-bit like the inline + * constants) */ + + ins.ssa_args.inline_constant = false; + ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.has_constants = true; + + if (instr->op == nir_op_b2f32) { + ins.constants[0] = 1.0f; + } else { + /* Type pun it into place */ + uint32_t one = 0x1; + memcpy(&ins.constants[0], &one, sizeof(uint32_t)); + } + + ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx); + } else if (nr_inputs == 1 && !quirk_flipped_r24) { + /* Lots of instructions need a 0 plonked in */ + ins.ssa_args.inline_constant = false; + ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.has_constants = true; + ins.constants[0] = 0.0f; + ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx); + } else if (instr->op == nir_op_inot) { + /* ~b = ~(b & b), so duplicate the source */ + ins.ssa_args.src1 = ins.ssa_args.src0; + ins.alu.src2 = ins.alu.src1; + } + + if ((opcode_props & UNITS_ALL) == UNIT_VLUT) { + /* To avoid duplicating the lookup tables (probably), true LUT + * instructions can only operate as if they were scalars. Lower + * them here by changing the component. */ + + uint8_t original_swizzle[4]; + memcpy(original_swizzle, nirmods[0]->swizzle, sizeof(nirmods[0]->swizzle)); + unsigned orig_mask = ins.mask; + + for (int i = 0; i < nr_components; ++i) { + /* Mask the associated component, dropping the + * instruction if needed */ + + ins.mask = 1 << i; + ins.mask &= orig_mask; + + if (!ins.mask) + continue; + + for (int j = 0; j < 4; ++j) + nirmods[0]->swizzle[j] = original_swizzle[i]; /* Pull from the correct component */ + + ins.alu.src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int, broadcast_swizzle, half_1, false)); + emit_mir_instruction(ctx, ins); + } + } else { + emit_mir_instruction(ctx, ins); + } +} + +#undef ALU_CASE + +/* Uniforms and UBOs use a shared code path, as uniforms are just (slightly + * optimized) versions of UBO #0 */ + +static void +emit_ubo_read( + compiler_context *ctx, + unsigned dest, + unsigned offset, + nir_src *indirect_offset, + unsigned index) +{ + /* TODO: half-floats */ + + if (!indirect_offset && offset < ctx->uniform_cutoff && index == 0) { + /* Fast path: For the first 16 uniforms, direct accesses are + * 0-cycle, since they're just a register fetch in the usual + * case. So, we alias the registers while we're still in + * SSA-space */ + + int reg_slot = 23 - offset; + alias_ssa(ctx, dest, SSA_FIXED_REGISTER(reg_slot)); + } else { + /* Otherwise, read from the 'special' UBO to access + * higher-indexed uniforms, at a performance cost. More + * generally, we're emitting a UBO read instruction. */ + + midgard_instruction ins = m_ld_uniform_32(dest, offset); + + /* TODO: Don't split */ + ins.load_store.varying_parameters = (offset & 7) << 7; + ins.load_store.address = offset >> 3; + + if (indirect_offset) { + emit_indirect_offset(ctx, indirect_offset); + ins.load_store.unknown = 0x8700 | index; /* xxx: what is this? */ + } else { + ins.load_store.unknown = 0x1E00 | index; /* xxx: what is this? */ + } + + /* TODO respect index */ + + emit_mir_instruction(ctx, ins); + } +} + +static void +emit_varying_read( + compiler_context *ctx, + unsigned dest, unsigned offset, + unsigned nr_comp, unsigned component, + nir_src *indirect_offset, nir_alu_type type) +{ + /* XXX: Half-floats? 
*/ + /* TODO: swizzle, mask */ + + midgard_instruction ins = m_ld_vary_32(dest, offset); + ins.mask = mask_of(nr_comp); + ins.load_store.swizzle = SWIZZLE_XYZW >> (2 * component); + + midgard_varying_parameter p = { + .is_varying = 1, + .interpolation = midgard_interp_default, + .flat = /*var->data.interpolation == INTERP_MODE_FLAT*/ 0 + }; + + unsigned u; + memcpy(&u, &p, sizeof(p)); + ins.load_store.varying_parameters = u; + + if (indirect_offset) { + /* We need to add in the dynamic index, moved to r27.w */ + emit_indirect_offset(ctx, indirect_offset); + ins.load_store.unknown = 0x79e; /* xxx: what is this? */ + } else { + /* Just a direct load */ + ins.load_store.unknown = 0x1e9e; /* xxx: what is this? */ + } + + /* Use the type appropriate load */ + switch (type) { + case nir_type_uint: + case nir_type_bool: + ins.load_store.op = midgard_op_ld_vary_32u; + break; + case nir_type_int: + ins.load_store.op = midgard_op_ld_vary_32i; + break; + case nir_type_float: + ins.load_store.op = midgard_op_ld_vary_32; + break; + default: + unreachable("Attempted to load unknown type"); + break; + } + + emit_mir_instruction(ctx, ins); +} + +static void +emit_sysval_read(compiler_context *ctx, nir_instr *instr) +{ + unsigned dest; + /* Figure out which uniform this is */ + int sysval = sysval_for_instr(ctx, instr, &dest); + void *val = _mesa_hash_table_u64_search(ctx->sysval_to_id, sysval); + + /* Sysvals are prefix uniforms */ + unsigned uniform = ((uintptr_t) val) - 1; + + /* Emit the read itself -- this is never indirect */ + emit_ubo_read(ctx, dest, uniform, NULL, 0); +} + +static void +emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) +{ + unsigned offset = 0, reg; + + switch (instr->intrinsic) { + case nir_intrinsic_discard_if: + emit_condition(ctx, &instr->src[0], true, COMPONENT_X); + + /* fallthrough */ + + case nir_intrinsic_discard: { + bool conditional = instr->intrinsic == nir_intrinsic_discard_if; + struct midgard_instruction discard = v_branch(conditional, false); + discard.branch.target_type = TARGET_DISCARD; + emit_mir_instruction(ctx, discard); + + ctx->can_discard = true; + break; + } + + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_input: { + bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform; + bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; + + /* Get the base type of the intrinsic */ + /* TODO: Infer type? Does it matter? */ + nir_alu_type t = + is_ubo ? nir_type_uint : nir_intrinsic_type(instr); + t = nir_alu_type_get_base_type(t); + + if (!is_ubo) { + offset = nir_intrinsic_base(instr); + } + + unsigned nr_comp = nir_intrinsic_dest_components(instr); + + nir_src *src_offset = nir_get_io_offset_src(instr); + + bool direct = nir_src_is_const(*src_offset); + + if (direct) + offset += nir_src_as_uint(*src_offset); + + /* We may need to apply a fractional offset */ + int component = instr->intrinsic == nir_intrinsic_load_input ? + nir_intrinsic_component(instr) : 0; + reg = nir_dest_index(ctx, &instr->dest); + + if (is_uniform && !ctx->is_blend) { + emit_ubo_read(ctx, reg, ctx->sysval_count + offset, !direct ? &instr->src[0] : NULL, 0); + } else if (is_ubo) { + nir_src index = instr->src[0]; + + /* We don't yet support indirect UBOs. For indirect + * block numbers (if that's possible), we don't know + * enough about the hardware yet. 
For indirect sources, + * we know what we need but we need to add some NIR + * support for lowering correctly with respect to + * 128-bit reads */ + + assert(nir_src_is_const(index)); + assert(nir_src_is_const(*src_offset)); + + /* TODO: Alignment */ + assert((offset & 0xF) == 0); + + uint32_t uindex = nir_src_as_uint(index) + 1; + emit_ubo_read(ctx, reg, offset / 16, NULL, uindex); + } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_blend) { + emit_varying_read(ctx, reg, offset, nr_comp, component, !direct ? &instr->src[0] : NULL, t); + } else if (ctx->is_blend) { + /* For blend shaders, load the input color, which is + * preloaded to r0 */ + + midgard_instruction move = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(0)); + emit_mir_instruction(ctx, move); + } else if (ctx->stage == MESA_SHADER_VERTEX) { + midgard_instruction ins = m_ld_attr_32(reg, offset); + ins.load_store.unknown = 0x1E1E; /* XXX: What is this? */ + ins.mask = mask_of(nr_comp); + + /* Use the type appropriate load */ + switch (t) { + case nir_type_uint: + case nir_type_bool: + ins.load_store.op = midgard_op_ld_attr_32u; + break; + case nir_type_int: + ins.load_store.op = midgard_op_ld_attr_32i; + break; + case nir_type_float: + ins.load_store.op = midgard_op_ld_attr_32; + break; + default: + unreachable("Attempted to load unknown type"); + break; + } + + emit_mir_instruction(ctx, ins); + } else { + DBG("Unknown load\n"); + assert(0); + } + + break; + } + + /* Reads 128-bit value raw off the tilebuffer during blending, tasty */ + + case nir_intrinsic_load_raw_output_pan: + reg = nir_dest_index(ctx, &instr->dest); + assert(ctx->is_blend); + + midgard_instruction ins = m_ld_color_buffer_8(reg, 0); + emit_mir_instruction(ctx, ins); + break; + + case nir_intrinsic_load_blend_const_color_rgba: { + assert(ctx->is_blend); + reg = nir_dest_index(ctx, &instr->dest); + + /* Blend constants are embedded directly in the shader and + * patched in, so we use some magic routing */ + + midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, reg); + ins.has_constants = true; + ins.has_blend_constant = true; + emit_mir_instruction(ctx, ins); + break; + } + + case nir_intrinsic_store_output: + assert(nir_src_is_const(instr->src[1]) && "no indirect outputs"); + + offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]); + + reg = nir_src_index(ctx, &instr->src[0]); + + if (ctx->stage == MESA_SHADER_FRAGMENT) { + /* gl_FragColor is not emitted with load/store + * instructions. Instead, it gets plonked into + * r0 at the end of the shader and we do the + * framebuffer writeout dance. TODO: Defer + * writes */ + + midgard_instruction move = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(0)); + emit_mir_instruction(ctx, move); + + /* Save the index we're writing to for later reference + * in the epilogue */ + + ctx->fragment_output = reg; + } else if (ctx->stage == MESA_SHADER_VERTEX) { + /* Varyings are written into one of two special + * varying register, r26 or r27. The register itself is + * selected as the register in the st_vary instruction, + * minus the base of 26. E.g. write into r27 and then + * call st_vary(1) */ + + midgard_instruction ins = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(26)); + emit_mir_instruction(ctx, ins); + + /* We should have been vectorized, though we don't + * currently check that st_vary is emitted only once + * per slot (this is relevant, since there's not a mask + * parameter available on the store [set to 0 by the + * blob]). 
We do respect the component by adjusting the + * swizzle. */ + + unsigned component = nir_intrinsic_component(instr); + + midgard_instruction st = m_st_vary_32(SSA_FIXED_REGISTER(0), offset); + st.load_store.unknown = 0x1E9E; /* XXX: What is this? */ + st.load_store.swizzle = SWIZZLE_XYZW << (2*component); + emit_mir_instruction(ctx, st); + } else { + DBG("Unknown store\n"); + assert(0); + } + + break; + + /* Special case of store_output for lowered blend shaders */ + case nir_intrinsic_store_raw_output_pan: + assert (ctx->stage == MESA_SHADER_FRAGMENT); + reg = nir_src_index(ctx, &instr->src[0]); + + midgard_instruction move = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(0)); + emit_mir_instruction(ctx, move); + ctx->fragment_output = reg; + + break; + + case nir_intrinsic_load_alpha_ref_float: + assert(instr->dest.is_ssa); + + float ref_value = ctx->alpha_ref; + + float *v = ralloc_array(NULL, float, 4); + memcpy(v, &ref_value, sizeof(float)); + _mesa_hash_table_u64_insert(ctx->ssa_constants, instr->dest.ssa.index + 1, v); + break; + + case nir_intrinsic_load_viewport_scale: + case nir_intrinsic_load_viewport_offset: + emit_sysval_read(ctx, &instr->instr); + break; + + default: + printf ("Unhandled intrinsic\n"); + assert(0); + break; + } +} + +static unsigned +midgard_tex_format(enum glsl_sampler_dim dim) +{ + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_BUF: + return MALI_TEX_1D; + + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_EXTERNAL: + return MALI_TEX_2D; + + case GLSL_SAMPLER_DIM_3D: + return MALI_TEX_3D; + + case GLSL_SAMPLER_DIM_CUBE: + return MALI_TEX_CUBE; + + default: + DBG("Unknown sampler dim type\n"); + assert(0); + return 0; + } +} + +/* Tries to attach an explicit LOD / bias as a constant. Returns whether this + * was successful */ + +static bool +pan_attach_constant_bias( + compiler_context *ctx, + nir_src lod, + midgard_texture_word *word) +{ + /* To attach as constant, it has to *be* constant */ + + if (!nir_src_is_const(lod)) + return false; + + float f = nir_src_as_float(lod); + + /* Break into fixed-point */ + signed lod_int = f; + float lod_frac = f - lod_int; + + /* Carry over negative fractions */ + if (lod_frac < 0.0) { + lod_int--; + lod_frac += 1.0; + } + + /* Encode */ + word->bias = float_to_ubyte(lod_frac); + word->bias_int = lod_int; + + return true; +} + +static enum mali_sampler_type +midgard_sampler_type(nir_alu_type t) { + switch (nir_alu_type_get_base_type(t)) + { + case nir_type_float: + return MALI_SAMPLER_FLOAT; + case nir_type_int: + return MALI_SAMPLER_SIGNED; + case nir_type_uint: + return MALI_SAMPLER_UNSIGNED; + default: + unreachable("Unknown sampler type"); + } +} + +static void +emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, + unsigned midgard_texop) +{ + /* TODO */ + //assert (!instr->sampler); + //assert (!instr->texture_array_size); + + /* Allocate registers via a round robin scheme to alternate between the two registers */ + int reg = ctx->texture_op_count & 1; + int in_reg = reg, out_reg = reg; + + /* Make room for the reg */ + + if (ctx->texture_index[reg] > -1) + unalias_ssa(ctx, ctx->texture_index[reg]); + + int texture_index = instr->texture_index; + int sampler_index = texture_index; + + /* No helper to build texture words -- we do it all here */ + midgard_instruction ins = { + .type = TAG_TEXTURE_4, + .mask = 0xF, + .texture = { + .op = midgard_texop, + .format = midgard_tex_format(instr->sampler_dim), + .texture_handle = texture_index, + .sampler_handle = sampler_index, + + /* TODO: 
Regalloc it in */ + .swizzle = SWIZZLE_XYZW, + + /* TODO: half */ + .in_reg_full = 1, + .out_full = 1, + + .sampler_type = midgard_sampler_type(instr->dest_type), + } + }; + + for (unsigned i = 0; i < instr->num_srcs; ++i) { + int reg = SSA_FIXED_REGISTER(REGISTER_TEXTURE_BASE + in_reg); + int index = nir_src_index(ctx, &instr->src[i].src); + int nr_comp = nir_src_num_components(instr->src[i].src); + midgard_vector_alu_src alu_src = blank_alu_src; + + switch (instr->src[i].src_type) { + case nir_tex_src_coord: { + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { + /* texelFetch is undefined on samplerCube */ + assert(midgard_texop != TEXTURE_OP_TEXEL_FETCH); + + /* For cubemaps, we need to load coords into + * special r27, and then use a special ld/st op + * to select the face and copy the xy into the + * texture register */ + + alu_src.swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_X); + + midgard_instruction move = v_mov(index, alu_src, SSA_FIXED_REGISTER(27)); + emit_mir_instruction(ctx, move); + + midgard_instruction st = m_st_cubemap_coords(reg, 0); + st.load_store.unknown = 0x24; /* XXX: What is this? */ + st.mask = 0x3; /* xy */ + st.load_store.swizzle = alu_src.swizzle; + emit_mir_instruction(ctx, st); + + ins.texture.in_reg_swizzle = swizzle_of(2); + } else { + ins.texture.in_reg_swizzle = alu_src.swizzle = swizzle_of(nr_comp); + + midgard_instruction mov = v_mov(index, alu_src, reg); + mov.mask = mask_of(nr_comp); + emit_mir_instruction(ctx, mov); + + if (midgard_texop == TEXTURE_OP_TEXEL_FETCH) { + /* Texel fetch opcodes care about the + * values of z and w, so we actually + * need to spill into a second register + * for a texel fetch with register bias + * (for non-2D). TODO: Implement that + */ + + assert(instr->sampler_dim == GLSL_SAMPLER_DIM_2D); + + midgard_instruction zero = v_mov(index, alu_src, reg); + zero.ssa_args.inline_constant = true; + zero.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + zero.has_constants = true; + zero.mask = ~mov.mask; + emit_mir_instruction(ctx, zero); + + ins.texture.in_reg_swizzle = SWIZZLE_XYZZ; + } else { + /* Non-texel fetch doesn't need that + * nonsense. However we do use the Z + * for array indexing */ + bool is_3d = instr->sampler_dim == GLSL_SAMPLER_DIM_3D; + ins.texture.in_reg_swizzle = is_3d ? SWIZZLE_XYZZ : SWIZZLE_XYXZ; + } + } + + break; + } + + case nir_tex_src_bias: + case nir_tex_src_lod: { + /* Try as a constant if we can */ + + bool is_txf = midgard_texop == TEXTURE_OP_TEXEL_FETCH; + if (!is_txf && pan_attach_constant_bias(ctx, instr->src[i].src, &ins.texture)) + break; + + /* Otherwise we use a register. 
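 * (Worked example of the constant path just tried: a bias of -1.25
 * truncates to lod_int = -1 with lod_frac = -0.25; the negative fraction
 * is carried, giving lod_int = -2 and lod_frac = 0.75, so the texture word
 * gets bias_int = -2 and bias of roughly 191, i.e. 0.75 scaled to a ubyte.)
 *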
To keep RA simple, we + * put the bias/LOD into the w component of the input + * source, which is otherwise in xy */ + + alu_src.swizzle = SWIZZLE_XXXX; + + midgard_instruction mov = v_mov(index, alu_src, reg); + mov.mask = 1 << COMPONENT_W; + emit_mir_instruction(ctx, mov); + + ins.texture.lod_register = true; + + midgard_tex_register_select sel = { + .select = in_reg, + .full = 1, + + /* w */ + .component_lo = 1, + .component_hi = 1 + }; + + uint8_t packed; + memcpy(&packed, &sel, sizeof(packed)); + ins.texture.bias = packed; + + break; + }; + + default: + unreachable("Unknown texture source type\n"); + } + } + + /* Set registers to read and write from the same place */ + ins.texture.in_reg_select = in_reg; + ins.texture.out_reg_select = out_reg; + + emit_mir_instruction(ctx, ins); + + int o_reg = REGISTER_TEXTURE_BASE + out_reg, o_index = nir_dest_index(ctx, &instr->dest); + midgard_instruction ins2 = v_mov(SSA_FIXED_REGISTER(o_reg), blank_alu_src, o_index); + emit_mir_instruction(ctx, ins2); + + /* Used for .cont and .last hinting */ + ctx->texture_op_count++; +} + +static void +emit_tex(compiler_context *ctx, nir_tex_instr *instr) +{ + /* Fixup op, since only textureLod is permitted in VS but NIR can give + * generic tex in some cases (which confuses the hardware) */ + + bool is_vertex = ctx->stage == MESA_SHADER_VERTEX; + + if (is_vertex && instr->op == nir_texop_tex) + instr->op = nir_texop_txl; + + switch (instr->op) { + case nir_texop_tex: + case nir_texop_txb: + emit_texop_native(ctx, instr, TEXTURE_OP_NORMAL); + break; + case nir_texop_txl: + emit_texop_native(ctx, instr, TEXTURE_OP_LOD); + break; + case nir_texop_txf: + emit_texop_native(ctx, instr, TEXTURE_OP_TEXEL_FETCH); + break; + case nir_texop_txs: + emit_sysval_read(ctx, &instr->instr); + break; + default: + unreachable("Unhanlded texture op"); + } +} + +static void +emit_jump(compiler_context *ctx, nir_jump_instr *instr) +{ + switch (instr->type) { + case nir_jump_break: { + /* Emit a branch out of the loop */ + struct midgard_instruction br = v_branch(false, false); + br.branch.target_type = TARGET_BREAK; + br.branch.target_break = ctx->current_loop_depth; + emit_mir_instruction(ctx, br); + + DBG("break..\n"); + break; + } + + default: + DBG("Unknown jump type %d\n", instr->type); + break; + } +} + +static void +emit_instr(compiler_context *ctx, struct nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_load_const: + emit_load_const(ctx, nir_instr_as_load_const(instr)); + break; + + case nir_instr_type_intrinsic: + emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); + break; + + case nir_instr_type_alu: + emit_alu(ctx, nir_instr_as_alu(instr)); + break; + + case nir_instr_type_tex: + emit_tex(ctx, nir_instr_as_tex(instr)); + break; + + case nir_instr_type_jump: + emit_jump(ctx, nir_instr_as_jump(instr)); + break; + + case nir_instr_type_ssa_undef: + /* Spurious */ + break; + + default: + DBG("Unhandled instruction type\n"); + break; + } +} + + +/* ALU instructions can inline or embed constants, which decreases register + * pressure and saves space. 
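 *
 * Concretely, a 1.0f operand can either sit in the bundle's 128-bit
 * embedded constant block (has_constants) or, when it survives the fp16
 * round trip, travel as the 16-bit inline constant in the src2 slot
 * (inline_constant); inline_alu_constants below attaches embedded
 * constants to their users, and embedded_to_inline_constant then demotes
 * the ones that fit.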
*/ + +#define CONDITIONAL_ATTACH(src) { \ + void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src + 1); \ +\ + if (entry) { \ + attach_constants(ctx, alu, entry, alu->ssa_args.src + 1); \ + alu->ssa_args.src = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \ + } \ +} + +static void +inline_alu_constants(compiler_context *ctx) +{ + mir_foreach_instr(ctx, alu) { + /* Other instructions cannot inline constants */ + if (alu->type != TAG_ALU_4) continue; + + /* If there is already a constant here, we can do nothing */ + if (alu->has_constants) continue; + + /* It makes no sense to inline constants on a branch */ + if (alu->compact_branch || alu->prepacked_branch) continue; + + CONDITIONAL_ATTACH(src0); + + if (!alu->has_constants) { + CONDITIONAL_ATTACH(src1) + } else if (!alu->inline_constant) { + /* Corner case: _two_ vec4 constants, for instance with a + * csel. For this case, we can only use a constant + * register for one, we'll have to emit a move for the + * other. Note, if both arguments are constants, then + * necessarily neither argument depends on the value of + * any particular register. As the destination register + * will be wiped, that means we can spill the constant + * to the destination register. + */ + + void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src1 + 1); + unsigned scratch = alu->ssa_args.dest; + + if (entry) { + midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, scratch); + attach_constants(ctx, &ins, entry, alu->ssa_args.src1 + 1); + + /* Force a break XXX Defer r31 writes */ + ins.unit = UNIT_VLUT; + + /* Set the source */ + alu->ssa_args.src1 = scratch; + + /* Inject us -before- the last instruction which set r31 */ + mir_insert_instruction_before(mir_prev_op(alu), ins); + } + } + } +} + +/* Midgard supports two types of constants, embedded constants (128-bit) and + * inline constants (16-bit). Sometimes, especially with scalar ops, embedded + * constants can be demoted to inline constants, for space savings and + * sometimes a performance boost */ + +static void +embedded_to_inline_constant(compiler_context *ctx) +{ + mir_foreach_instr(ctx, ins) { + if (!ins->has_constants) continue; + + if (ins->ssa_args.inline_constant) continue; + + /* Blend constants must not be inlined by definition */ + if (ins->has_blend_constant) continue; + + /* We can inline 32-bit (sometimes) or 16-bit (usually) */ + bool is_16 = ins->alu.reg_mode == midgard_reg_mode_16; + bool is_32 = ins->alu.reg_mode == midgard_reg_mode_32; + + if (!(is_16 || is_32)) + continue; + + /* src1 cannot be an inline constant due to encoding + * restrictions. 
So, if possible we try to flip the arguments + * in that case */ + + int op = ins->alu.op; + + if (ins->ssa_args.src0 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { + switch (op) { + /* These ops require an operational change to flip + * their arguments TODO */ + case midgard_alu_op_flt: + case midgard_alu_op_fle: + case midgard_alu_op_ilt: + case midgard_alu_op_ile: + case midgard_alu_op_fcsel: + case midgard_alu_op_icsel: + DBG("Missed non-commutative flip (%s)\n", alu_opcode_props[op].name); + default: + break; + } + + if (alu_opcode_props[op].props & OP_COMMUTES) { + /* Flip the SSA numbers */ + ins->ssa_args.src0 = ins->ssa_args.src1; + ins->ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + + /* And flip the modifiers */ + + unsigned src_temp; + + src_temp = ins->alu.src2; + ins->alu.src2 = ins->alu.src1; + ins->alu.src1 = src_temp; + } + } + + if (ins->ssa_args.src1 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { + /* Extract the source information */ + + midgard_vector_alu_src *src; + int q = ins->alu.src2; + midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; + src = m; + + /* Component is from the swizzle, e.g. r26.w -> w component. TODO: What if x is masked out? */ + int component = src->swizzle & 3; + + /* Scale constant appropriately, if we can legally */ + uint16_t scaled_constant = 0; + + if (midgard_is_integer_op(op) || is_16) { + unsigned int *iconstants = (unsigned int *) ins->constants; + scaled_constant = (uint16_t) iconstants[component]; + + /* Constant overflow after resize */ + if (scaled_constant != iconstants[component]) + continue; + } else { + float original = (float) ins->constants[component]; + scaled_constant = _mesa_float_to_half(original); + + /* Check for loss of precision. If this is + * mediump, we don't care, but for a highp + * shader, we need to pay attention. NIR + * doesn't yet tell us which mode we're in! + * Practically this prevents most constants + * from being inlined, sadly. */ + + float fp32 = _mesa_half_to_float(scaled_constant); + + if (fp32 != original) + continue; + } + + /* We don't know how to handle these with a constant */ + + if (src->mod || src->half || src->rep_low || src->rep_high) { + DBG("Bailing inline constant...\n"); + continue; + } + + /* Make sure that the constant is not itself a + * vector by checking if all accessed values + * (by the swizzle) are the same. 
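 *
 * Worked example of that check: with constants { 2.0, 3.0, 4.0, 5.0 }, a
 * swizzle of .xxxx and a writemask of xy only ever reads lane x, so the
 * value is effectively the scalar 2.0 and may be inlined; with the
 * identity swizzle the same mask reads 2.0 and 3.0, which differ, so we
 * keep the embedded form.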
*/ + + uint32_t *cons = (uint32_t *) ins->constants; + uint32_t value = cons[component]; + + bool is_vector = false; + unsigned mask = effective_writemask(&ins->alu, ins->mask); + + for (int c = 1; c < 4; ++c) { + /* We only care if this component is actually used */ + if (!(mask & (1 << c))) + continue; + + uint32_t test = cons[(src->swizzle >> (2 * c)) & 3]; + + if (test != value) { + is_vector = true; + break; + } + } + + if (is_vector) + continue; + + /* Get rid of the embedded constant */ + ins->has_constants = false; + ins->ssa_args.src1 = SSA_UNUSED_0; + ins->ssa_args.inline_constant = true; + ins->inline_constant = scaled_constant; + } + } +} + +/* Map normal SSA sources to other SSA sources / fixed registers (like + * uniforms) */ + +static void +map_ssa_to_alias(compiler_context *ctx, int *ref) +{ + /* Sign is used quite deliberately for unused */ + if (*ref < 0) + return; + + unsigned int alias = (uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_to_alias, *ref + 1); + + if (alias) { + /* Remove entry in leftovers to avoid a redunant fmov */ + + struct set_entry *leftover = _mesa_set_search(ctx->leftover_ssa_to_alias, ((void *) (uintptr_t) (*ref + 1))); + + if (leftover) + _mesa_set_remove(ctx->leftover_ssa_to_alias, leftover); + + /* Assign the alias map */ + *ref = alias - 1; + return; + } +} + +/* Basic dead code elimination on the MIR itself, which cleans up e.g. the + * texture pipeline */ + +static bool +midgard_opt_dead_code_eliminate(compiler_context *ctx, midgard_block *block) +{ + bool progress = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (ins->compact_branch) continue; + + if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue; + if (mir_is_live_after(ctx, block, ins, ins->ssa_args.dest)) continue; + + mir_remove_instruction(ins); + progress = true; + } + + return progress; +} + +/* Dead code elimination for branches at the end of a block - only one branch + * per block is legal semantically */ + +static void +midgard_opt_cull_dead_branch(compiler_context *ctx, midgard_block *block) +{ + bool branched = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (!midgard_is_branch_unit(ins->unit)) continue; + + /* We ignore prepacked branches since the fragment epilogue is + * just generally special */ + if (ins->prepacked_branch) continue; + + /* Discards are similarly special and may not correspond to the + * end of a block */ + + if (ins->branch.target_type == TARGET_DISCARD) continue; + + if (branched) { + /* We already branched, so this is dead */ + mir_remove_instruction(ins); + } + + branched = true; + } +} + +static bool +mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask) +{ + /* abs or neg */ + if (!is_int && src.mod) return true; + + /* Other int mods don't matter in isolation */ + if (is_int && src.mod == midgard_int_shift) return true; + + /* size-conversion */ + if (src.half) return true; + + /* swizzle */ + for (unsigned c = 0; c < 4; ++c) { + if (!(mask & (1 << c))) continue; + if (((src.swizzle >> (2*c)) & 3) != c) return true; + } + + return false; +} + +static bool +mir_nontrivial_source2_mod(midgard_instruction *ins) +{ + bool is_int = midgard_is_integer_op(ins->alu.op); + + midgard_vector_alu_src src2 = + vector_alu_from_unsigned(ins->alu.src2); + + return mir_nontrivial_mod(src2, is_int, ins->mask); +} + +static bool +mir_nontrivial_outmod(midgard_instruction *ins) +{ + bool is_int = midgard_is_integer_op(ins->alu.op); + unsigned mod = ins->alu.outmod; + + /* Type 
conversion is a sort of outmod */ + if (ins->alu.dest_override != midgard_dest_override_none) + return true; + + if (is_int) + return mod != midgard_outmod_int_wrap; + else + return mod != midgard_outmod_none; +} + +static bool +midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block) +{ + bool progress = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (!OP_IS_MOVE(ins->alu.op)) continue; + + unsigned from = ins->ssa_args.src1; + unsigned to = ins->ssa_args.dest; + + /* We only work on pure SSA */ + + if (to >= SSA_FIXED_MINIMUM) continue; + if (from >= SSA_FIXED_MINIMUM) continue; + if (to >= ctx->func->impl->ssa_alloc) continue; + if (from >= ctx->func->impl->ssa_alloc) continue; + + /* Constant propagation is not handled here, either */ + if (ins->ssa_args.inline_constant) continue; + if (ins->has_constants) continue; + + if (mir_nontrivial_source2_mod(ins)) continue; + if (mir_nontrivial_outmod(ins)) continue; + + /* We're clear -- rewrite */ + mir_rewrite_index_src(ctx, to, from); + mir_remove_instruction(ins); + progress |= true; + } + + return progress; +} + +/* fmov.pos is an idiom for fpos. Propoagate the .pos up to the source, so then + * the move can be propagated away entirely */ + +static bool +mir_compose_float_outmod(midgard_outmod_float *outmod, midgard_outmod_float comp) +{ + /* Nothing to do */ + if (comp == midgard_outmod_none) + return true; + + if (*outmod == midgard_outmod_none) { + *outmod = comp; + return true; + } + + /* TODO: Compose rules */ + return false; +} + +static bool +midgard_opt_pos_propagate(compiler_context *ctx, midgard_block *block) +{ + bool progress = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (ins->alu.op != midgard_alu_op_fmov) continue; + if (ins->alu.outmod != midgard_outmod_pos) continue; + + /* TODO: Registers? */ + unsigned src = ins->ssa_args.src1; + if (src >= ctx->func->impl->ssa_alloc) continue; + assert(!mir_has_multiple_writes(ctx, src)); + + /* There might be a source modifier, too */ + if (mir_nontrivial_source2_mod(ins)) continue; + + /* Backpropagate the modifier */ + mir_foreach_instr_in_block_from_rev(block, v, mir_prev_op(ins)) { + if (v->type != TAG_ALU_4) continue; + if (v->ssa_args.dest != src) continue; + + /* Can we even take a float outmod? */ + if (midgard_is_integer_out_op(v->alu.op)) continue; + + midgard_outmod_float temp = v->alu.outmod; + progress |= mir_compose_float_outmod(&temp, ins->alu.outmod); + + /* Throw in the towel.. */ + if (!progress) break; + + /* Otherwise, transfer the modifier */ + v->alu.outmod = temp; + ins->alu.outmod = midgard_outmod_none; + + break; + } + } + + return progress; +} + +/* The following passes reorder MIR instructions to enable better scheduling */ + +static void +midgard_pair_load_store(compiler_context *ctx, midgard_block *block) +{ + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_LOAD_STORE_4) continue; + + /* We've found a load/store op. Check if next is also load/store. 
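 * Pairing matters because a load/store bundle carries up to two words (see
 * emit_binary_bundle), so an orphaned load wastes the second slot; the
 * search below tries to hoist a later independent, direct load up next to
 * this one.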
*/ + midgard_instruction *next_op = mir_next_op(ins); + if (&next_op->link != &block->instructions) { + if (next_op->type == TAG_LOAD_STORE_4) { + /* If so, we're done since we're a pair */ + ins = mir_next_op(ins); + continue; + } + + /* Maximum search distance to pair, to avoid register pressure disasters */ + int search_distance = 8; + + /* Otherwise, we have an orphaned load/store -- search for another load */ + mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) { + /* Terminate search if necessary */ + if (!(search_distance--)) break; + + if (c->type != TAG_LOAD_STORE_4) continue; + + /* Stores cannot be reordered, since they have + * dependencies. For the same reason, indirect + * loads cannot be reordered as their index is + * loaded in r27.w */ + + if (OP_IS_STORE(c->load_store.op)) continue; + + /* It appears the 0x800 bit is set whenever a + * load is direct, unset when it is indirect. + * Skip indirect loads. */ + + if (!(c->load_store.unknown & 0x800)) continue; + + /* We found one! Move it up to pair and remove it from the old location */ + + mir_insert_instruction_before(ins, *c); + mir_remove_instruction(c); + + break; + } + } + } +} + +/* If there are leftovers after the below pass, emit actual fmov + * instructions for the slow-but-correct path */ + +static void +emit_leftover_move(compiler_context *ctx) +{ + set_foreach(ctx->leftover_ssa_to_alias, leftover) { + int base = ((uintptr_t) leftover->key) - 1; + int mapped = base; + + map_ssa_to_alias(ctx, &mapped); + EMIT(mov, mapped, blank_alu_src, base); + } +} + +static void +actualise_ssa_to_alias(compiler_context *ctx) +{ + mir_foreach_instr(ctx, ins) { + map_ssa_to_alias(ctx, &ins->ssa_args.src0); + map_ssa_to_alias(ctx, &ins->ssa_args.src1); + } + + emit_leftover_move(ctx); +} + +static void +emit_fragment_epilogue(compiler_context *ctx) +{ + /* Special case: writing out constants requires us to include the move + * explicitly now, so shove it into r0 */ + + void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, ctx->fragment_output + 1); + + if (constant_value) { + midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, SSA_FIXED_REGISTER(0)); + attach_constants(ctx, &ins, constant_value, ctx->fragment_output + 1); + emit_mir_instruction(ctx, ins); + } + + /* Perform the actual fragment writeout. We have two writeout/branch + * instructions, forming a loop until writeout is successful as per the + * docs. 
TODO: gl_FragDepth */ + + EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, 0, midgard_condition_always); + EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, -1, midgard_condition_always); +} + +static midgard_block * +emit_block(compiler_context *ctx, nir_block *block) +{ + midgard_block *this_block = calloc(sizeof(midgard_block), 1); + list_addtail(&this_block->link, &ctx->blocks); + + this_block->is_scheduled = false; + ++ctx->block_count; + + ctx->texture_index[0] = -1; + ctx->texture_index[1] = -1; + + /* Add us as a successor to the block we are following */ + if (ctx->current_block) + midgard_block_add_successor(ctx->current_block, this_block); + + /* Set up current block */ + list_inithead(&this_block->instructions); + ctx->current_block = this_block; + + nir_foreach_instr(instr, block) { + emit_instr(ctx, instr); + ++ctx->instruction_count; + } + + inline_alu_constants(ctx); + embedded_to_inline_constant(ctx); + + /* Perform heavylifting for aliasing */ + actualise_ssa_to_alias(ctx); + + midgard_pair_load_store(ctx, this_block); + + /* Append fragment shader epilogue (value writeout) */ + if (ctx->stage == MESA_SHADER_FRAGMENT) { + if (block == nir_impl_last_block(ctx->func->impl)) { + emit_fragment_epilogue(ctx); + } + } + + if (block == nir_start_block(ctx->func->impl)) + ctx->initial_block = this_block; + + if (block == nir_impl_last_block(ctx->func->impl)) + ctx->final_block = this_block; + + /* Allow the next control flow to access us retroactively, for + * branching etc */ + ctx->current_block = this_block; + + /* Document the fallthrough chain */ + ctx->previous_source_block = this_block; + + return this_block; +} + +static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list); + +static void +emit_if(struct compiler_context *ctx, nir_if *nif) +{ + /* Conditional branches expect the condition in r31.w; emit a move for + * that in the _previous_ block (which is the current block). 
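 * The layout this builds, by block index: the current block ends with an
 * inverted conditional branch (taken when the condition is false) over to
 * the else side; the then blocks follow, ending in then_exit; the else
 * blocks start at else_idx, and the code after the if starts at
 * after_else_idx. Both targets are patched below once those indices are
 * known, and an empty else collapses to a single branch to after_else_idx.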
*/ + emit_condition(ctx, &nif->condition, true, COMPONENT_X); + + /* Speculatively emit the branch, but we can't fill it in until later */ + EMIT(branch, true, true); + midgard_instruction *then_branch = mir_last_in_block(ctx->current_block); + + /* Emit the two subblocks */ + midgard_block *then_block = emit_cf_list(ctx, &nif->then_list); + + /* Emit a jump from the end of the then block to the end of the else */ + EMIT(branch, false, false); + midgard_instruction *then_exit = mir_last_in_block(ctx->current_block); + + /* Emit second block, and check if it's empty */ + + int else_idx = ctx->block_count; + int count_in = ctx->instruction_count; + midgard_block *else_block = emit_cf_list(ctx, &nif->else_list); + int after_else_idx = ctx->block_count; + + /* Now that we have the subblocks emitted, fix up the branches */ + + assert(then_block); + assert(else_block); + + if (ctx->instruction_count == count_in) { + /* The else block is empty, so don't emit an exit jump */ + mir_remove_instruction(then_exit); + then_branch->branch.target_block = after_else_idx; + } else { + then_branch->branch.target_block = else_idx; + then_exit->branch.target_block = after_else_idx; + } +} + +static void +emit_loop(struct compiler_context *ctx, nir_loop *nloop) +{ + /* Remember where we are */ + midgard_block *start_block = ctx->current_block; + + /* Allocate a loop number, growing the current inner loop depth */ + int loop_idx = ++ctx->current_loop_depth; + + /* Get index from before the body so we can loop back later */ + int start_idx = ctx->block_count; + + /* Emit the body itself */ + emit_cf_list(ctx, &nloop->body); + + /* Branch back to loop back */ + struct midgard_instruction br_back = v_branch(false, false); + br_back.branch.target_block = start_idx; + emit_mir_instruction(ctx, br_back); + + /* Mark down that branch in the graph. Note that we're really branching + * to the block *after* we started in. TODO: Why doesn't the branch + * itself have an off-by-one then...? */ + midgard_block_add_successor(ctx->current_block, start_block->successors[0]); + + /* Find the index of the block about to follow us (note: we don't add + * one; blocks are 0-indexed so we get a fencepost problem) */ + int break_block_idx = ctx->block_count; + + /* Fix up the break statements we emitted to point to the right place, + * now that we can allocate a block number for them */ + + list_for_each_entry_from(struct midgard_block, block, start_block, &ctx->blocks, link) { + mir_foreach_instr_in_block(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (!ins->compact_branch) continue; + if (ins->prepacked_branch) continue; + + /* We found a branch -- check the type to see if we need to do anything */ + if (ins->branch.target_type != TARGET_BREAK) continue; + + /* It's a break! Check if it's our break */ + if (ins->branch.target_break != loop_idx) continue; + + /* Okay, cool, we're breaking out of this loop. 
+ * Rewrite from a break to a goto */ + + ins->branch.target_type = TARGET_GOTO; + ins->branch.target_block = break_block_idx; + } + } + + /* Now that we've finished emitting the loop, free up the depth again + * so we play nice with recursion amid nested loops */ + --ctx->current_loop_depth; + + /* Dump loop stats */ + ++ctx->loop_count; +} + +static midgard_block * +emit_cf_list(struct compiler_context *ctx, struct exec_list *list) +{ + midgard_block *start_block = NULL; + + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: { + midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node)); + + if (!start_block) + start_block = block; + + break; + } + + case nir_cf_node_if: + emit_if(ctx, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + emit_loop(ctx, nir_cf_node_as_loop(node)); + break; + + case nir_cf_node_function: + assert(0); + break; + } + } + + return start_block; +} + +/* Due to lookahead, we need to report the first tag executed in the command + * stream and in branch targets. An initial block might be empty, so iterate + * until we find one that 'works' */ + +static unsigned +midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx) +{ + midgard_block *initial_block = mir_get_block(ctx, block_idx); + + unsigned first_tag = 0; + + do { + midgard_bundle *initial_bundle = util_dynarray_element(&initial_block->bundles, midgard_bundle, 0); + + if (initial_bundle) { + first_tag = initial_bundle->tag; + break; + } + + /* Initial block is empty, try the next block */ + initial_block = list_first_entry(&(initial_block->link), midgard_block, link); + } while(initial_block != NULL); + + assert(first_tag); + return first_tag; +} + +int +midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend) +{ + struct util_dynarray *compiled = &program->compiled; + + midgard_debug = debug_get_option_midgard_debug(); + + compiler_context ictx = { + .nir = nir, + .stage = nir->info.stage, + + .is_blend = is_blend, + .blend_constant_offset = 0, + + .alpha_ref = program->alpha_ref + }; + + compiler_context *ctx = &ictx; + + /* TODO: Decide this at runtime */ + ctx->uniform_cutoff = 8; + + /* Initialize at a global (not block) level hash tables */ + + ctx->ssa_constants = _mesa_hash_table_u64_create(NULL); + ctx->ssa_to_alias = _mesa_hash_table_u64_create(NULL); + ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL); + ctx->sysval_to_id = _mesa_hash_table_u64_create(NULL); + ctx->leftover_ssa_to_alias = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); + + /* Record the varying mapping for the command stream's bookkeeping */ + + struct exec_list *varyings = + ctx->stage == MESA_SHADER_VERTEX ? 
&nir->outputs : &nir->inputs; + + unsigned max_varying = 0; + nir_foreach_variable(var, varyings) { + unsigned loc = var->data.driver_location; + unsigned sz = glsl_type_size(var->type, FALSE); + + for (int c = 0; c < sz; ++c) { + program->varyings[loc + c] = var->data.location + c; + max_varying = MAX2(max_varying, loc + c); + } + } + + /* Lower gl_Position pre-optimisation, but after lowering vars to ssa + * (so we don't accidentally duplicate the epilogue since mesa/st has + * messed with our I/O quite a bit already) */ + + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + + if (ctx->stage == MESA_SHADER_VERTEX) + NIR_PASS_V(nir, nir_lower_viewport_transform); + + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_global_vars_to_local); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + + NIR_PASS_V(nir, nir_lower_io, nir_var_all, glsl_type_size, 0); + + /* Optimisation passes */ + + optimise_nir(nir); + + if (midgard_debug & MIDGARD_DBG_SHADERS) { + nir_print_shader(nir, stdout); + } + + /* Assign sysvals and counts, now that we're sure + * (post-optimisation) */ + + midgard_nir_assign_sysvals(ctx, nir); + + program->uniform_count = nir->num_uniforms; + program->sysval_count = ctx->sysval_count; + memcpy(program->sysvals, ctx->sysvals, sizeof(ctx->sysvals[0]) * ctx->sysval_count); + + program->attribute_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_inputs : 0; + program->varying_count = max_varying + 1; /* Fencepost off-by-one */ + + nir_foreach_function(func, nir) { + if (!func->impl) + continue; + + list_inithead(&ctx->blocks); + ctx->block_count = 0; + ctx->func = func; + + emit_cf_list(ctx, &func->impl->body); + emit_block(ctx, func->impl->end_block); + + break; /* TODO: Multi-function shaders */ + } + + util_dynarray_init(compiled, NULL); + + /* MIR-level optimizations */ + + bool progress = false; + + do { + progress = false; + + mir_foreach_block(ctx, block) { + progress |= midgard_opt_pos_propagate(ctx, block); + progress |= midgard_opt_copy_prop(ctx, block); + progress |= midgard_opt_dead_code_eliminate(ctx, block); + } + } while (progress); + + /* Nested control-flow can result in dead branches at the end of the + * block. This messes with our analysis and is just dead code, so cull + * them */ + mir_foreach_block(ctx, block) { + midgard_opt_cull_dead_branch(ctx, block); + } + + /* Schedule! */ + schedule_program(ctx); + + /* Now that all the bundles are scheduled and we can calculate block + * sizes, emit actual branch instructions rather than placeholders */ + + int br_block_idx = 0; + + mir_foreach_block(ctx, block) { + util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) { + for (int c = 0; c < bundle->instruction_count; ++c) { + midgard_instruction *ins = bundle->instructions[c]; + + if (!midgard_is_branch_unit(ins->unit)) continue; + + if (ins->prepacked_branch) continue; + + /* Parse some basic branch info */ + bool is_compact = ins->unit == ALU_ENAB_BR_COMPACT; + bool is_conditional = ins->branch.conditional; + bool is_inverted = ins->branch.invert_conditional; + bool is_discard = ins->branch.target_type == TARGET_DISCARD; + + /* Determine the block we're jumping to */ + int target_number = ins->branch.target_block; + + /* Report the destination tag */ + int dest_tag = is_discard ? 
0 : midgard_get_first_tag_from_block(ctx, target_number); + + /* Count up the number of quadwords we're + * jumping over = number of quadwords until + * (br_block_idx, target_number) */ + + int quadword_offset = 0; + + if (is_discard) { + /* Jump to the end of the shader. We + * need to include not only the + * following blocks, but also the + * contents of our current block (since + * discard can come in the middle of + * the block) */ + + midgard_block *blk = mir_get_block(ctx, br_block_idx + 1); + + for (midgard_bundle *bun = bundle + 1; bun < (midgard_bundle *)((char*) block->bundles.data + block->bundles.size); ++bun) { + quadword_offset += quadword_size(bun->tag); + } + + mir_foreach_block_from(ctx, blk, b) { + quadword_offset += b->quadword_count; + } + + } else if (target_number > br_block_idx) { + /* Jump forward */ + + for (int idx = br_block_idx + 1; idx < target_number; ++idx) { + midgard_block *blk = mir_get_block(ctx, idx); + assert(blk); + + quadword_offset += blk->quadword_count; + } + } else { + /* Jump backwards */ + + for (int idx = br_block_idx; idx >= target_number; --idx) { + midgard_block *blk = mir_get_block(ctx, idx); + assert(blk); + + quadword_offset -= blk->quadword_count; + } + } + + /* Unconditional extended branches (far jumps) + * have issues, so we always use a conditional + * branch, setting the condition to always for + * unconditional. For compact unconditional + * branches, cond isn't used so it doesn't + * matter what we pick. */ + + midgard_condition cond = + !is_conditional ? midgard_condition_always : + is_inverted ? midgard_condition_false : + midgard_condition_true; + + midgard_jmp_writeout_op op = + is_discard ? midgard_jmp_writeout_op_discard : + (is_compact && !is_conditional) ? midgard_jmp_writeout_op_branch_uncond : + midgard_jmp_writeout_op_branch_cond; + + if (!is_compact) { + midgard_branch_extended branch = + midgard_create_branch_extended( + cond, op, + dest_tag, + quadword_offset); + + memcpy(&ins->branch_extended, &branch, sizeof(branch)); + } else if (is_conditional || is_discard) { + midgard_branch_cond branch = { + .op = op, + .dest_tag = dest_tag, + .offset = quadword_offset, + .cond = cond + }; + + assert(branch.offset == quadword_offset); + + memcpy(&ins->br_compact, &branch, sizeof(branch)); + } else { + assert(op == midgard_jmp_writeout_op_branch_uncond); + + midgard_branch_uncond branch = { + .op = op, + .dest_tag = dest_tag, + .offset = quadword_offset, + .unknown = 1 + }; + + assert(branch.offset == quadword_offset); + + memcpy(&ins->br_compact, &branch, sizeof(branch)); + } + } + } + + ++br_block_idx; + } + + /* Emit flat binary from the instruction arrays. Iterate each block in + * sequence. Save instruction boundaries such that lookahead tags can + * be assigned easily */ + + /* Cache _all_ bundles in source order for lookahead across failed branches */ + + int bundle_count = 0; + mir_foreach_block(ctx, block) { + bundle_count += block->bundles.size / sizeof(midgard_bundle); + } + midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count); + int bundle_idx = 0; + mir_foreach_block(ctx, block) { + util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) { + source_order_bundles[bundle_idx++] = bundle; + } + } + + int current_bundle = 0; + + /* Midgard prefetches instruction types, so during emission we + * need to lookahead. Unless this is the last instruction, in + * which we return 1. Or if this is the second to last and the + * last is an ALU, then it's also 1... 
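 *
 * Restated as a sketch (the helper is illustrative, not part of the
 * driver), the rule below amounts to:
 *
 *    static unsigned lookahead_tag(midgard_bundle **b, int count, int i)
 *    {
 *            if (i + 1 >= count)
 *                    return 1;              // last bundle
 *
 *            uint8_t next = b[i + 1]->tag;
 *
 *            if (i + 2 >= count && IS_ALU(next))
 *                    return 1;              // the final bundle is an ALU
 *
 *            return next;
 *    }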
*/ + + mir_foreach_block(ctx, block) { + mir_foreach_bundle_in_block(block, bundle) { + int lookahead = 1; + + if (current_bundle + 1 < bundle_count) { + uint8_t next = source_order_bundles[current_bundle + 1]->tag; + + if (!(current_bundle + 2 < bundle_count) && IS_ALU(next)) { + lookahead = 1; + } else { + lookahead = next; + } + } + + emit_binary_bundle(ctx, bundle, compiled, lookahead); + ++current_bundle; + } + + /* TODO: Free deeper */ + //util_dynarray_fini(&block->instructions); + } + + free(source_order_bundles); + + /* Report the very first tag executed */ + program->first_tag = midgard_get_first_tag_from_block(ctx, 0); + + /* Deal with off-by-one related to the fencepost problem */ + program->work_register_count = ctx->work_registers + 1; + + program->can_discard = ctx->can_discard; + program->uniform_cutoff = ctx->uniform_cutoff; + + program->blend_patch_offset = ctx->blend_constant_offset; + + if (midgard_debug & MIDGARD_DBG_SHADERS) + disassemble_midgard(program->compiled.data, program->compiled.size); + + if (midgard_debug & MIDGARD_DBG_SHADERDB) { + unsigned nr_bundles = 0, nr_ins = 0, nr_quadwords = 0; + + /* Count instructions and bundles */ + + mir_foreach_instr_global(ctx, ins) { + nr_ins++; + } + + mir_foreach_block(ctx, block) { + nr_bundles += util_dynarray_num_elements( + &block->bundles, midgard_bundle); + + nr_quadwords += block->quadword_count; + } + + /* Calculate thread count. There are certain cutoffs by + * register count for thread count */ + + unsigned nr_registers = program->work_register_count; + + unsigned nr_threads = + (nr_registers <= 4) ? 4 : + (nr_registers <= 8) ? 2 : + 1; + + /* Dump stats */ + + fprintf(stderr, "shader%d - %s shader: " + "%u inst, %u bundles, %u quadwords, " + "%u registers, %u threads, %u loops\n", + SHADER_DB_COUNT++, + gl_shader_stage_name(ctx->stage), + nr_ins, nr_bundles, nr_quadwords, + nr_registers, nr_threads, + ctx->loop_count); + } + + + return 0; +} diff --git a/src/panfrost/midgard/midgard_compile.h b/src/panfrost/midgard/midgard_compile.h new file mode 100644 index 00000000000..147494b8e8a --- /dev/null +++ b/src/panfrost/midgard/midgard_compile.h @@ -0,0 +1,127 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __MIDGARD_H_ +#define __MIDGARD_H_ + +#include "compiler/nir/nir.h" +#include "util/u_dynarray.h" + +/* Define the general compiler entry point */ + +#define MAX_SYSVAL_COUNT 32 + +/* Allow 2D of sysval IDs, while allowing nonparametric sysvals to equal + * their class for equal comparison */ + +#define PAN_SYSVAL(type, no) (((no) << 16) | PAN_SYSVAL_##type) +#define PAN_SYSVAL_TYPE(sysval) ((sysval) & 0xffff) +#define PAN_SYSVAL_ID(sysval) ((sysval) >> 16) + +/* Define some common types. We start at one for easy indexing of hash + * tables internal to the compiler */ + +enum { + PAN_SYSVAL_VIEWPORT_SCALE = 1, + PAN_SYSVAL_VIEWPORT_OFFSET = 2, + PAN_SYSVAL_TEXTURE_SIZE = 3, +} pan_sysval; + +#define PAN_TXS_SYSVAL_ID(texidx, dim, is_array) \ + ((texidx) | ((dim) << 7) | ((is_array) ? (1 << 9) : 0)) + +#define PAN_SYSVAL_ID_TO_TXS_TEX_IDX(id) ((id) & 0x7f) +#define PAN_SYSVAL_ID_TO_TXS_DIM(id) (((id) >> 7) & 0x3) +#define PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id) !!((id) & (1 << 9)) + +typedef struct { + int work_register_count; + int uniform_count; + int uniform_cutoff; + + int attribute_count; + int varying_count; + + /* Prepended before uniforms, mapping to SYSVAL_ names for the + * sysval */ + + unsigned sysval_count; + unsigned sysvals[MAX_SYSVAL_COUNT]; + + unsigned varyings[32]; + + /* Boolean properties of the program */ + bool can_discard; + bool writes_point_size; + + int first_tag; + + struct util_dynarray compiled; + + /* For a blend shader using a constant color -- patch point. If + * negative, there's no constant. */ + + int blend_patch_offset; + + /* IN: For a fragment shader with a lowered alpha test, the ref value */ + float alpha_ref; +} midgard_program; + +int +midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend); + +/* NIR options are shared between the standalone compiler and the online + * compiler. Defining it here is the simplest, though maybe not the Right + * solution. 
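 *
 * As a usage sketch (the upload step is a stand-in, not real driver code),
 * a caller builds its NIR against these options and then runs it through
 * the entry point declared above, false here meaning a regular rather than
 * a blend shader:
 *
 *    midgard_program program = { .alpha_ref = alpha_ref };
 *
 *    if (midgard_compile_shader_nir(nir, &program, false) == 0) {
 *            upload(program.compiled.data, program.compiled.size);
 *            util_dynarray_fini(&program.compiled);
 *    }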
*/ + +static const nir_shader_compiler_options midgard_nir_options = { + .lower_ffma = true, + .lower_sub = true, + .lower_scmp = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_ffract = true, + .lower_fmod = true, + .lower_fdiv = true, + .lower_idiv = true, + .lower_isign = true, + .lower_fpow = true, + .lower_find_lsb = true, + + .lower_wpos_pntc = true, + + /* TODO: We have native ops to help here, which we'll want to look into + * eventually */ + .lower_fsign = true, + + .vertex_id_zero_based = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_rotate = true, + + .lower_doubles_options = nir_lower_dmod, + + .vectorize_io = true, +}; + +#endif diff --git a/src/panfrost/midgard/midgard_emit.c b/src/panfrost/midgard/midgard_emit.c new file mode 100644 index 00000000000..3522e77d5b1 --- /dev/null +++ b/src/panfrost/midgard/midgard_emit.c @@ -0,0 +1,273 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler.h" +#include "midgard_ops.h" + +/* Midgard IR only knows vector ALU types, but we sometimes need to actually + * use scalar ALU instructions, for functional or performance reasons. To do + * this, we just demote vector ALU payloads to scalar. */ + +static int +component_from_mask(unsigned mask) +{ + for (int c = 0; c < 8; ++c) { + if (mask & (1 << c)) + return c; + } + + assert(0); + return 0; +} + +static unsigned +vector_to_scalar_source(unsigned u, bool is_int, bool is_full) +{ + midgard_vector_alu_src v; + memcpy(&v, &u, sizeof(v)); + + /* TODO: Integers */ + + unsigned component = v.swizzle & 3; + bool upper = false; /* TODO */ + + midgard_scalar_alu_src s = { 0 }; + + if (is_full) { + /* For a 32-bit op, just check the source half flag */ + s.full = !v.half; + } else if (!v.half) { + /* For a 16-bit op that's not subdivided, never full */ + s.full = false; + } else { + /* We can't do 8-bit scalar, abort! 
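 * (For the widths that are handled, the component encoding just below
 * treats the 128-bit register as eight 16-bit slots: component n of a full
 * 32-bit source sits at slot 2 * n, for instance .z of a 32-bit vec4 is
 * slot 4, while a 16-bit source uses its slot index directly and `upper`
 * selects the high four slots.)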
*/ + assert(0); + } + + /* Component indexing takes size into account */ + + if (s.full) + s.component = component << 1; + else + s.component = component + (upper << 2); + + if (is_int) { + /* TODO */ + } else { + s.abs = v.mod & MIDGARD_FLOAT_MOD_ABS; + s.negate = v.mod & MIDGARD_FLOAT_MOD_NEG; + } + + unsigned o; + memcpy(&o, &s, sizeof(s)); + + return o & ((1 << 6) - 1); +} + +static midgard_scalar_alu +vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins) +{ + bool is_int = midgard_is_integer_op(v.op); + bool is_full = v.reg_mode == midgard_reg_mode_32; + bool is_inline_constant = ins->ssa_args.inline_constant; + + /* The output component is from the mask */ + midgard_scalar_alu s = { + .op = v.op, + .src1 = vector_to_scalar_source(v.src1, is_int, is_full), + .src2 = !is_inline_constant ? vector_to_scalar_source(v.src2, is_int, is_full) : 0, + .unknown = 0, + .outmod = v.outmod, + .output_full = is_full, + .output_component = component_from_mask(ins->mask), + }; + + /* Full components are physically spaced out */ + if (is_full) { + assert(s.output_component < 4); + s.output_component <<= 1; + } + + /* Inline constant is passed along rather than trying to extract it + * from v */ + + if (ins->ssa_args.inline_constant) { + uint16_t imm = 0; + int lower_11 = ins->inline_constant & ((1 << 12) - 1); + imm |= (lower_11 >> 9) & 3; + imm |= (lower_11 >> 6) & 4; + imm |= (lower_11 >> 2) & 0x38; + imm |= (lower_11 & 63) << 6; + + s.src2 = imm; + } + + return s; +} + +static void +emit_alu_bundle(compiler_context *ctx, + midgard_bundle *bundle, + struct util_dynarray *emission, + unsigned lookahead) +{ + /* Emit the control word */ + util_dynarray_append(emission, uint32_t, bundle->control | lookahead); + + /* Next up, emit register words */ + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + midgard_instruction *ins = bundle->instructions[i]; + + /* Check if this instruction has registers */ + if (ins->compact_branch || ins->prepacked_branch) continue; + + /* Otherwise, just emit the registers */ + uint16_t reg_word = 0; + memcpy(®_word, &ins->registers, sizeof(uint16_t)); + util_dynarray_append(emission, uint16_t, reg_word); + } + + /* Now, we emit the body itself */ + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + midgard_instruction *ins = bundle->instructions[i]; + + /* Where is this body */ + unsigned size = 0; + void *source = NULL; + + /* In case we demote to a scalar */ + midgard_scalar_alu scalarized; + + if (ins->unit & UNITS_ANY_VECTOR) { + if (ins->alu.reg_mode == midgard_reg_mode_32) + ins->alu.mask = expand_writemask_32(ins->mask); + else + ins->alu.mask = ins->mask; + + size = sizeof(midgard_vector_alu); + source = &ins->alu; + } else if (ins->unit == ALU_ENAB_BR_COMPACT) { + size = sizeof(midgard_branch_cond); + source = &ins->br_compact; + } else if (ins->compact_branch) { /* misnomer */ + size = sizeof(midgard_branch_extended); + source = &ins->branch_extended; + } else { + size = sizeof(midgard_scalar_alu); + scalarized = vector_to_scalar_alu(ins->alu, ins); + source = &scalarized; + } + + memcpy(util_dynarray_grow_bytes(emission, 1, size), source, size); + } + + /* Emit padding (all zero) */ + memset(util_dynarray_grow_bytes(emission, 1, bundle->padding), 0, bundle->padding); + + /* Tack on constants */ + + if (bundle->has_embedded_constants) { + util_dynarray_append(emission, float, bundle->constants[0]); + util_dynarray_append(emission, float, bundle->constants[1]); + util_dynarray_append(emission, float, bundle->constants[2]); + 
util_dynarray_append(emission, float, bundle->constants[3]); + } +} + +/* After everything is scheduled, emit whole bundles at a time */ + +void +emit_binary_bundle(compiler_context *ctx, + midgard_bundle *bundle, + struct util_dynarray *emission, + int next_tag) +{ + int lookahead = next_tag << 4; + + switch (bundle->tag) { + case TAG_ALU_4: + case TAG_ALU_8: + case TAG_ALU_12: + case TAG_ALU_16: + emit_alu_bundle(ctx, bundle, emission, lookahead); + break; + + case TAG_LOAD_STORE_4: { + /* One or two composing instructions */ + + uint64_t current64, next64 = LDST_NOP; + + /* Copy masks */ + + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + bundle->instructions[i]->load_store.mask = + bundle->instructions[i]->mask; + } + + memcpy(¤t64, &bundle->instructions[0]->load_store, sizeof(current64)); + + if (bundle->instruction_count == 2) + memcpy(&next64, &bundle->instructions[1]->load_store, sizeof(next64)); + + midgard_load_store instruction = { + .type = bundle->tag, + .next_type = next_tag, + .word1 = current64, + .word2 = next64 + }; + + util_dynarray_append(emission, midgard_load_store, instruction); + + break; + } + + case TAG_TEXTURE_4: + case TAG_TEXTURE_4_VTX: { + /* Texture instructions are easy, since there is no pipelining + * nor VLIW to worry about. We may need to set .cont/.last + * flags. */ + + midgard_instruction *ins = bundle->instructions[0]; + + ins->texture.type = bundle->tag; + ins->texture.next_type = next_tag; + ins->texture.mask = ins->mask; + + ctx->texture_op_count--; + + if (ins->texture.op == TEXTURE_OP_NORMAL) { + bool continues = ctx->texture_op_count > 0; + ins->texture.cont = continues; + ins->texture.last = !continues; + } else { + ins->texture.cont = ins->texture.last = 1; + } + + util_dynarray_append(emission, midgard_texture_word, ins->texture); + break; + } + + default: + unreachable("Unknown midgard instruction type\n"); + } +} diff --git a/src/panfrost/midgard/midgard_liveness.c b/src/panfrost/midgard/midgard_liveness.c new file mode 100644 index 00000000000..a18d8b9f8ad --- /dev/null +++ b/src/panfrost/midgard/midgard_liveness.c @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* mir_is_live_after performs liveness analysis on the MIR, used primarily + * as part of register allocation. 
TODO: Algorithmic improvements for + * compiler performance (this is the worst algorithm possible -- see + * backlog with Connor on IRC) */ + +#include "compiler.h" + +static bool +midgard_is_live_in_instr(midgard_instruction *ins, int src) +{ + if (ins->compact_branch) + return false; + + if (ins->ssa_args.src0 == src) + return true; + + if (!ins->ssa_args.inline_constant && ins->ssa_args.src1 == src) + return true; + + return false; +} + +/* Determine if a variable is live in the successors of a block */ +static bool +is_live_after_successors(compiler_context *ctx, midgard_block *bl, int src) +{ + for (unsigned i = 0; i < bl->nr_successors; ++i) { + midgard_block *succ = bl->successors[i]; + + /* If we already visited, the value we're seeking + * isn't down this path (or we would have short + * circuited */ + + if (succ->visited) continue; + + /* Otherwise (it's visited *now*), check the block */ + + succ->visited = true; + + mir_foreach_instr_in_block(succ, ins) { + if (midgard_is_live_in_instr(ins, src)) + return true; + } + + /* ...and also, check *its* successors */ + if (is_live_after_successors(ctx, succ, src)) + return true; + + } + + /* Welp. We're really not live. */ + + return false; +} + +bool +mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src) +{ + /* Check the rest of the block for liveness */ + + mir_foreach_instr_in_block_from(block, ins, mir_next_op(start)) { + if (midgard_is_live_in_instr(ins, src)) + return true; + } + + /* Check the rest of the blocks for liveness recursively */ + + bool succ = is_live_after_successors(ctx, block, src); + + mir_foreach_block(ctx, block) { + block->visited = false; + } + + return succ; +} + +/* Just a quick check -- is it written more than once? (I.e. are we definitely + * not SSA?) */ + +bool +mir_has_multiple_writes(compiler_context *ctx, int dest) +{ + unsigned write_count = 0; + + mir_foreach_instr_global(ctx, ins) { + if (ins->ssa_args.dest == dest) + write_count++; + } + + return write_count > 1; +} diff --git a/src/panfrost/midgard/midgard_nir.h b/src/panfrost/midgard/midgard_nir.h new file mode 100644 index 00000000000..85eadd34631 --- /dev/null +++ b/src/panfrost/midgard/midgard_nir.h @@ -0,0 +1,5 @@ +#include <stdbool.h> +#include "nir.h" + +bool midgard_nir_lower_algebraic_late(nir_shader *shader); +bool midgard_nir_scale_trig(nir_shader *shader); diff --git a/src/panfrost/midgard/midgard_nir_algebraic.py b/src/panfrost/midgard/midgard_nir_algebraic.py new file mode 100644 index 00000000000..faf83364c3a --- /dev/null +++ b/src/panfrost/midgard/midgard_nir_algebraic.py @@ -0,0 +1,96 @@ +# +# Copyright (C) 2018 Alyssa Rosenzweig +# +# Copyright (C) 2016 Intel Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +import argparse +import sys +import math + +a = 'a' +b = 'b' +c = 'c' + +algebraic_late = [ + # ineg must be lowered late, but only for integers; floats will try to + # have modifiers attached... hence why this has to be here rather than + # a more standard lower_negate approach + + (('ineg', a), ('isub', 0, a)), + + # These two special-cases save space/an op than the actual csel op + + # scheduler flexibility + + (('b32csel', a, 'b@32', 0), ('iand', a, b)), + (('b32csel', a, 0, 'b@32'), ('iand', ('inot', a), b)), +] + + +# Midgard is able to type convert down by only one "step" per instruction; if +# NIR wants more than one step, we need to break up into multiple instructions + +converts = [ + (('i2i8', 'a@32'), ('i2i8', ('i2i16', a))), + (('u2u8', 'a@32'), ('u2u8', ('u2u16', a))), + + (('i2i32', 'a@8'), ('i2i32', ('i2i16', a))), + (('u2u32', 'a@8'), ('u2u32', ('u2u16', a))), + + (('f2i32', 'a@16'), ('f2i32', ('f2f32', a))), + (('f2u32', 'a@16'), ('f2u32', ('f2f32', a))), + + # Totally redundant + (('~f2f16', ('f2f32', 'a@16')), a), + + (('pack_half_2x16_split', 'a@32', 'b@32'), ('ior', ('ishl', ('i2i32', ('f2f16', b)), 16), ('i2i32', ('f2f16', a)))), +] + +# Midgard scales fsin/fcos arguments by pi. +# Pass must be run only once, after the main loop + +scale_trig = [ + (('fsin', a), ('fsin', ('fdiv', a, math.pi))), + (('fcos', a), ('fcos', ('fdiv', a, math.pi))), +] + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--import-path', required=True) + args = parser.parse_args() + sys.path.insert(0, args.import_path) + run() + + +def run(): + import nir_algebraic # pylint: disable=import-error + + print('#include "midgard_nir.h"') + + print(nir_algebraic.AlgebraicPass("midgard_nir_lower_algebraic_late", + algebraic_late + converts).render()) + + print(nir_algebraic.AlgebraicPass("midgard_nir_scale_trig", + scale_trig).render()) + + +if __name__ == '__main__': + main() diff --git a/src/panfrost/midgard/midgard_ops.c b/src/panfrost/midgard/midgard_ops.c new file mode 100644 index 00000000000..ccd750cff83 --- /dev/null +++ b/src/panfrost/midgard/midgard_ops.c @@ -0,0 +1,221 @@ +/* Copyright (c) 2018-2019 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "midgard.h" + +/* Include the definitions of the macros and such */ + +#define MIDGARD_OPS_TABLE +#include "helpers.h" +#undef MIDGARD_OPS_TABLE + +/* Table of mapping opcodes to accompanying properties. This is used for both + * the disassembler and the compiler. It is placed in a .c file like this to + * avoid duplications in the binary */ + +struct mir_op_props alu_opcode_props[256] = { + [midgard_alu_op_fadd] = {"fadd", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_fmul] = {"fmul", UNITS_MUL | UNIT_VLUT | OP_COMMUTES}, + [midgard_alu_op_fmin] = {"fmin", UNITS_MUL | UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_fmax] = {"fmax", UNITS_MUL | UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_imin] = {"imin", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_imax] = {"imax", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_umin] = {"umin", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_umax] = {"umax", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_ihadd] = {"ihadd", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_uhadd] = {"uhadd", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_irhadd] = {"irhadd", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_urhadd] = {"urhadd", UNITS_ADD | OP_COMMUTES}, + + [midgard_alu_op_fmov] = {"fmov", UNITS_ALL | QUIRK_FLIPPED_R24}, + [midgard_alu_op_fmov_rtz] = {"fmov_rtz", UNITS_ALL | QUIRK_FLIPPED_R24}, + [midgard_alu_op_fmov_rtn] = {"fmov_rtn", UNITS_ALL | QUIRK_FLIPPED_R24}, + [midgard_alu_op_fmov_rtp] = {"fmov_rtp", UNITS_ALL | QUIRK_FLIPPED_R24}, + [midgard_alu_op_fround] = {"fround", UNITS_ADD}, + [midgard_alu_op_froundeven] = {"froundeven", UNITS_ADD}, + [midgard_alu_op_ftrunc] = {"ftrunc", UNITS_ADD}, + [midgard_alu_op_ffloor] = {"ffloor", UNITS_ADD}, + [midgard_alu_op_fceil] = {"fceil", UNITS_ADD}, + [midgard_alu_op_ffma] = {"ffma", UNIT_VLUT}, + + /* Though they output a scalar, they need to run on a vector unit + * since they process vectors */ + [midgard_alu_op_fdot3] = {"fdot3", UNIT_VMUL | OP_CHANNEL_COUNT(3) | OP_COMMUTES}, + [midgard_alu_op_fdot3r] = {"fdot3r", UNIT_VMUL | OP_CHANNEL_COUNT(3) | OP_COMMUTES}, + [midgard_alu_op_fdot4] = {"fdot4", UNIT_VMUL | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + + /* Incredibly, iadd can run on vmul, etc */ + [midgard_alu_op_iadd] = {"iadd", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_iaddsat] = {"iaddsat", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_uaddsat] = {"uaddsat", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_iabsdiff] = {"iabsdiff", UNITS_ADD}, + [midgard_alu_op_uabsdiff] = {"uabsdiff", UNITS_ADD}, + [midgard_alu_op_ichoose] = {"ichoose", UNITS_ADD}, + [midgard_alu_op_isub] = {"isub", UNITS_MOST}, + [midgard_alu_op_isubsat] = {"isubsat", UNITS_MOST}, + [midgard_alu_op_usubsat] = {"usubsat", UNITS_MOST}, + [midgard_alu_op_imul] = {"imul", UNITS_MUL | OP_COMMUTES}, + [midgard_alu_op_imov] = {"imov", UNITS_MOST | QUIRK_FLIPPED_R24}, + + /* For vector comparisons, use ball etc */ + [midgard_alu_op_feq] = {"feq", UNITS_MOST | OP_TYPE_CONVERT | OP_COMMUTES}, + [midgard_alu_op_fne] = {"fne", UNITS_MOST | OP_TYPE_CONVERT | OP_COMMUTES}, + [midgard_alu_op_fle] = {"fle", UNITS_MOST | OP_TYPE_CONVERT}, + [midgard_alu_op_flt] = {"flt", UNITS_MOST | OP_TYPE_CONVERT}, + [midgard_alu_op_ieq] = {"ieq", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_ine] = {"ine", 
UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_ilt] = {"ilt", UNITS_MOST}, + [midgard_alu_op_ile] = {"ile", UNITS_MOST}, + [midgard_alu_op_ult] = {"ult", UNITS_MOST}, + [midgard_alu_op_ule] = {"ule", UNITS_MOST}, + + [midgard_alu_op_icsel] = {"icsel", UNITS_ADD}, + [midgard_alu_op_icsel_v] = {"icsel_v", UNITS_ADD}, /* Acts as bitselect() */ + [midgard_alu_op_fcsel_v] = {"fcsel_v", UNITS_ADD}, + [midgard_alu_op_fcsel] = {"fcsel", UNITS_ADD | UNIT_SMUL}, + + [midgard_alu_op_frcp] = {"frcp", UNIT_VLUT}, + [midgard_alu_op_frsqrt] = {"frsqrt", UNIT_VLUT}, + [midgard_alu_op_fsqrt] = {"fsqrt", UNIT_VLUT}, + [midgard_alu_op_fpow_pt1] = {"fpow_pt1", UNIT_VLUT}, + [midgard_alu_op_fpown_pt1] = {"fpown_pt1", UNIT_VLUT}, + [midgard_alu_op_fpowr_pt1] = {"fpowr_pt1", UNIT_VLUT}, + [midgard_alu_op_fexp2] = {"fexp2", UNIT_VLUT}, + [midgard_alu_op_flog2] = {"flog2", UNIT_VLUT}, + + [midgard_alu_op_f2i_rte] = {"f2i_rte", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2i_rtz] = {"f2i_rtz", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2i_rtn] = {"f2i_rtn", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2i_rtp] = {"f2i_rtp", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2u_rte] = {"f2i_rte", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2u_rtz] = {"f2i_rtz", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2u_rtn] = {"f2i_rtn", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2u_rtp] = {"f2i_rtp", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_i2f_rte] = {"i2f", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_i2f_rtz] = {"i2f_rtz", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_i2f_rtn] = {"i2f_rtn", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_i2f_rtp] = {"i2f_rtp", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_u2f_rte] = {"u2f", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_u2f_rtz] = {"u2f_rtz", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_u2f_rtn] = {"u2f_rtn", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_u2f_rtp] = {"u2f_rtp", UNITS_ADD | OP_TYPE_CONVERT}, + + [midgard_alu_op_fsin] = {"fsin", UNIT_VLUT}, + [midgard_alu_op_fcos] = {"fcos", UNIT_VLUT}, + + /* XXX: Test case where it's right on smul but not sadd */ + [midgard_alu_op_iand] = {"iand", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_iandnot] = {"iandnot", UNITS_MOST}, + + [midgard_alu_op_ior] = {"ior", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_iornot] = {"iornot", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_inor] = {"inor", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_ixor] = {"ixor", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_inxor] = {"inxor", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_iclz] = {"iclz", UNITS_ADD}, + [midgard_alu_op_ibitcount8] = {"ibitcount8", UNITS_ADD}, + [midgard_alu_op_inand] = {"inand", UNITS_MOST}, + [midgard_alu_op_ishl] = {"ishl", UNITS_ADD}, + [midgard_alu_op_iasr] = {"iasr", UNITS_ADD}, + [midgard_alu_op_ilsr] = {"ilsr", UNITS_ADD}, + + [midgard_alu_op_fball_eq] = {"fball_eq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_fbany_neq] = {"fbany_neq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_iball_eq] = {"iball_eq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_iball_neq] = {"iball_neq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_ibany_eq] = {"ibany_eq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_ibany_neq] = {"ibany_neq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + + /* These instructions are not yet emitted by the compiler, so + * don't speculate about 
units yet */ + [midgard_alu_op_ishladd] = {"ishladd", 0}, + + [midgard_alu_op_uball_lt] = {"uball_lt", 0}, + [midgard_alu_op_uball_lte] = {"uball_lte", 0}, + [midgard_alu_op_iball_lt] = {"iball_lt", 0}, + [midgard_alu_op_iball_lte] = {"iball_lte", 0}, + [midgard_alu_op_ubany_lt] = {"ubany_lt", 0}, + [midgard_alu_op_ubany_lte] = {"ubany_lte", 0}, + [midgard_alu_op_ibany_lt] = {"ibany_lt", 0}, + [midgard_alu_op_ibany_lte] = {"ibany_lte", 0}, + + [midgard_alu_op_freduce] = {"freduce", 0}, + [midgard_alu_op_bball_eq] = {"bball_eq", 0 | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_bbany_neq] = {"bball_eq", 0 | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_fatan2_pt1] = {"fatan2_pt1", 0}, + [midgard_alu_op_fatan_pt2] = {"fatan_pt2", 0}, +}; + +const char *load_store_opcode_names[256] = { + [midgard_op_st_cubemap_coords] = "st_cubemap_coords", + [midgard_op_ld_global_id] = "ld_global_id", + [midgard_op_ldst_perspective_division_z] = "ldst_perspective_division_z", + [midgard_op_ldst_perspective_division_w] = "ldst_perspective_division_w", + + [midgard_op_atomic_add] = "atomic_add", + [midgard_op_atomic_and] = "atomic_and", + [midgard_op_atomic_or] = "atomic_or", + [midgard_op_atomic_xor] = "atomic_xor", + [midgard_op_atomic_imin] = "atomic_imin", + [midgard_op_atomic_umin] = "atomic_umin", + [midgard_op_atomic_imax] = "atomic_imax", + [midgard_op_atomic_umax] = "atomic_umax", + [midgard_op_atomic_xchg] = "atomic_xchg", + + [midgard_op_ld_char] = "ld_char", + [midgard_op_ld_char2] = "ld_char2", + [midgard_op_ld_short] = "ld_short", + [midgard_op_ld_char4] = "ld_char4", + [midgard_op_ld_short4] = "ld_short4", + [midgard_op_ld_int4] = "ld_int4", + + [midgard_op_ld_attr_32] = "ld_attr_32", + [midgard_op_ld_attr_16] = "ld_attr_16", + [midgard_op_ld_attr_32i] = "ld_attr_32i", + [midgard_op_ld_attr_32u] = "ld_attr_32u", + + [midgard_op_ld_vary_32] = "ld_vary_32", + [midgard_op_ld_vary_16] = "ld_vary_16", + [midgard_op_ld_vary_32i] = "ld_vary_32i", + [midgard_op_ld_vary_32u] = "ld_vary_32u", + + [midgard_op_ld_color_buffer_16] = "ld_color_buffer_16", + + [midgard_op_ld_uniform_16] = "ld_uniform_16", + [midgard_op_ld_uniform_32] = "ld_uniform_32", + [midgard_op_ld_uniform_32i] = "ld_uniform_32i", + [midgard_op_ld_color_buffer_8] = "ld_color_buffer_8", + + [midgard_op_st_char] = "st_char", + [midgard_op_st_char2] = "st_char2", + [midgard_op_st_char4] = "st_char4", + [midgard_op_st_short4] = "st_short4", + [midgard_op_st_int4] = "st_int4", + + [midgard_op_st_vary_32] = "st_vary_32", + [midgard_op_st_vary_16] = "st_vary_16", + [midgard_op_st_vary_32i] = "st_vary_32i", + [midgard_op_st_vary_32u] = "st_vary_32u", + + [midgard_op_st_image_f] = "st_image_f", + [midgard_op_st_image_ui] = "st_image_ui", + [midgard_op_st_image_i] = "st_image_i", +}; diff --git a/src/panfrost/midgard/midgard_ops.h b/src/panfrost/midgard/midgard_ops.h new file mode 100644 index 00000000000..64c91a5bcac --- /dev/null +++ b/src/panfrost/midgard/midgard_ops.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2018-2019 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright 
notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "helpers.h" + +/* Forward declare */ + +extern struct mir_op_props alu_opcode_props[256]; +extern const char *load_store_opcode_names[256]; + +/* Is this opcode that of an integer (regardless of signedness)? Instruction + * names authoritatively determine types */ + +static inline bool +midgard_is_integer_op(int op) +{ + const char *name = alu_opcode_props[op].name; + + if (!name) + return false; + + return (name[0] == 'i') || (name[0] == 'u'); +} + +/* Does this opcode *write* an integer? Same as is_integer_op, unless it's a + * conversion between int<->float in which case we do the opposite */ + +static inline bool +midgard_is_integer_out_op(int op) +{ + bool is_int = midgard_is_integer_op(op); + bool is_conversion = alu_opcode_props[op].props & OP_TYPE_CONVERT; + + return is_int ^ is_conversion; +} + +/* Determines effective writemask, taking quirks and expansion into account */ + +static inline unsigned +effective_writemask(midgard_vector_alu *alu, unsigned existing_mask) +{ + /* Channel count is off-by-one to fit in two-bits (0 channel makes no + * sense) */ + + unsigned channel_count = GET_CHANNEL_COUNT(alu_opcode_props[alu->op].props); + + /* If there is a fixed channel count, construct the appropriate mask */ + + if (channel_count) + return (1 << channel_count) - 1; + + return existing_mask; +}; + + diff --git a/src/panfrost/midgard/midgard_print.c b/src/panfrost/midgard/midgard_print.c new file mode 100644 index 00000000000..6e10429ccee --- /dev/null +++ b/src/panfrost/midgard/midgard_print.c @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler.h" +#include "helpers.h" +#include "midgard_ops.h" + +/* Pretty printer for Midgard IR, for use debugging compiler-internal + * passes like register allocation. 
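/* Editor's note: a standalone model of the midgard_ops.h helpers above, to
 * make their intent concrete. The toy_props struct (a bare bool and a plain
 * channel count) stands in for the real property bitfields in helpers.h; only
 * the logic is meant to match. For f2i-style ops the name starts with 'f', so
 * midgard_is_integer_op() says false, but the conversion flag flips the
 * answer for the *output* type; and an op like fdot3 with a fixed channel
 * count of 3 always ends up with writemask 0b111, whatever mask the caller
 * already had. */

#include <assert.h>
#include <stdbool.h>

struct toy_props {
        const char *name;
        bool type_convert;       /* stand-in for OP_TYPE_CONVERT */
        unsigned channel_count;  /* stand-in for GET_CHANNEL_COUNT(props) */
};

static bool
toy_is_integer_out(const struct toy_props *p)
{
        bool is_int = (p->name[0] == 'i') || (p->name[0] == 'u');
        return is_int ^ p->type_convert;
}

static unsigned
toy_effective_writemask(const struct toy_props *p, unsigned existing_mask)
{
        if (p->channel_count)
                return (1 << p->channel_count) - 1;

        return existing_mask;
}

int
main(void)
{
        struct toy_props f2i   = { "f2i_rte", true,  0 };
        struct toy_props fdot3 = { "fdot3",   false, 3 };

        assert(toy_is_integer_out(&f2i));                    /* float in, integer out */
        assert(toy_effective_writemask(&fdot3, 0x1) == 0x7); /* forced to .xyz */
        return 0;
}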
The output superficially resembles + * Midgard assembly, with the exception that unit information and such is + * (normally) omitted, and generic indices are usually used instead of + * registers */ + +static void +mir_print_source(int source) +{ + if (source >= SSA_FIXED_MINIMUM) { + /* Specific register */ + int reg = SSA_REG_FROM_FIXED(source); + + /* TODO: Moving threshold */ + if (reg > 16 && reg < 24) + printf("u%d", 23 - reg); + else + printf("r%d", reg); + } else { + printf("%d", source); + } +} + +void +mir_print_instruction(midgard_instruction *ins) +{ + printf("\t"); + + switch (ins->type) { + case TAG_ALU_4: { + midgard_alu_op op = ins->alu.op; + const char *name = alu_opcode_props[op].name; + + if (ins->unit) + printf("%d.", ins->unit); + + printf("%s", name ? name : "??"); + break; + } + + case TAG_LOAD_STORE_4: { + midgard_load_store_op op = ins->load_store.op; + const char *name = load_store_opcode_names[op]; + + assert(name); + printf("%s", name); + break; + } + + case TAG_TEXTURE_4: { + printf("texture"); + break; + } + + default: + assert(0); + } + + ssa_args *args = &ins->ssa_args; + + printf(" %d, ", args->dest); + + mir_print_source(args->src0); + printf(", "); + + if (args->inline_constant) + printf("#%d", ins->inline_constant); + else + mir_print_source(args->src1); + + if (ins->has_constants) + printf(" <%f, %f, %f, %f>", ins->constants[0], ins->constants[1], ins->constants[2], ins->constants[3]); + + printf("\n"); +} + +/* Dumps MIR for a block or entire shader respective */ + +void +mir_print_block(midgard_block *block) +{ + printf("{\n"); + + mir_foreach_instr_in_block(block, ins) { + mir_print_instruction(ins); + } + + printf("}\n"); +} + +void +mir_print_shader(compiler_context *ctx) +{ + mir_foreach_block(ctx, block) { + mir_print_block(block); + } +} + +void +mir_print_bundle(midgard_bundle *bundle) +{ + printf("[\n"); + + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + midgard_instruction *ins = bundle->instructions[i]; + mir_print_instruction(ins); + } + + printf("]\n"); +} diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c new file mode 100644 index 00000000000..cfe091326ed --- /dev/null +++ b/src/panfrost/midgard/midgard_ra.c @@ -0,0 +1,506 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "compiler.h" +#include "midgard_ops.h" +#include "util/register_allocate.h" +#include "util/u_math.h" + +/* For work registers, we can subdivide in various ways. So we create + * classes for the various sizes and conflict accordingly, keeping in + * mind that physical registers are divided along 128-bit boundaries. + * The important part is that 128-bit boundaries are not crossed. + * + * For each 128-bit register, we can subdivide to 32-bits 10 ways + * + * vec4: xyzw + * vec3: xyz, yzw + * vec2: xy, yz, zw, + * vec1: x, y, z, w + * + * For each 64-bit register, we can subdivide similarly to 16-bit + * (TODO: half-float RA, not that we support fp16 yet) + */ + +#define WORK_STRIDE 10 + +/* Prepacked masks/swizzles for virtual register types */ +static unsigned reg_type_to_mask[WORK_STRIDE] = { + 0xF, /* xyzw */ + 0x7, 0x7 << 1, /* xyz */ + 0x3, 0x3 << 1, 0x3 << 2, /* xy */ + 0x1, 0x1 << 1, 0x1 << 2, 0x1 << 3 /* x */ +}; + +static unsigned reg_type_to_swizzle[WORK_STRIDE] = { + SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + + SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_W, COMPONENT_W), + + SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_Z, COMPONENT_W, COMPONENT_Z, COMPONENT_W), + + SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_Y, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_Z, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_W, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), +}; + +struct phys_reg { + unsigned reg; + unsigned mask; + unsigned swizzle; +}; + +/* Given the mask/swizzle of both the register and the original source, + * compose to find the actual mask/swizzle to give the hardware */ + +static unsigned +compose_writemask(unsigned mask, struct phys_reg reg) +{ + /* Note: the reg mask is guaranteed to be contiguous. So we shift + * into the X place, compose via a simple AND, and shift back */ + + unsigned shift = __builtin_ctz(reg.mask); + return ((reg.mask >> shift) & mask) << shift; +} + +static unsigned +compose_swizzle(unsigned swizzle, unsigned mask, + struct phys_reg reg, struct phys_reg dst) +{ + unsigned out = pan_compose_swizzle(swizzle, reg.swizzle); + + /* Based on the register mask, we need to adjust over. E.g if we're + * writing to yz, a base swizzle of xy__ becomes _xy_. Save the + * original first component (x). 
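/* Editor's note: a worked example of compose_writemask() defined above.
 * Suppose RA placed a vec2 node in the "yz" alias of a physical register,
 * i.e. reg.mask == 0x3 << 1 == 0x6 (see reg_type_to_mask). An instruction
 * that logically wrote .xy (mask 0x3) must then physically write .yz.
 * toy_compose_writemask is a copy of the helper, not new driver code. */

#include <assert.h>

static unsigned
toy_compose_writemask(unsigned mask, unsigned reg_mask)
{
        unsigned shift = __builtin_ctz(reg_mask);
        return ((reg_mask >> shift) & mask) << shift;
}

int
main(void)
{
        /* vec2 "yz" alias: a logical .xy write (0x3) lands on .yz (0x6) */
        assert(toy_compose_writemask(0x3, 0x6) == 0x6);

        /* vec4 "xyzw" alias: masks pass through untouched */
        assert(toy_compose_writemask(0x5, 0xF) == 0x5);
        return 0;
}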
But to prevent duplicate shifting + * (only applies to ALU -- mask param is set to xyzw out on L/S to + * prevent changes), we have to account for the shift inherent to the + * original writemask */ + + unsigned rep = out & 0x3; + unsigned shift = __builtin_ctz(dst.mask) - __builtin_ctz(mask); + unsigned shifted = out << (2*shift); + + /* ..but we fill in the gaps so it appears to replicate */ + + for (unsigned s = 0; s < shift; ++s) + shifted |= rep << (2*s); + + return shifted; +} + +/* When we're 'squeezing down' the values in the IR, we maintain a hash + * as such */ + +static unsigned +find_or_allocate_temp(compiler_context *ctx, unsigned hash) +{ + if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM)) + return hash; + + unsigned temp = (uintptr_t) _mesa_hash_table_u64_search( + ctx->hash_to_temp, hash + 1); + + if (temp) + return temp - 1; + + /* If no temp is find, allocate one */ + temp = ctx->temp_count++; + ctx->max_hash = MAX2(ctx->max_hash, hash); + + _mesa_hash_table_u64_insert(ctx->hash_to_temp, + hash + 1, (void *) ((uintptr_t) temp + 1)); + + return temp; +} + +/* Callback for register allocation selection, trivial default for now */ + +static unsigned int +midgard_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data) +{ + /* Choose the first available register to minimise register pressure */ + + for (int i = 0; i < (16 * WORK_STRIDE); ++i) { + if (BITSET_TEST(regs, i)) { + return i; + } + } + + assert(0); + return 0; +} + +/* Helper to return the default phys_reg for a given register */ + +static struct phys_reg +default_phys_reg(int reg) +{ + struct phys_reg r = { + .reg = reg, + .mask = 0xF, /* xyzw */ + .swizzle = 0xE4 /* xyzw */ + }; + + return r; +} + +/* Determine which physical register, swizzle, and mask a virtual + * register corresponds to */ + +static struct phys_reg +index_to_reg(compiler_context *ctx, struct ra_graph *g, int reg) +{ + /* Check for special cases */ + if (reg >= SSA_FIXED_MINIMUM) + return default_phys_reg(SSA_REG_FROM_FIXED(reg)); + else if ((reg < 0) || !g) + return default_phys_reg(REGISTER_UNUSED); + + /* Special cases aside, we pick the underlying register */ + int virt = ra_get_node_reg(g, reg); + + /* Divide out the register and classification */ + int phys = virt / WORK_STRIDE; + int type = virt % WORK_STRIDE; + + struct phys_reg r = { + .reg = phys, + .mask = reg_type_to_mask[type], + .swizzle = reg_type_to_swizzle[type] + }; + + /* Report that we actually use this register, and return it */ + ctx->work_registers = MAX2(ctx->work_registers, phys); + return r; +} + +/* This routine performs the actual register allocation. 
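/* Editor's note: a worked example of the divide/modulo decoding in
 * index_to_reg() above. Each physical work register contributes WORK_STRIDE
 * (10) entries to the RA register set, so a virtual register number packs
 * (physical register, alias) as phys * 10 + type. The value 23 below is made
 * up purely to show the arithmetic. */

#include <assert.h>

#define TOY_WORK_STRIDE 10

int
main(void)
{
        unsigned virt = 23;                     /* pretend ra_get_node_reg() returned this */
        unsigned phys = virt / TOY_WORK_STRIDE; /* -> r2 */
        unsigned type = virt % TOY_WORK_STRIDE; /* -> 3, the "xy" alias (mask 0x3) */

        assert(phys == 2);
        assert(type == 3);
        return 0;
}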
It should be succeeded + * by install_registers */ + +struct ra_graph * +allocate_registers(compiler_context *ctx) +{ + /* The number of vec4 work registers available depends on when the + * uniforms start, so compute that first */ + + int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0); + + int virtual_count = work_count * WORK_STRIDE; + + /* First, initialize the RA */ + struct ra_regs *regs = ra_alloc_reg_set(NULL, virtual_count, true); + + int work_vec4 = ra_alloc_reg_class(regs); + int work_vec3 = ra_alloc_reg_class(regs); + int work_vec2 = ra_alloc_reg_class(regs); + int work_vec1 = ra_alloc_reg_class(regs); + + unsigned classes[4] = { + work_vec1, + work_vec2, + work_vec3, + work_vec4 + }; + + /* Add the full set of work registers */ + for (unsigned i = 0; i < work_count; ++i) { + int base = WORK_STRIDE * i; + + /* Build a full set of subdivisions */ + ra_class_add_reg(regs, work_vec4, base); + ra_class_add_reg(regs, work_vec3, base + 1); + ra_class_add_reg(regs, work_vec3, base + 2); + ra_class_add_reg(regs, work_vec2, base + 3); + ra_class_add_reg(regs, work_vec2, base + 4); + ra_class_add_reg(regs, work_vec2, base + 5); + ra_class_add_reg(regs, work_vec1, base + 6); + ra_class_add_reg(regs, work_vec1, base + 7); + ra_class_add_reg(regs, work_vec1, base + 8); + ra_class_add_reg(regs, work_vec1, base + 9); + + for (unsigned a = 0; a < 10; ++a) { + unsigned mask1 = reg_type_to_mask[a]; + + for (unsigned b = 0; b < 10; ++b) { + unsigned mask2 = reg_type_to_mask[b]; + + if (mask1 & mask2) + ra_add_reg_conflict(regs, + base + a, base + b); + } + } + } + + /* We're done setting up */ + ra_set_finalize(regs, NULL); + + /* Transform the MIR into squeezed index form */ + mir_foreach_block(ctx, block) { + mir_foreach_instr_in_block(block, ins) { + if (ins->compact_branch) continue; + + ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest); + ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0); + + if (!ins->ssa_args.inline_constant) + ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1); + + } + } + + /* No register allocation to do with no SSA */ + + if (!ctx->temp_count) + return NULL; + + /* Let's actually do register allocation */ + int nodes = ctx->temp_count; + struct ra_graph *g = ra_alloc_interference_graph(regs, nodes); + + /* Determine minimum size needed to hold values, to indirectly + * determine class */ + + unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count); + + mir_foreach_block(ctx, block) { + mir_foreach_instr_in_block(block, ins) { + if (ins->compact_branch) continue; + if (ins->ssa_args.dest < 0) continue; + if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue; + + int class = util_logbase2(ins->mask) + 1; + + /* Use the largest class if there's ambiguity, this + * handles partial writes */ + + int dest = ins->ssa_args.dest; + found_class[dest] = MAX2(found_class[dest], class); + } + } + + for (unsigned i = 0; i < ctx->temp_count; ++i) { + unsigned class = found_class[i]; + if (!class) continue; + ra_set_node_class(g, i, classes[class - 1]); + } + + /* Determine liveness */ + + int *live_start = malloc(nodes * sizeof(int)); + int *live_end = malloc(nodes * sizeof(int)); + + /* Initialize as non-existent */ + + for (int i = 0; i < nodes; ++i) { + live_start[i] = live_end[i] = -1; + } + + int d = 0; + + mir_foreach_block(ctx, block) { + mir_foreach_instr_in_block(block, ins) { + if (ins->compact_branch) continue; + + /* Dest is < 0 for st_vary instructions, which break + * the usual SSA conventions. 
Liveness analysis doesn't + * make sense on these instructions, so skip them to + * avoid memory corruption */ + + if (ins->ssa_args.dest < 0) continue; + + if (ins->ssa_args.dest < SSA_FIXED_MINIMUM) { + /* If this destination is not yet live, it is + * now since we just wrote it */ + + int dest = ins->ssa_args.dest; + + if (live_start[dest] == -1) + live_start[dest] = d; + } + + /* Since we just used a source, the source might be + * dead now. Scan the rest of the block for + * invocations, and if there are none, the source dies + * */ + + int sources[2] = { + ins->ssa_args.src0, ins->ssa_args.src1 + }; + + for (int src = 0; src < 2; ++src) { + int s = sources[src]; + + if (s < 0) continue; + + if (s >= SSA_FIXED_MINIMUM) continue; + + if (!mir_is_live_after(ctx, block, ins, s)) { + live_end[s] = d; + } + } + + ++d; + } + } + + /* If a node still hasn't been killed, kill it now */ + + for (int i = 0; i < nodes; ++i) { + /* live_start == -1 most likely indicates a pinned output */ + + if (live_end[i] == -1) + live_end[i] = d; + } + + /* Setup interference between nodes that are live at the same time */ + + for (int i = 0; i < nodes; ++i) { + for (int j = i + 1; j < nodes; ++j) { + bool j_overlaps_i = live_start[j] < live_end[i]; + bool i_overlaps_j = live_end[j] < live_start[i]; + + if (i_overlaps_j || j_overlaps_i) + ra_add_node_interference(g, i, j); + } + } + + ra_set_select_reg_callback(g, midgard_ra_select_callback, NULL); + + if (!ra_allocate(g)) { + unreachable("Error allocating registers\n"); + } + + /* Cleanup */ + free(live_start); + free(live_end); + + return g; +} + +/* Once registers have been decided via register allocation + * (allocate_registers), we need to rewrite the MIR to use registers instead of + * indices */ + +static void +install_registers_instr( + compiler_context *ctx, + struct ra_graph *g, + midgard_instruction *ins) +{ + ssa_args args = ins->ssa_args; + + switch (ins->type) { + case TAG_ALU_4: { + int adjusted_src = args.inline_constant ? -1 : args.src1; + struct phys_reg src1 = index_to_reg(ctx, g, args.src0); + struct phys_reg src2 = index_to_reg(ctx, g, adjusted_src); + struct phys_reg dest = index_to_reg(ctx, g, args.dest); + + unsigned uncomposed_mask = ins->mask; + ins->mask = compose_writemask(uncomposed_mask, dest); + + /* Adjust the dest mask if necessary. Mostly this is a no-op + * but it matters for dot products */ + dest.mask = effective_writemask(&ins->alu, ins->mask); + + midgard_vector_alu_src mod1 = + vector_alu_from_unsigned(ins->alu.src1); + mod1.swizzle = compose_swizzle(mod1.swizzle, uncomposed_mask, src1, dest); + ins->alu.src1 = vector_alu_srco_unsigned(mod1); + + ins->registers.src1_reg = src1.reg; + + ins->registers.src2_imm = args.inline_constant; + + if (args.inline_constant) { + /* Encode inline 16-bit constant. 
See disassembler for + * where the algorithm is from */ + + ins->registers.src2_reg = ins->inline_constant >> 11; + + int lower_11 = ins->inline_constant & ((1 << 12) - 1); + uint16_t imm = ((lower_11 >> 8) & 0x7) | + ((lower_11 & 0xFF) << 3); + + ins->alu.src2 = imm << 2; + } else { + midgard_vector_alu_src mod2 = + vector_alu_from_unsigned(ins->alu.src2); + mod2.swizzle = compose_swizzle( + mod2.swizzle, uncomposed_mask, src2, dest); + ins->alu.src2 = vector_alu_srco_unsigned(mod2); + + ins->registers.src2_reg = src2.reg; + } + + ins->registers.out_reg = dest.reg; + break; + } + + case TAG_LOAD_STORE_4: { + if (OP_IS_STORE_VARY(ins->load_store.op)) { + /* TODO: use ssa_args for st_vary */ + ins->load_store.reg = 0; + } else { + /* Which physical register we read off depends on + * whether we are loading or storing -- think about the + * logical dataflow */ + + unsigned r = OP_IS_STORE(ins->load_store.op) ? + args.src0 : args.dest; + struct phys_reg src = index_to_reg(ctx, g, r); + + ins->load_store.reg = src.reg; + + ins->load_store.swizzle = compose_swizzle( + ins->load_store.swizzle, 0xF, + default_phys_reg(0), src); + + ins->mask = compose_writemask( + ins->mask, src); + } + + break; + } + + default: + break; + } +} + +void +install_registers(compiler_context *ctx, struct ra_graph *g) +{ + mir_foreach_block(ctx, block) { + mir_foreach_instr_in_block(block, ins) { + if (ins->compact_branch) continue; + install_registers_instr(ctx, g, ins); + } + } + +} diff --git a/src/panfrost/midgard/midgard_ra_pipeline.c b/src/panfrost/midgard/midgard_ra_pipeline.c new file mode 100644 index 00000000000..cd64bdf29e5 --- /dev/null +++ b/src/panfrost/midgard/midgard_ra_pipeline.c @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2019 Alyssa Rosenzweig <[email protected]> + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler.h" + +/* Creates pipeline registers. This is a prepass run before the main register + * allocator but after scheduling, once bundles are created. It works by + * iterating the scheduled IR, checking if a value is ever used after the end + * of the current bundle. If it is not, it is promoted to a bundle-specific + * pipeline register. + * + * Pipeline registers are only written from the first two stages of the + * pipeline (vmul/sadd) lasting the duration of the bundle only. There are two + * 128-bit pipeline registers available (r24/r25). 
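/* Editor's note: the inline-constant bit shuffle in install_registers_instr()
 * (midgard_ra.c, just above) is easier to follow as a round trip. toy_pack()
 * mirrors the statements in that function (it masks with (1 << 11) - 1 where
 * the original uses (1 << 12) - 1; the extra bit is discarded by the & 0x7
 * there anyway). toy_unpack() is the editor's inversion of those shifts -- the
 * authoritative decoder lives in the disassembler the comment refers to -- and
 * is included only to show that the 16-bit constant survives: bits 15..11
 * travel in src2_reg, bits 10..8 in the low three bits of the immediate, and
 * bits 7..0 in the next eight. */

#include <assert.h>
#include <stdint.h>

struct toy_encoding {
        uint8_t  src2_reg; /* 5 significant bits */
        uint16_t src2;     /* immediate field as stored in the ALU word */
};

static struct toy_encoding
toy_pack(uint16_t constant)
{
        struct toy_encoding e;

        e.src2_reg = constant >> 11;

        unsigned lower_11 = constant & ((1 << 11) - 1);
        uint16_t imm = ((lower_11 >> 8) & 0x7) | ((lower_11 & 0xFF) << 3);

        e.src2 = imm << 2;
        return e;
}

static uint16_t
toy_unpack(struct toy_encoding e)
{
        uint16_t imm = e.src2 >> 2;
        unsigned lower_11 = ((imm & 0x7) << 8) | ((imm >> 3) & 0xFF);

        return (e.src2_reg << 11) | lower_11;
}

int
main(void)
{
        assert(toy_unpack(toy_pack(0xABCD)) == 0xABCD);
        assert(toy_unpack(toy_pack(0x0042)) == 0x0042);
        return 0;
}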
The upshot is that no actual + * register allocation is needed; we can _always_ promote a value to a pipeline + * register, liveness permitting. This greatly simplifies the logic of this + * passing, negating the need for a proper RA like work registers. + */ + +static bool +mir_pipeline_ins( + compiler_context *ctx, + midgard_block *block, + midgard_bundle *bundle, unsigned i, + unsigned pipeline_count) +{ + midgard_instruction *ins = bundle->instructions[i]; + unsigned dest = ins->ssa_args.dest; + + /* Check to make sure we're legal */ + + if (ins->compact_branch) + return false; + + /* Don't allow non-SSA. Pipelining registers is theoretically possible, + * but the analysis is much hairier, so don't bother quite yet */ + if ((dest < 0) || (dest >= ctx->func->impl->ssa_alloc)) + return false; + + /* Make sure they're not lying to us. Blend shaders lie. TODO: Fix your + * bad code Alyssa */ + + if (mir_has_multiple_writes(ctx, dest)) + return false; + + /* We want to know if we live after this bundle, so check if + * we're live after the last instruction of the bundle */ + + midgard_instruction *end = bundle->instructions[ + bundle->instruction_count - 1]; + + if (mir_is_live_after(ctx, block, end, ins->ssa_args.dest)) + return false; + + /* We're only live in this bundle -- pipeline! */ + + mir_rewrite_index(ctx, dest, SSA_FIXED_REGISTER(24 + pipeline_count)); + + return true; +} + +void +mir_create_pipeline_registers(compiler_context *ctx) +{ + mir_foreach_block(ctx, block) { + mir_foreach_bundle_in_block(block, bundle) { + if (!mir_is_alu_bundle(bundle)) continue; + if (bundle->instruction_count < 2) continue; + + /* Only first 2 instructions could pipeline */ + bool succ = mir_pipeline_ins(ctx, block, bundle, 0, 0); + mir_pipeline_ins(ctx, block, bundle, 1, succ); + } + } +} diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c new file mode 100644 index 00000000000..7a3841e4d44 --- /dev/null +++ b/src/panfrost/midgard/midgard_schedule.c @@ -0,0 +1,541 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "compiler.h" +#include "midgard_ops.h" +#include "util/u_memory.h" + +/* Create a mask of accessed components from a swizzle to figure out vector + * dependencies */ + +static unsigned +swizzle_to_access_mask(unsigned swizzle) +{ + unsigned component_mask = 0; + + for (int i = 0; i < 4; ++i) { + unsigned c = (swizzle >> (2 * i)) & 3; + component_mask |= (1 << c); + } + + return component_mask; +} + +/* Does the mask cover more than a scalar? */ + +static bool +is_single_component_mask(unsigned mask) +{ + int components = 0; + + for (int c = 0; c < 8; ++c) { + if (mask & (1 << c)) + components++; + } + + return components == 1; +} + +/* Checks for an SSA data hazard between two adjacent instructions, keeping in + * mind that we are a vector architecture and we can write to different + * components simultaneously */ + +static bool +can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second) +{ + /* Each instruction reads some registers and writes to a register. See + * where the first writes */ + + /* Figure out where exactly we wrote to */ + int source = first->ssa_args.dest; + int source_mask = first->mask; + + /* As long as the second doesn't read from the first, we're okay */ + if (second->ssa_args.src0 == source) { + if (first->type == TAG_ALU_4) { + /* Figure out which components we just read from */ + + int q = second->alu.src1; + midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; + + /* Check if there are components in common, and fail if so */ + if (swizzle_to_access_mask(m->swizzle) & source_mask) + return false; + } else + return false; + + } + + if (second->ssa_args.src1 == source) + return false; + + /* Otherwise, it's safe in that regard. Another data hazard is both + * writing to the same place, of course */ + + if (second->ssa_args.dest == source) { + /* ...but only if the components overlap */ + + if (second->mask & source_mask) + return false; + } + + /* ...That's it */ + return true; +} + +static bool +midgard_has_hazard( + midgard_instruction **segment, unsigned segment_size, + midgard_instruction *ains) +{ + for (int s = 0; s < segment_size; ++s) + if (!can_run_concurrent_ssa(segment[s], ains)) + return true; + + return false; + + +} + +/* Schedules, but does not emit, a single basic block. 
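/* Editor's note: a concrete run of the hazard test above. The writemask of
 * the first instruction is compared against the access mask derived from the
 * second instruction's swizzle; only if the components overlap do the two
 * instructions have to go in separate bundles. The swizzle encoding assumed
 * here is the one swizzle_to_access_mask() itself reads: two bits per
 * destination lane naming the source component. */

#include <assert.h>

static unsigned
toy_access_mask(unsigned swizzle)
{
        unsigned mask = 0;

        for (int i = 0; i < 4; ++i)
                mask |= 1 << ((swizzle >> (2 * i)) & 3);

        return mask;
}

int
main(void)
{
        unsigned write_mask = 0x3;  /* first instruction writes .xy */

        /* Second instruction reads .zzzz (component 2 in every lane): no
         * overlap with .xy, so the pair may share a bundle. */
        assert((toy_access_mask(0xAA) & write_mask) == 0);

        /* Reading .xxxx touches component 0, which the first instruction
         * just wrote: that is an SSA data hazard within the bundle. */
        assert((toy_access_mask(0x00) & write_mask) != 0);
        return 0;
}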
After scheduling, the + * final tag and size of the block are known, which are necessary for branching + * */ + +static midgard_bundle +schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip) +{ + int instructions_emitted = 0, packed_idx = 0; + midgard_bundle bundle = { 0 }; + + uint8_t tag = ins->type; + + /* Default to the instruction's tag */ + bundle.tag = tag; + + switch (ins->type) { + case TAG_ALU_4: { + uint32_t control = 0; + size_t bytes_emitted = sizeof(control); + + /* TODO: Constant combining */ + int index = 0, last_unit = 0; + + /* Previous instructions, for the purpose of parallelism */ + midgard_instruction *segment[4] = {0}; + int segment_size = 0; + + instructions_emitted = -1; + midgard_instruction *pins = ins; + + unsigned constant_count = 0; + + for (;;) { + midgard_instruction *ains = pins; + + /* Advance instruction pointer */ + if (index) { + ains = mir_next_op(pins); + pins = ains; + } + + /* Out-of-work condition */ + if ((struct list_head *) ains == &block->instructions) + break; + + /* Ensure that the chain can continue */ + if (ains->type != TAG_ALU_4) break; + + /* If there's already something in the bundle and we + * have weird scheduler constraints, break now */ + if (ains->precede_break && index) break; + + /* According to the presentation "The ARM + * Mali-T880 Mobile GPU" from HotChips 27, + * there are two pipeline stages. Branching + * position determined experimentally. Lines + * are executed in parallel: + * + * [ VMUL ] [ SADD ] + * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ] + * + * Verify that there are no ordering dependencies here. + * + * TODO: Allow for parallelism!!! + */ + + /* Pick a unit for it if it doesn't force a particular unit */ + + int unit = ains->unit; + + if (!unit) { + int op = ains->alu.op; + int units = alu_opcode_props[op].props; + + bool scalarable = units & UNITS_SCALAR; + bool could_scalar = is_single_component_mask(ains->mask); + + /* Only 16/32-bit can run on a scalar unit */ + could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8; + could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64; + could_scalar &= ains->alu.dest_override == midgard_dest_override_none; + + if (ains->alu.reg_mode == midgard_reg_mode_16) { + /* If we're running in 16-bit mode, we + * can't have any 8-bit sources on the + * scalar unit (since the scalar unit + * doesn't understand 8-bit) */ + + midgard_vector_alu_src s1 = + vector_alu_from_unsigned(ains->alu.src1); + + could_scalar &= !s1.half; + + if (!ains->ssa_args.inline_constant) { + midgard_vector_alu_src s2 = + vector_alu_from_unsigned(ains->alu.src2); + + could_scalar &= !s2.half; + } + + } + + bool scalar = could_scalar && scalarable; + + /* TODO: Check ahead-of-time for other scalar + * hazards that otherwise get aborted out */ + + if (scalar) + assert(units & UNITS_SCALAR); + + if (!scalar) { + if (last_unit >= UNIT_VADD) { + if (units & UNIT_VLUT) + unit = UNIT_VLUT; + else + break; + } else { + if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL) + unit = UNIT_VMUL; + else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) + unit = UNIT_VADD; + else if (units & UNIT_VLUT) + unit = UNIT_VLUT; + else + break; + } + } else { + if (last_unit >= UNIT_VADD) { + if ((units & UNIT_SMUL) && !(control & UNIT_SMUL)) + unit = UNIT_SMUL; + else if (units & UNIT_VLUT) + unit = UNIT_VLUT; + else + break; + } else { + if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains)) + unit = UNIT_SADD; + else if (units & 
UNIT_SMUL) + unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL; + else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) + unit = UNIT_VADD; + else + break; + } + } + + assert(unit & units); + } + + /* Late unit check, this time for encoding (not parallelism) */ + if (unit <= last_unit) break; + + /* Clear the segment */ + if (last_unit < UNIT_VADD && unit >= UNIT_VADD) + segment_size = 0; + + if (midgard_has_hazard(segment, segment_size, ains)) + break; + + /* We're good to go -- emit the instruction */ + ains->unit = unit; + + segment[segment_size++] = ains; + + /* We try to reuse constants if possible, by adjusting + * the swizzle */ + + if (ains->has_blend_constant) { + /* Everything conflicts with the blend constant */ + if (bundle.has_embedded_constants) + break; + + bundle.has_blend_constant = 1; + bundle.has_embedded_constants = 1; + } else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) { + /* TODO: DRY with the analysis pass */ + + if (bundle.has_blend_constant) + break; + + if (constant_count) + break; + + /* TODO: Fix packing XXX */ + uint16_t *bundles = (uint16_t *) bundle.constants; + uint32_t *constants = (uint32_t *) ains->constants; + + /* Copy them wholesale */ + for (unsigned i = 0; i < 4; ++i) + bundles[i] = constants[i]; + + bundle.has_embedded_constants = true; + constant_count = 4; + } else if (ains->has_constants) { + /* By definition, blend constants conflict with + * everything, so if there are already + * constants we break the bundle *now* */ + + if (bundle.has_blend_constant) + break; + + /* For anything but blend constants, we can do + * proper analysis, however */ + + /* TODO: Mask by which are used */ + uint32_t *constants = (uint32_t *) ains->constants; + uint32_t *bundles = (uint32_t *) bundle.constants; + + uint32_t indices[4] = { 0 }; + bool break_bundle = false; + + for (unsigned i = 0; i < 4; ++i) { + uint32_t cons = constants[i]; + bool constant_found = false; + + /* Search for the constant */ + for (unsigned j = 0; j < constant_count; ++j) { + if (bundles[j] != cons) + continue; + + /* We found it, reuse */ + indices[i] = j; + constant_found = true; + break; + } + + if (constant_found) + continue; + + /* We didn't find it, so allocate it */ + unsigned idx = constant_count++; + + if (idx >= 4) { + /* Uh-oh, out of space */ + break_bundle = true; + break; + } + + /* We have space, copy it in! */ + bundles[idx] = cons; + indices[i] = idx; + } + + if (break_bundle) + break; + + /* Cool, we have it in. So use indices as a + * swizzle */ + + unsigned swizzle = SWIZZLE_FROM_ARRAY(indices); + unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + + if (ains->ssa_args.src0 == r_constant) + ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle); + + if (ains->ssa_args.src1 == r_constant) + ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle); + + bundle.has_embedded_constants = true; + } + + if (ains->unit & UNITS_ANY_VECTOR) { + bytes_emitted += sizeof(midgard_reg_info); + bytes_emitted += sizeof(midgard_vector_alu); + } else if (ains->compact_branch) { + /* All of r0 has to be written out along with + * the branch writeout */ + + if (ains->writeout) { + /* The rules for when "bare" writeout + * is safe are when all components are + * r0 are written out in the final + * bundle, earlier than VLUT, where any + * register dependencies of r0 are from + * an earlier bundle. We can't verify + * this before RA, so we don't try. 
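/* Editor's note: a standalone model of the constant-merging loop a little
 * further up in schedule_bundle(). Several ALU instructions in one bundle
 * share a single 16-byte constant slot; when a new instruction brings its own
 * vec4 of constants, each lane is either matched against a value already in
 * the slot or appended, and the resulting per-lane indices become a swizzle
 * on the constant register. The names and structs here are illustrative, not
 * driver API. */

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Try to merge four lane constants into the bundle's slot. Returns false if
 * the slot would overflow (the scheduler then breaks the bundle instead). */
static bool
toy_merge_constants(uint32_t *slot, unsigned *count,
                    const uint32_t *wanted, unsigned *indices)
{
        for (unsigned i = 0; i < 4; ++i) {
                bool found = false;

                for (unsigned j = 0; j < *count; ++j) {
                        if (slot[j] == wanted[i]) {
                                indices[i] = j;
                                found = true;
                                break;
                        }
                }

                if (found)
                        continue;

                if (*count >= 4)
                        return false; /* out of space */

                indices[i] = *count;
                slot[(*count)++] = wanted[i];
        }

        return true;
}

int
main(void)
{
        uint32_t slot[4] = { 0x3f000000 /* 0.5f */, 0x40000000 /* 2.0f */ };
        unsigned count = 2;

        const uint32_t wanted[4] = { 0x40000000, 0x40000000,
                                     0x3f000000, 0x3f800000 /* 1.0f */ };
        unsigned indices[4];

        assert(toy_merge_constants(slot, &count, wanted, indices));
        assert(indices[0] == 1 && indices[1] == 1 &&
               indices[2] == 0 && indices[3] == 2);
        /* indices {1,1,0,2} then become a .yyxz swizzle on the constant register */
        return 0;
}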
*/ + + if (index != 0) + break; + + /* Inject a move */ + midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0)); + ins.unit = UNIT_VMUL; + control |= ins.unit; + + /* TODO don't leak */ + midgard_instruction *move = + mem_dup(&ins, sizeof(midgard_instruction)); + bytes_emitted += sizeof(midgard_reg_info); + bytes_emitted += sizeof(midgard_vector_alu); + bundle.instructions[packed_idx++] = move; + } + + if (ains->unit == ALU_ENAB_BRANCH) { + bytes_emitted += sizeof(midgard_branch_extended); + } else { + bytes_emitted += sizeof(ains->br_compact); + } + } else { + bytes_emitted += sizeof(midgard_reg_info); + bytes_emitted += sizeof(midgard_scalar_alu); + } + + /* Defer marking until after writing to allow for break */ + control |= ains->unit; + last_unit = ains->unit; + ++instructions_emitted; + ++index; + } + + int padding = 0; + + /* Pad ALU op to nearest word */ + + if (bytes_emitted & 15) { + padding = 16 - (bytes_emitted & 15); + bytes_emitted += padding; + } + + /* Constants must always be quadwords */ + if (bundle.has_embedded_constants) + bytes_emitted += 16; + + /* Size ALU instruction for tag */ + bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1; + bundle.padding = padding; + bundle.control = bundle.tag | control; + + break; + } + + case TAG_LOAD_STORE_4: { + /* Load store instructions have two words at once. If + * we only have one queued up, we need to NOP pad. + * Otherwise, we store both in succession to save space + * and cycles -- letting them go in parallel -- skip + * the next. The usefulness of this optimisation is + * greatly dependent on the quality of the instruction + * scheduler. + */ + + midgard_instruction *next_op = mir_next_op(ins); + + if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) { + /* TODO: Concurrency check */ + instructions_emitted++; + } + + break; + } + + case TAG_TEXTURE_4: { + /* Which tag we use depends on the shader stage */ + bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT; + bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX; + break; + } + + default: + unreachable("Unknown tag"); + break; + } + + /* Copy the instructions into the bundle */ + bundle.instruction_count = instructions_emitted + 1 + packed_idx; + + midgard_instruction *uins = ins; + for (; packed_idx < bundle.instruction_count; ++packed_idx) { + bundle.instructions[packed_idx] = uins; + uins = mir_next_op(uins); + } + + *skip = instructions_emitted; + + return bundle; +} + +/* Schedule a single block by iterating its instruction to create bundles. + * While we go, tally about the bundle sizes to compute the block size. */ + +static void +schedule_block(compiler_context *ctx, midgard_block *block) +{ + util_dynarray_init(&block->bundles, NULL); + + block->quadword_count = 0; + + mir_foreach_instr_in_block(block, ins) { + int skip; + midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip); + util_dynarray_append(&block->bundles, midgard_bundle, bundle); + + if (bundle.has_blend_constant) { + /* TODO: Multiblock? 
*/ + int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1; + ctx->blend_constant_offset = quadwords_within_block * 0x10; + } + + while(skip--) + ins = mir_next_op(ins); + + block->quadword_count += quadword_size(bundle.tag); + } + + block->is_scheduled = true; +} + +void +schedule_program(compiler_context *ctx) +{ + /* We run RA prior to scheduling */ + + mir_foreach_block(ctx, block) { + schedule_block(ctx, block); + } + + /* Pipeline registers creation is a prepass before RA */ + mir_create_pipeline_registers(ctx); + + struct ra_graph *g = allocate_registers(ctx); + install_registers(ctx, g); +} diff --git a/src/panfrost/midgard/mir.c b/src/panfrost/midgard/mir.c new file mode 100644 index 00000000000..6adc1350c0a --- /dev/null +++ b/src/panfrost/midgard/mir.c @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler.h" + +void +mir_rewrite_index_src(compiler_context *ctx, unsigned old, unsigned new) +{ + mir_foreach_instr_global(ctx, ins) { + if (ins->ssa_args.src0 == old) + ins->ssa_args.src0 = new; + + if (ins->ssa_args.src1 == old && + !ins->ssa_args.inline_constant) + ins->ssa_args.src1 = new; + } +} + +void +mir_rewrite_index_dst(compiler_context *ctx, unsigned old, unsigned new) +{ + mir_foreach_instr_global(ctx, ins) { + if (ins->ssa_args.dest == old) + ins->ssa_args.dest = new; + } +} + +void +mir_rewrite_index(compiler_context *ctx, unsigned old, unsigned new) +{ + mir_rewrite_index_src(ctx, old, new); + mir_rewrite_index_dst(ctx, old, new); +} |
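/* Editor's note: a self-contained illustration of the mir.c rewrite helpers
 * that close this series. The toy_ins struct stands in for
 * midgard_instruction; the point being demonstrated is the inline_constant
 * guard in mir_rewrite_index_src(): when src1 holds a 16-bit immediate rather
 * than an SSA index, it must not be renamed, or an unrelated constant that
 * happens to equal the old index would be clobbered. */

#include <assert.h>
#include <stdbool.h>

struct toy_ins {
        int dest, src0, src1;
        bool inline_constant;
};

static void
toy_rewrite_index(struct toy_ins *ins, unsigned count, int old, int new)
{
        for (unsigned i = 0; i < count; ++i) {
                if (ins[i].dest == old)
                        ins[i].dest = new;
                if (ins[i].src0 == old)
                        ins[i].src0 = new;
                if (ins[i].src1 == old && !ins[i].inline_constant)
                        ins[i].src1 = new;
        }
}

int
main(void)
{
        struct toy_ins prog[2] = {
                { .dest = 5, .src0 = 2, .src1 = 3, .inline_constant = false },
                { .dest = 6, .src0 = 5, .src1 = 5, .inline_constant = true  }, /* src1 is the constant 5 */
        };

        /* Promote value 5 to, say, pipeline register index 24 */
        toy_rewrite_index(prog, 2, 5, 24);

        assert(prog[0].dest == 24);
        assert(prog[1].src0 == 24);
        assert(prog[1].src1 == 5); /* the immediate is left alone */
        return 0;
}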