Diffstat (limited to 'src/panfrost/midgard/midgard_ra.c')
-rw-r--r-- | src/panfrost/midgard/midgard_ra.c | 506
1 files changed, 506 insertions, 0 deletions
diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c
new file mode 100644
index 00000000000..cfe091326ed
--- /dev/null
+++ b/src/panfrost/midgard/midgard_ra.c
@@ -0,0 +1,506 @@
+/*
+ * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]>
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "compiler.h"
+#include "midgard_ops.h"
+#include "util/register_allocate.h"
+#include "util/u_math.h"
+
+/* For work registers, we can subdivide in various ways. So we create
+ * classes for the various sizes and conflict accordingly, keeping in
+ * mind that physical registers are divided along 128-bit boundaries.
+ * The important part is that 128-bit boundaries are not crossed.
+ *
+ * For each 128-bit register, we can subdivide to 32-bits 10 ways
+ *
+ * vec4: xyzw
+ * vec3: xyz, yzw
+ * vec2: xy, yz, zw,
+ * vec1: x, y, z, w
+ *
+ * For each 64-bit register, we can subdivide similarly to 16-bit
+ * (TODO: half-float RA, not that we support fp16 yet)
+ */
+
+#define WORK_STRIDE 10
+
+/* Prepacked masks/swizzles for virtual register types */
+static unsigned reg_type_to_mask[WORK_STRIDE] = {
+        0xF,                                    /* xyzw */
+        0x7, 0x7 << 1,                          /* xyz */
+        0x3, 0x3 << 1, 0x3 << 2,                /* xy */
+        0x1, 0x1 << 1, 0x1 << 2, 0x1 << 3       /* x */
+};
+
+static unsigned reg_type_to_swizzle[WORK_STRIDE] = {
+        SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+
+        SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+        SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_W, COMPONENT_W),
+
+        SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+        SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_Z, COMPONENT_W),
+        SWIZZLE(COMPONENT_Z, COMPONENT_W, COMPONENT_Z, COMPONENT_W),
+
+        SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+        SWIZZLE(COMPONENT_Y, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+        SWIZZLE(COMPONENT_Z, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+        SWIZZLE(COMPONENT_W, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+};
+
+struct phys_reg {
+        unsigned reg;
+        unsigned mask;
+        unsigned swizzle;
+};
+
+/* Given the mask/swizzle of both the register and the original source,
+ * compose to find the actual mask/swizzle to give the hardware */
+
+static unsigned
+compose_writemask(unsigned mask, struct phys_reg reg)
+{
+        /* Note: the reg mask is guaranteed to be contiguous.
+         * So we shift into the X place, compose via a simple AND, and
+         * shift back */
+
+        unsigned shift = __builtin_ctz(reg.mask);
+        return ((reg.mask >> shift) & mask) << shift;
+}
+
+static unsigned
+compose_swizzle(unsigned swizzle, unsigned mask,
+                struct phys_reg reg, struct phys_reg dst)
+{
+        unsigned out = pan_compose_swizzle(swizzle, reg.swizzle);
+
+        /* Based on the register mask, we need to adjust over. E.g. if we're
+         * writing to yz, a base swizzle of xy__ becomes _xy_. Save the
+         * original first component (x). But to prevent duplicate shifting
+         * (only applies to ALU -- the mask param is set to xyzw out on L/S to
+         * prevent changes), we have to account for the shift inherent to the
+         * original writemask */
+
+        unsigned rep = out & 0x3;
+        unsigned shift = __builtin_ctz(dst.mask) - __builtin_ctz(mask);
+        unsigned shifted = out << (2*shift);
+
+        /* ..but we fill in the gaps so it appears to replicate */
+
+        for (unsigned s = 0; s < shift; ++s)
+                shifted |= rep << (2*s);
+
+        return shifted;
+}
+
+/* When we're 'squeezing down' the values in the IR, we maintain a hash
+ * as such */
+
+static unsigned
+find_or_allocate_temp(compiler_context *ctx, unsigned hash)
+{
+        if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
+                return hash;
+
+        unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
+                                ctx->hash_to_temp, hash + 1);
+
+        if (temp)
+                return temp - 1;
+
+        /* If no temp is found, allocate one */
+        temp = ctx->temp_count++;
+        ctx->max_hash = MAX2(ctx->max_hash, hash);
+
+        _mesa_hash_table_u64_insert(ctx->hash_to_temp,
+                                    hash + 1, (void *) ((uintptr_t) temp + 1));
+
+        return temp;
+}
+
+/* Callback for register allocation selection, trivial default for now */
+
+static unsigned int
+midgard_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
+{
+        /* Choose the first available register to minimise register pressure */
+
+        for (int i = 0; i < (16 * WORK_STRIDE); ++i) {
+                if (BITSET_TEST(regs, i)) {
+                        return i;
+                }
+        }
+
+        assert(0);
+        return 0;
+}
+
+/* Helper to return the default phys_reg for a given register */
+
+static struct phys_reg
+default_phys_reg(int reg)
+{
+        struct phys_reg r = {
+                .reg = reg,
+                .mask = 0xF,      /* xyzw */
+                .swizzle = 0xE4   /* xyzw */
+        };
+
+        return r;
+}
+
+/* Determine which physical register, swizzle, and mask a virtual
+ * register corresponds to */
+
+static struct phys_reg
+index_to_reg(compiler_context *ctx, struct ra_graph *g, int reg)
+{
+        /* Check for special cases */
+        if (reg >= SSA_FIXED_MINIMUM)
+                return default_phys_reg(SSA_REG_FROM_FIXED(reg));
+        else if ((reg < 0) || !g)
+                return default_phys_reg(REGISTER_UNUSED);
+
+        /* Special cases aside, we pick the underlying register */
+        int virt = ra_get_node_reg(g, reg);
+
+        /* Divide out the register and classification */
+        int phys = virt / WORK_STRIDE;
+        int type = virt % WORK_STRIDE;
+
+        struct phys_reg r = {
+                .reg = phys,
+                .mask = reg_type_to_mask[type],
+                .swizzle = reg_type_to_swizzle[type]
+        };
+
+        /* Report that we actually use this register, and return it */
+        ctx->work_registers = MAX2(ctx->work_registers, phys);
+        return r;
+}
+
+/* This routine performs the actual register allocation.
+ * It should be succeeded by install_registers */
+
+struct ra_graph *
+allocate_registers(compiler_context *ctx)
+{
+        /* The number of vec4 work registers available depends on when the
+         * uniforms start, so compute that first */
+
+        int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0);
+
+        int virtual_count = work_count * WORK_STRIDE;
+
+        /* First, initialize the RA */
+        struct ra_regs *regs = ra_alloc_reg_set(NULL, virtual_count, true);
+
+        int work_vec4 = ra_alloc_reg_class(regs);
+        int work_vec3 = ra_alloc_reg_class(regs);
+        int work_vec2 = ra_alloc_reg_class(regs);
+        int work_vec1 = ra_alloc_reg_class(regs);
+
+        unsigned classes[4] = {
+                work_vec1,
+                work_vec2,
+                work_vec3,
+                work_vec4
+        };
+
+        /* Add the full set of work registers */
+        for (unsigned i = 0; i < work_count; ++i) {
+                int base = WORK_STRIDE * i;
+
+                /* Build a full set of subdivisions */
+                ra_class_add_reg(regs, work_vec4, base);
+                ra_class_add_reg(regs, work_vec3, base + 1);
+                ra_class_add_reg(regs, work_vec3, base + 2);
+                ra_class_add_reg(regs, work_vec2, base + 3);
+                ra_class_add_reg(regs, work_vec2, base + 4);
+                ra_class_add_reg(regs, work_vec2, base + 5);
+                ra_class_add_reg(regs, work_vec1, base + 6);
+                ra_class_add_reg(regs, work_vec1, base + 7);
+                ra_class_add_reg(regs, work_vec1, base + 8);
+                ra_class_add_reg(regs, work_vec1, base + 9);
+
+                for (unsigned a = 0; a < 10; ++a) {
+                        unsigned mask1 = reg_type_to_mask[a];
+
+                        for (unsigned b = 0; b < 10; ++b) {
+                                unsigned mask2 = reg_type_to_mask[b];
+
+                                if (mask1 & mask2)
+                                        ra_add_reg_conflict(regs,
+                                                            base + a, base + b);
+                        }
+                }
+        }
+
+        /* We're done setting up */
+        ra_set_finalize(regs, NULL);
+
+        /* Transform the MIR into squeezed index form */
+        mir_foreach_block(ctx, block) {
+                mir_foreach_instr_in_block(block, ins) {
+                        if (ins->compact_branch) continue;
+
+                        ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
+                        ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
+
+                        if (!ins->ssa_args.inline_constant)
+                                ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
+
+                }
+        }
+
+        /* No register allocation to do with no SSA */
+
+        if (!ctx->temp_count)
+                return NULL;
+
+        /* Let's actually do register allocation */
+        int nodes = ctx->temp_count;
+        struct ra_graph *g = ra_alloc_interference_graph(regs, nodes);
+
+        /* Determine minimum size needed to hold values, to indirectly
+         * determine class */
+
+        unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count);
+
+        mir_foreach_block(ctx, block) {
+                mir_foreach_instr_in_block(block, ins) {
+                        if (ins->compact_branch) continue;
+                        if (ins->ssa_args.dest < 0) continue;
+                        if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue;
+
+                        int class = util_logbase2(ins->mask) + 1;
+
+                        /* Use the largest class if there's ambiguity, this
+                         * handles partial writes */
+
+                        int dest = ins->ssa_args.dest;
+                        found_class[dest] = MAX2(found_class[dest], class);
+                }
+        }
+
+        for (unsigned i = 0; i < ctx->temp_count; ++i) {
+                unsigned class = found_class[i];
+                if (!class) continue;
+                ra_set_node_class(g, i, classes[class - 1]);
+        }
+
+        /* Determine liveness */
+
+        int *live_start = malloc(nodes * sizeof(int));
+        int *live_end = malloc(nodes * sizeof(int));
+
+        /* Initialize as non-existent */
+
+        for (int i = 0; i < nodes; ++i) {
+                live_start[i] = live_end[i] = -1;
+        }
+
+        int d = 0;
+
+        mir_foreach_block(ctx, block) {
+                mir_foreach_instr_in_block(block, ins) {
+                        if (ins->compact_branch) continue;
+
+                        /* Dest is < 0 for st_vary instructions, which break
+                         * the usual SSA conventions.
+                         * Liveness analysis doesn't make sense on these
+                         * instructions, so skip them to avoid memory
+                         * corruption */
+
+                        if (ins->ssa_args.dest < 0) continue;
+
+                        if (ins->ssa_args.dest < SSA_FIXED_MINIMUM) {
+                                /* If this destination is not yet live, it is
+                                 * now since we just wrote it */
+
+                                int dest = ins->ssa_args.dest;
+
+                                if (live_start[dest] == -1)
+                                        live_start[dest] = d;
+                        }
+
+                        /* Since we just used a source, the source might be
+                         * dead now. Scan the rest of the block for
+                         * invocations, and if there are none, the source
+                         * dies */
+
+                        int sources[2] = {
+                                ins->ssa_args.src0, ins->ssa_args.src1
+                        };
+
+                        for (int src = 0; src < 2; ++src) {
+                                int s = sources[src];
+
+                                if (s < 0) continue;
+
+                                if (s >= SSA_FIXED_MINIMUM) continue;
+
+                                if (!mir_is_live_after(ctx, block, ins, s)) {
+                                        live_end[s] = d;
+                                }
+                        }
+
+                        ++d;
+                }
+        }
+
+        /* If a node still hasn't been killed, kill it now */
+
+        for (int i = 0; i < nodes; ++i) {
+                /* live_start == -1 most likely indicates a pinned output */
+
+                if (live_end[i] == -1)
+                        live_end[i] = d;
+        }
+
+        /* Setup interference between nodes that are live at the same time */
+
+        for (int i = 0; i < nodes; ++i) {
+                for (int j = i + 1; j < nodes; ++j) {
+                        bool j_overlaps_i = live_start[j] < live_end[i];
+                        bool i_overlaps_j = live_end[j] < live_start[i];
+
+                        if (i_overlaps_j || j_overlaps_i)
+                                ra_add_node_interference(g, i, j);
+                }
+        }
+
+        ra_set_select_reg_callback(g, midgard_ra_select_callback, NULL);
+
+        if (!ra_allocate(g)) {
+                unreachable("Error allocating registers\n");
+        }
+
+        /* Cleanup */
+        free(live_start);
+        free(live_end);
+
+        return g;
+}
+
+/* Once registers have been decided via register allocation
+ * (allocate_registers), we need to rewrite the MIR to use registers instead of
+ * indices */
+
+static void
+install_registers_instr(
+        compiler_context *ctx,
+        struct ra_graph *g,
+        midgard_instruction *ins)
+{
+        ssa_args args = ins->ssa_args;
+
+        switch (ins->type) {
+        case TAG_ALU_4: {
+                int adjusted_src = args.inline_constant ? -1 : args.src1;
+                struct phys_reg src1 = index_to_reg(ctx, g, args.src0);
+                struct phys_reg src2 = index_to_reg(ctx, g, adjusted_src);
+                struct phys_reg dest = index_to_reg(ctx, g, args.dest);
+
+                unsigned uncomposed_mask = ins->mask;
+                ins->mask = compose_writemask(uncomposed_mask, dest);
+
+                /* Adjust the dest mask if necessary. Mostly this is a no-op
+                 * but it matters for dot products */
+                dest.mask = effective_writemask(&ins->alu, ins->mask);
+
+                midgard_vector_alu_src mod1 =
+                        vector_alu_from_unsigned(ins->alu.src1);
+                mod1.swizzle = compose_swizzle(mod1.swizzle, uncomposed_mask, src1, dest);
+                ins->alu.src1 = vector_alu_srco_unsigned(mod1);
+
+                ins->registers.src1_reg = src1.reg;
+
+                ins->registers.src2_imm = args.inline_constant;
+
+                if (args.inline_constant) {
+                        /* Encode inline 16-bit constant.
+                         * See disassembler for where the algorithm is from */
+
+                        ins->registers.src2_reg = ins->inline_constant >> 11;
+
+                        int lower_11 = ins->inline_constant & ((1 << 12) - 1);
+                        uint16_t imm = ((lower_11 >> 8) & 0x7) |
+                                       ((lower_11 & 0xFF) << 3);
+
+                        ins->alu.src2 = imm << 2;
+                } else {
+                        midgard_vector_alu_src mod2 =
+                                vector_alu_from_unsigned(ins->alu.src2);
+                        mod2.swizzle = compose_swizzle(
+                                mod2.swizzle, uncomposed_mask, src2, dest);
+                        ins->alu.src2 = vector_alu_srco_unsigned(mod2);
+
+                        ins->registers.src2_reg = src2.reg;
+                }
+
+                ins->registers.out_reg = dest.reg;
+                break;
+        }
+
+        case TAG_LOAD_STORE_4: {
+                if (OP_IS_STORE_VARY(ins->load_store.op)) {
+                        /* TODO: use ssa_args for st_vary */
+                        ins->load_store.reg = 0;
+                } else {
+                        /* Which physical register we read off depends on
+                         * whether we are loading or storing -- think about the
+                         * logical dataflow */
+
+                        unsigned r = OP_IS_STORE(ins->load_store.op) ?
+                                args.src0 : args.dest;
+                        struct phys_reg src = index_to_reg(ctx, g, r);
+
+                        ins->load_store.reg = src.reg;
+
+                        ins->load_store.swizzle = compose_swizzle(
+                                ins->load_store.swizzle, 0xF,
+                                default_phys_reg(0), src);
+
+                        ins->mask = compose_writemask(
+                                ins->mask, src);
+                }
+
+                break;
+        }
+
+        default:
+                break;
+        }
+}
+
+void
+install_registers(compiler_context *ctx, struct ra_graph *g)
+{
+        mir_foreach_block(ctx, block) {
+                mir_foreach_instr_in_block(block, ins) {
+                        if (ins->compact_branch) continue;
+                        install_registers_instr(ctx, g, ins);
+                }
+        }
+}
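
Editor's note: to make the subdivision-and-conflict scheme in allocate_registers and the compose_writemask helper above easier to follow, here is a minimal standalone C sketch. It is not part of the commit and has no Mesa or util/register_allocate dependencies; the names type_to_mask, compose_writemask_sketch, and the printed matrix are local to the example. It prints the 10x10 conflict matrix implied by reg_type_to_mask (two subdivisions of the same 128-bit work register conflict exactly when their component masks overlap, which is what the nested ra_add_reg_conflict loop encodes) and shows how an IR writemask is shifted into the slice a value was assigned.

#include <stdio.h>

#define WORK_STRIDE 10

/* Same per-subdivision component masks as reg_type_to_mask in the patch:
 * one vec4, two vec3 slices, three vec2 slices, four scalar slices */
static const unsigned type_to_mask[WORK_STRIDE] = {
        0xF,
        0x7, 0x7 << 1,
        0x3, 0x3 << 1, 0x3 << 2,
        0x1, 0x1 << 1, 0x1 << 2, 0x1 << 3
};

/* Local copy of the compose_writemask idea: shift the IR mask into the
 * contiguous slice occupied by the assigned physical register */
static unsigned
compose_writemask_sketch(unsigned mask, unsigned reg_mask)
{
        unsigned shift = __builtin_ctz(reg_mask);
        return ((reg_mask >> shift) & mask) << shift;
}

int
main(void)
{
        /* Conflict matrix: subdivisions of one 128-bit register conflict
         * iff their component masks overlap */
        for (unsigned a = 0; a < WORK_STRIDE; ++a) {
                for (unsigned b = 0; b < WORK_STRIDE; ++b)
                        putchar((type_to_mask[a] & type_to_mask[b]) ? 'X' : '.');
                putchar('\n');
        }

        /* A .xy IR write (0x3) whose value landed in the yz slice (0x6)
         * becomes a .yz hardware writemask (0x6) */
        printf("composed mask: 0x%x\n",
               compose_writemask_sketch(0x3, 0x3 << 1));
        return 0;
}

Built with any C99 compiler, the matrix shows for instance that the vec4 slot conflicts with all ten subdivisions while the x and zw slices do not conflict at all, and the final line prints 0x6, i.e. a .xy write retargeted to the yz slice becomes a .yz hardware writemask.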