-rw-r--r--  src/gallium/drivers/panfrost/meson.build                 |   2
-rw-r--r--  src/gallium/drivers/panfrost/midgard/compiler.h          |  16
-rw-r--r--  src/gallium/drivers/panfrost/midgard/helpers.h           |  20
-rw-r--r--  src/gallium/drivers/panfrost/midgard/midgard_compile.c   | 705
-rw-r--r--  src/gallium/drivers/panfrost/midgard/midgard_emit.c      | 229
-rw-r--r--  src/gallium/drivers/panfrost/midgard/midgard_schedule.c  | 479
6 files changed, 744 insertions(+), 707 deletions(-)
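Note: the sketch below is not part of the patch. It is a standalone illustration of the quadword accounting that quadword_size() (moved into helpers.h) and schedule_block() rely on, and of how the blend constant offset is derived. Only TAG_ALU_12/TAG_ALU_16 values are visible in the hunks; the other tag values are assumptions taken from the driver's headers.

#include <stdio.h>

#define TAG_TEXTURE_4    0x3  /* assumed, not shown in the hunks */
#define TAG_LOAD_STORE_4 0x5  /* assumed, not shown in the hunks */
#define TAG_ALU_4        0x8  /* assumed, not shown in the hunks */
#define TAG_ALU_8        0x9  /* assumed, not shown in the hunks */
#define TAG_ALU_12       0xA
#define TAG_ALU_16       0xB

/* Mirrors quadword_size() from helpers.h */
static int
quadword_size(int tag)
{
        switch (tag) {
        case TAG_ALU_4:
        case TAG_LOAD_STORE_4:
        case TAG_TEXTURE_4:
                return 1;
        case TAG_ALU_8:
                return 2;
        case TAG_ALU_12:
                return 3;
        case TAG_ALU_16:
                return 4;
        default:
                return 0; /* the patch uses unreachable() here */
        }
}

int
main(void)
{
        /* Hypothetical bundle tags for one scheduled block; the third
         * bundle is imagined to carry the blend constant. */
        int tags[] = { TAG_LOAD_STORE_4, TAG_ALU_8, TAG_ALU_12, TAG_TEXTURE_4 };
        int blend_bundle = 2;

        int quadword_count = 0;
        int blend_constant_offset = -1;

        for (int i = 0; i < 4; ++i) {
                if (i == blend_bundle) {
                        /* Constants occupy the last quadword of their
                         * bundle; offset is in bytes, as in schedule_block() */
                        int qw = quadword_count + quadword_size(tags[i]) - 1;
                        blend_constant_offset = qw * 0x10;
                }

                quadword_count += quadword_size(tags[i]);
        }

        printf("block quadwords: %d, blend constant at byte %d\n",
               quadword_count, blend_constant_offset);
        return 0;
}

Running it prints the block's quadword total and the byte offset at which the embedded blend constant would later be patched, matching the quadwords_within_block * 0x10 computation in the patch.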
diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build index fb92954854a..5adf24282c4 100644 --- a/src/gallium/drivers/panfrost/meson.build +++ b/src/gallium/drivers/panfrost/meson.build @@ -28,6 +28,8 @@ files_panfrost = files( 'midgard/midgard_compile.c', 'midgard/midgard_print.c', + 'midgard/midgard_schedule.c', + 'midgard/midgard_emit.c', 'midgard/midgard_ra.c', 'midgard/midgard_liveness.c', 'midgard/midgard_ops.c', diff --git a/src/gallium/drivers/panfrost/midgard/compiler.h b/src/gallium/drivers/panfrost/midgard/compiler.h index d3d64d37c49..96760d964b0 100644 --- a/src/gallium/drivers/panfrost/midgard/compiler.h +++ b/src/gallium/drivers/panfrost/midgard/compiler.h @@ -169,7 +169,7 @@ typedef struct midgard_bundle { /* Instructions contained by the bundle */ int instruction_count; - midgard_instruction instructions[5]; + midgard_instruction *instructions[5]; /* Bundle-wide ALU configuration */ int padding; @@ -177,13 +177,6 @@ typedef struct midgard_bundle { bool has_embedded_constants; float constants[4]; bool has_blend_constant; - - uint16_t register_words[8]; - int register_words_count; - - uint64_t body_words[8]; - size_t body_size[8]; - int body_words_count; } midgard_bundle; typedef struct compiler_context { @@ -422,4 +415,11 @@ struct ra_graph* allocate_registers(compiler_context *ctx); void install_registers(compiler_context *ctx, struct ra_graph *g); bool mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src); +/* Final emission */ + +void emit_binary_bundle( + compiler_context *ctx, + midgard_bundle *bundle, + struct util_dynarray *emission, + int next_tag); #endif diff --git a/src/gallium/drivers/panfrost/midgard/helpers.h b/src/gallium/drivers/panfrost/midgard/helpers.h index cf3a63e7587..9adc5b35195 100644 --- a/src/gallium/drivers/panfrost/midgard/helpers.h +++ b/src/gallium/drivers/panfrost/midgard/helpers.h @@ -22,6 +22,7 @@ #ifndef __MDG_HELPERS_H #define __MDG_HELPERS_H +#include "util/macros.h" #include <string.h> #define OP_IS_STORE_VARY(op) (\ @@ -92,6 +93,25 @@ #define TAG_ALU_12 0xA #define TAG_ALU_16 0xB +static inline int +quadword_size(int tag) +{ + switch (tag) { + case TAG_ALU_4: + case TAG_LOAD_STORE_4: + case TAG_TEXTURE_4: + return 1; + case TAG_ALU_8: + return 2; + case TAG_ALU_12: + return 3; + case TAG_ALU_16: + return 4; + default: + unreachable("Unknown tag"); + } +} + /* Special register aliases */ #define MAX_WORK_REGISTERS 16 diff --git a/src/gallium/drivers/panfrost/midgard/midgard_compile.c b/src/gallium/drivers/panfrost/midgard/midgard_compile.c index 6e650db2b1c..fab50d671a8 100644 --- a/src/gallium/drivers/panfrost/midgard/midgard_compile.c +++ b/src/gallium/drivers/panfrost/midgard/midgard_compile.c @@ -1484,109 +1484,6 @@ emit_instr(compiler_context *ctx, struct nir_instr *instr) } } -/* Midgard IR only knows vector ALU types, but we sometimes need to actually - * use scalar ALU instructions, for functional or performance reasons. To do - * this, we just demote vector ALU payloads to scalar. 
*/ - -static int -component_from_mask(unsigned mask) -{ - for (int c = 0; c < 4; ++c) { - if (mask & (3 << (2 * c))) - return c; - } - - assert(0); - return 0; -} - -static bool -is_single_component_mask(unsigned mask) -{ - int components = 0; - - for (int c = 0; c < 4; ++c) - if (mask & (3 << (2 * c))) - components++; - - return components == 1; -} - -/* Create a mask of accessed components from a swizzle to figure out vector - * dependencies */ - -static unsigned -swizzle_to_access_mask(unsigned swizzle) -{ - unsigned component_mask = 0; - - for (int i = 0; i < 4; ++i) { - unsigned c = (swizzle >> (2 * i)) & 3; - component_mask |= (1 << c); - } - - return component_mask; -} - -static unsigned -vector_to_scalar_source(unsigned u, bool is_int) -{ - midgard_vector_alu_src v; - memcpy(&v, &u, sizeof(v)); - - /* TODO: Integers */ - - midgard_scalar_alu_src s = { - .full = !v.half, - .component = (v.swizzle & 3) << 1 - }; - - if (is_int) { - /* TODO */ - } else { - s.abs = v.mod & MIDGARD_FLOAT_MOD_ABS; - s.negate = v.mod & MIDGARD_FLOAT_MOD_NEG; - } - - unsigned o; - memcpy(&o, &s, sizeof(s)); - - return o & ((1 << 6) - 1); -} - -static midgard_scalar_alu -vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins) -{ - bool is_int = midgard_is_integer_op(v.op); - - /* The output component is from the mask */ - midgard_scalar_alu s = { - .op = v.op, - .src1 = vector_to_scalar_source(v.src1, is_int), - .src2 = vector_to_scalar_source(v.src2, is_int), - .unknown = 0, - .outmod = v.outmod, - .output_full = 1, /* TODO: Half */ - .output_component = component_from_mask(v.mask) << 1, - }; - - /* Inline constant is passed along rather than trying to extract it - * from v */ - - if (ins->ssa_args.inline_constant) { - uint16_t imm = 0; - int lower_11 = ins->inline_constant & ((1 << 12) - 1); - imm |= (lower_11 >> 9) & 3; - imm |= (lower_11 >> 6) & 4; - imm |= (lower_11 >> 2) & 0x38; - imm |= (lower_11 & 63) << 6; - - s.src2 = imm; - } - - return s; -} - /* Midgard prefetches instruction types, so during emission we need to * lookahead too. Unless this is the last instruction, in which we return 1. Or * if this is the second to last and the last is an ALU, then it's also 1... */ @@ -1594,599 +1491,6 @@ vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins) #define IS_ALU(tag) (tag == TAG_ALU_4 || tag == TAG_ALU_8 || \ tag == TAG_ALU_12 || tag == TAG_ALU_16) -#define EMIT_AND_COUNT(type, val) util_dynarray_append(emission, type, val); \ - bytes_emitted += sizeof(type) - -static void -emit_binary_vector_instruction(midgard_instruction *ains, - uint16_t *register_words, int *register_words_count, - uint64_t *body_words, size_t *body_size, int *body_words_count, - size_t *bytes_emitted) -{ - memcpy(®ister_words[(*register_words_count)++], &ains->registers, sizeof(ains->registers)); - *bytes_emitted += sizeof(midgard_reg_info); - - body_size[*body_words_count] = sizeof(midgard_vector_alu); - memcpy(&body_words[(*body_words_count)++], &ains->alu, sizeof(ains->alu)); - *bytes_emitted += sizeof(midgard_vector_alu); -} - -/* Checks for an SSA data hazard between two adjacent instructions, keeping in - * mind that we are a vector architecture and we can write to different - * components simultaneously */ - -static bool -can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second) -{ - /* Each instruction reads some registers and writes to a register. 
See - * where the first writes */ - - /* Figure out where exactly we wrote to */ - int source = first->ssa_args.dest; - int source_mask = first->type == TAG_ALU_4 ? squeeze_writemask(first->alu.mask) : 0xF; - - /* As long as the second doesn't read from the first, we're okay */ - if (second->ssa_args.src0 == source) { - if (first->type == TAG_ALU_4) { - /* Figure out which components we just read from */ - - int q = second->alu.src1; - midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; - - /* Check if there are components in common, and fail if so */ - if (swizzle_to_access_mask(m->swizzle) & source_mask) - return false; - } else - return false; - - } - - if (second->ssa_args.src1 == source) - return false; - - /* Otherwise, it's safe in that regard. Another data hazard is both - * writing to the same place, of course */ - - if (second->ssa_args.dest == source) { - /* ...but only if the components overlap */ - int dest_mask = second->type == TAG_ALU_4 ? squeeze_writemask(second->alu.mask) : 0xF; - - if (dest_mask & source_mask) - return false; - } - - /* ...That's it */ - return true; -} - -static bool -midgard_has_hazard( - midgard_instruction **segment, unsigned segment_size, - midgard_instruction *ains) -{ - for (int s = 0; s < segment_size; ++s) - if (!can_run_concurrent_ssa(segment[s], ains)) - return true; - - return false; - - -} - -/* Schedules, but does not emit, a single basic block. After scheduling, the - * final tag and size of the block are known, which are necessary for branching - * */ - -static midgard_bundle -schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip) -{ - int instructions_emitted = 0, instructions_consumed = -1; - midgard_bundle bundle = { 0 }; - - uint8_t tag = ins->type; - - /* Default to the instruction's tag */ - bundle.tag = tag; - - switch (ins->type) { - case TAG_ALU_4: { - uint32_t control = 0; - size_t bytes_emitted = sizeof(control); - - /* TODO: Constant combining */ - int index = 0, last_unit = 0; - - /* Previous instructions, for the purpose of parallelism */ - midgard_instruction *segment[4] = {0}; - int segment_size = 0; - - instructions_emitted = -1; - midgard_instruction *pins = ins; - - for (;;) { - midgard_instruction *ains = pins; - - /* Advance instruction pointer */ - if (index) { - ains = mir_next_op(pins); - pins = ains; - } - - /* Out-of-work condition */ - if ((struct list_head *) ains == &block->instructions) - break; - - /* Ensure that the chain can continue */ - if (ains->type != TAG_ALU_4) break; - - /* If there's already something in the bundle and we - * have weird scheduler constraints, break now */ - if (ains->precede_break && index) break; - - /* According to the presentation "The ARM - * Mali-T880 Mobile GPU" from HotChips 27, - * there are two pipeline stages. Branching - * position determined experimentally. Lines - * are executed in parallel: - * - * [ VMUL ] [ SADD ] - * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ] - * - * Verify that there are no ordering dependencies here. - * - * TODO: Allow for parallelism!!! 
- */ - - /* Pick a unit for it if it doesn't force a particular unit */ - - int unit = ains->unit; - - if (!unit) { - int op = ains->alu.op; - int units = alu_opcode_props[op].props; - - bool vectorable = units & UNITS_ANY_VECTOR; - bool scalarable = units & UNITS_SCALAR; - bool could_scalar = is_single_component_mask(ains->alu.mask); - bool vector = vectorable && !(could_scalar && scalarable); - - if (!vector) - assert(units & UNITS_SCALAR); - - if (vector) { - if (last_unit >= UNIT_VADD) { - if (units & UNIT_VLUT) - unit = UNIT_VLUT; - else - break; - } else { - if ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) - unit = UNIT_VMUL; - else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) - unit = UNIT_VADD; - else if (units & UNIT_VLUT) - unit = UNIT_VLUT; - else - break; - } - } else { - if (last_unit >= UNIT_VADD) { - if ((units & UNIT_SMUL) && !(control & UNIT_SMUL)) - unit = UNIT_SMUL; - else if (units & UNIT_VLUT) - unit = UNIT_VLUT; - else - break; - } else { - if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains)) - unit = UNIT_SADD; - else if (units & UNIT_SMUL) - unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL; - else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) - unit = UNIT_VADD; - else - break; - } - } - - assert(unit & units); - } - - /* Late unit check, this time for encoding (not parallelism) */ - if (unit <= last_unit) break; - - /* Clear the segment */ - if (last_unit < UNIT_VADD && unit >= UNIT_VADD) - segment_size = 0; - - if (midgard_has_hazard(segment, segment_size, ains)) - break; - - /* We're good to go -- emit the instruction */ - ains->unit = unit; - - segment[segment_size++] = ains; - - /* Only one set of embedded constants per - * bundle possible; if we have more, we must - * break the chain early, unfortunately */ - - if (ains->has_constants) { - if (bundle.has_embedded_constants) { - /* The blend constant needs to be - * alone, since it conflicts with - * everything by definition*/ - - if (ains->has_blend_constant || bundle.has_blend_constant) - break; - - /* ...but if there are already - * constants but these are the - * *same* constants, we let it - * through */ - - if (memcmp(bundle.constants, ains->constants, sizeof(bundle.constants))) - break; - } else { - bundle.has_embedded_constants = true; - memcpy(bundle.constants, ains->constants, sizeof(bundle.constants)); - - /* If this is a blend shader special constant, track it for patching */ - bundle.has_blend_constant |= ains->has_blend_constant; - } - } - - if (ains->unit & UNITS_ANY_VECTOR) { - emit_binary_vector_instruction(ains, bundle.register_words, - &bundle.register_words_count, bundle.body_words, - bundle.body_size, &bundle.body_words_count, &bytes_emitted); - } else if (ains->compact_branch) { - /* All of r0 has to be written out - * along with the branch writeout. - * (slow!) 
*/ - - if (ains->writeout) { - if (index == 0) { - midgard_instruction ins = v_fmov(0, blank_alu_src, SSA_FIXED_REGISTER(0)); - ins.unit = UNIT_VMUL; - - control |= ins.unit; - - emit_binary_vector_instruction(&ins, bundle.register_words, - &bundle.register_words_count, bundle.body_words, - bundle.body_size, &bundle.body_words_count, &bytes_emitted); - } else { - /* Analyse the group to see if r0 is written in full, on-time, without hanging dependencies*/ - bool written_late = false; - bool components[4] = { 0 }; - uint16_t register_dep_mask = 0; - uint16_t written_mask = 0; - - midgard_instruction *qins = ins; - for (int t = 0; t < index; ++t) { - if (qins->registers.out_reg != 0) { - /* Mark down writes */ - - written_mask |= (1 << qins->registers.out_reg); - } else { - /* Mark down the register dependencies for errata check */ - - if (qins->registers.src1_reg < 16) - register_dep_mask |= (1 << qins->registers.src1_reg); - - if (qins->registers.src2_reg < 16) - register_dep_mask |= (1 << qins->registers.src2_reg); - - int mask = qins->alu.mask; - - for (int c = 0; c < 4; ++c) - if (mask & (0x3 << (2 * c))) - components[c] = true; - - /* ..but if the writeout is too late, we have to break up anyway... for some reason */ - - if (qins->unit == UNIT_VLUT) - written_late = true; - } - - /* Advance instruction pointer */ - qins = mir_next_op(qins); - } - - - /* Register dependencies of r0 must be out of fragment writeout bundle */ - if (register_dep_mask & written_mask) - break; - - if (written_late) - break; - - /* If even a single component is not written, break it up (conservative check). */ - bool breakup = false; - - for (int c = 0; c < 4; ++c) - if (!components[c]) - breakup = true; - - if (breakup) - break; - - /* Otherwise, we're free to proceed */ - } - } - - if (ains->unit == ALU_ENAB_BRANCH) { - bundle.body_size[bundle.body_words_count] = sizeof(midgard_branch_extended); - memcpy(&bundle.body_words[bundle.body_words_count++], &ains->branch_extended, sizeof(midgard_branch_extended)); - bytes_emitted += sizeof(midgard_branch_extended); - } else { - bundle.body_size[bundle.body_words_count] = sizeof(ains->br_compact); - memcpy(&bundle.body_words[bundle.body_words_count++], &ains->br_compact, sizeof(ains->br_compact)); - bytes_emitted += sizeof(ains->br_compact); - } - } else { - memcpy(&bundle.register_words[bundle.register_words_count++], &ains->registers, sizeof(ains->registers)); - bytes_emitted += sizeof(midgard_reg_info); - - bundle.body_size[bundle.body_words_count] = sizeof(midgard_scalar_alu); - bundle.body_words_count++; - bytes_emitted += sizeof(midgard_scalar_alu); - } - - /* Defer marking until after writing to allow for break */ - control |= ains->unit; - last_unit = ains->unit; - ++instructions_emitted; - ++index; - } - - /* Bubble up the number of instructions for skipping */ - instructions_consumed = index - 1; - - int padding = 0; - - /* Pad ALU op to nearest word */ - - if (bytes_emitted & 15) { - padding = 16 - (bytes_emitted & 15); - bytes_emitted += padding; - } - - /* Constants must always be quadwords */ - if (bundle.has_embedded_constants) - bytes_emitted += 16; - - /* Size ALU instruction for tag */ - bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1; - bundle.padding = padding; - bundle.control = bundle.tag | control; - - break; - } - - case TAG_LOAD_STORE_4: { - /* Load store instructions have two words at once. If - * we only have one queued up, we need to NOP pad. 
- * Otherwise, we store both in succession to save space - * and cycles -- letting them go in parallel -- skip - * the next. The usefulness of this optimisation is - * greatly dependent on the quality of the instruction - * scheduler. - */ - - midgard_instruction *next_op = mir_next_op(ins); - - if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) { - /* As the two operate concurrently, make sure - * they are not dependent */ - - if (can_run_concurrent_ssa(ins, next_op) || true) { - /* Skip ahead, since it's redundant with the pair */ - instructions_consumed = 1 + (instructions_emitted++); - } - } - - break; - } - - default: - /* Texture ops default to single-op-per-bundle scheduling */ - break; - } - - /* Copy the instructions into the bundle */ - bundle.instruction_count = instructions_emitted + 1; - - int used_idx = 0; - - midgard_instruction *uins = ins; - for (int i = 0; used_idx < bundle.instruction_count; ++i) { - bundle.instructions[used_idx++] = *uins; - uins = mir_next_op(uins); - } - - *skip = (instructions_consumed == -1) ? instructions_emitted : instructions_consumed; - - return bundle; -} - -static int -quadword_size(int tag) -{ - switch (tag) { - case TAG_ALU_4: - return 1; - - case TAG_ALU_8: - return 2; - - case TAG_ALU_12: - return 3; - - case TAG_ALU_16: - return 4; - - case TAG_LOAD_STORE_4: - return 1; - - case TAG_TEXTURE_4: - return 1; - - default: - assert(0); - return 0; - } -} - -/* Schedule a single block by iterating its instruction to create bundles. - * While we go, tally about the bundle sizes to compute the block size. */ - -static void -schedule_block(compiler_context *ctx, midgard_block *block) -{ - util_dynarray_init(&block->bundles, NULL); - - block->quadword_count = 0; - - mir_foreach_instr_in_block(block, ins) { - int skip; - midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip); - util_dynarray_append(&block->bundles, midgard_bundle, bundle); - - if (bundle.has_blend_constant) { - /* TODO: Multiblock? 
*/ - int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1; - ctx->blend_constant_offset = quadwords_within_block * 0x10; - } - - while(skip--) - ins = mir_next_op(ins); - - block->quadword_count += quadword_size(bundle.tag); - } - - block->is_scheduled = true; -} - -static void -schedule_program(compiler_context *ctx) -{ - /* We run RA prior to scheduling */ - struct ra_graph *g = allocate_registers(ctx); - install_registers(ctx, g); - - mir_foreach_block(ctx, block) { - schedule_block(ctx, block); - } -} - -/* After everything is scheduled, emit whole bundles at a time */ - -static void -emit_binary_bundle(compiler_context *ctx, midgard_bundle *bundle, struct util_dynarray *emission, int next_tag) -{ - int lookahead = next_tag << 4; - - switch (bundle->tag) { - case TAG_ALU_4: - case TAG_ALU_8: - case TAG_ALU_12: - case TAG_ALU_16: { - /* Actually emit each component */ - util_dynarray_append(emission, uint32_t, bundle->control | lookahead); - - for (int i = 0; i < bundle->register_words_count; ++i) - util_dynarray_append(emission, uint16_t, bundle->register_words[i]); - - /* Emit body words based on the instructions bundled */ - for (int i = 0; i < bundle->instruction_count; ++i) { - midgard_instruction *ins = &bundle->instructions[i]; - - if (ins->unit & UNITS_ANY_VECTOR) { - memcpy(util_dynarray_grow(emission, sizeof(midgard_vector_alu)), &ins->alu, sizeof(midgard_vector_alu)); - } else if (ins->compact_branch) { - /* Dummy move, XXX DRY */ - if ((i == 0) && ins->writeout) { - midgard_instruction ins = v_fmov(0, blank_alu_src, SSA_FIXED_REGISTER(0)); - memcpy(util_dynarray_grow(emission, sizeof(midgard_vector_alu)), &ins.alu, sizeof(midgard_vector_alu)); - } - - if (ins->unit == ALU_ENAB_BR_COMPACT) { - memcpy(util_dynarray_grow(emission, sizeof(ins->br_compact)), &ins->br_compact, sizeof(ins->br_compact)); - } else { - memcpy(util_dynarray_grow(emission, sizeof(ins->branch_extended)), &ins->branch_extended, sizeof(ins->branch_extended)); - } - } else { - /* Scalar */ - midgard_scalar_alu scalarised = vector_to_scalar_alu(ins->alu, ins); - memcpy(util_dynarray_grow(emission, sizeof(scalarised)), &scalarised, sizeof(scalarised)); - } - } - - /* Emit padding (all zero) */ - memset(util_dynarray_grow(emission, bundle->padding), 0, bundle->padding); - - /* Tack on constants */ - - if (bundle->has_embedded_constants) { - util_dynarray_append(emission, float, bundle->constants[0]); - util_dynarray_append(emission, float, bundle->constants[1]); - util_dynarray_append(emission, float, bundle->constants[2]); - util_dynarray_append(emission, float, bundle->constants[3]); - } - - break; - } - - case TAG_LOAD_STORE_4: { - /* One or two composing instructions */ - - uint64_t current64, next64 = LDST_NOP; - - memcpy(¤t64, &bundle->instructions[0].load_store, sizeof(current64)); - - if (bundle->instruction_count == 2) - memcpy(&next64, &bundle->instructions[1].load_store, sizeof(next64)); - - midgard_load_store instruction = { - .type = bundle->tag, - .next_type = next_tag, - .word1 = current64, - .word2 = next64 - }; - - util_dynarray_append(emission, midgard_load_store, instruction); - - break; - } - - case TAG_TEXTURE_4: { - /* Texture instructions are easy, since there is no - * pipelining nor VLIW to worry about. 
We may need to set the .last flag */ - - midgard_instruction *ins = &bundle->instructions[0]; - - ins->texture.type = TAG_TEXTURE_4; - ins->texture.next_type = next_tag; - - ctx->texture_op_count--; - - if (!ctx->texture_op_count) { - ins->texture.cont = 0; - ins->texture.last = 1; - } - - util_dynarray_append(emission, midgard_texture_word, ins->texture); - break; - } - - default: - DBG("Unknown midgard instruction type\n"); - assert(0); - break; - } -} - /* ALU instructions can inline or embed constants, which decreases register * pressure and saves space. */ @@ -3102,7 +2406,7 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl mir_foreach_block(ctx, block) { util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) { for (int c = 0; c < bundle->instruction_count; ++c) { - midgard_instruction *ins = &bundle->instructions[c]; + midgard_instruction *ins = bundle->instructions[c]; if (!midgard_is_branch_unit(ins->unit)) continue; @@ -3117,10 +2421,13 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl /* Determine the block we're jumping to */ int target_number = ins->branch.target_block; - /* Report the destination tag. Discards don't need this */ + /* Report the destination tag */ int dest_tag = is_discard ? 0 : midgard_get_first_tag_from_block(ctx, target_number); - /* Count up the number of quadwords we're jumping over. That is, the number of quadwords in each of the blocks between (br_block_idx, target_number) */ + /* Count up the number of quadwords we're + * jumping over = number of quadwords until + * (br_block_idx, target_number) */ + int quadword_offset = 0; if (is_discard) { diff --git a/src/gallium/drivers/panfrost/midgard/midgard_emit.c b/src/gallium/drivers/panfrost/midgard/midgard_emit.c new file mode 100644 index 00000000000..ffa08735ff0 --- /dev/null +++ b/src/gallium/drivers/panfrost/midgard/midgard_emit.c @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler.h" +#include "midgard_ops.h" + +/* Midgard IR only knows vector ALU types, but we sometimes need to actually + * use scalar ALU instructions, for functional or performance reasons. To do + * this, we just demote vector ALU payloads to scalar. 
*/ + +static int +component_from_mask(unsigned mask) +{ + for (int c = 0; c < 4; ++c) { + if (mask & (3 << (2 * c))) + return c; + } + + assert(0); + return 0; +} + +static unsigned +vector_to_scalar_source(unsigned u, bool is_int) +{ + midgard_vector_alu_src v; + memcpy(&v, &u, sizeof(v)); + + /* TODO: Integers */ + + midgard_scalar_alu_src s = { + .full = !v.half, + .component = (v.swizzle & 3) << 1 + }; + + if (is_int) { + /* TODO */ + } else { + s.abs = v.mod & MIDGARD_FLOAT_MOD_ABS; + s.negate = v.mod & MIDGARD_FLOAT_MOD_NEG; + } + + unsigned o; + memcpy(&o, &s, sizeof(s)); + + return o & ((1 << 6) - 1); +} + +static midgard_scalar_alu +vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins) +{ + bool is_int = midgard_is_integer_op(v.op); + + /* The output component is from the mask */ + midgard_scalar_alu s = { + .op = v.op, + .src1 = vector_to_scalar_source(v.src1, is_int), + .src2 = vector_to_scalar_source(v.src2, is_int), + .unknown = 0, + .outmod = v.outmod, + .output_full = 1, /* TODO: Half */ + .output_component = component_from_mask(v.mask) << 1, + }; + + /* Inline constant is passed along rather than trying to extract it + * from v */ + + if (ins->ssa_args.inline_constant) { + uint16_t imm = 0; + int lower_11 = ins->inline_constant & ((1 << 12) - 1); + imm |= (lower_11 >> 9) & 3; + imm |= (lower_11 >> 6) & 4; + imm |= (lower_11 >> 2) & 0x38; + imm |= (lower_11 & 63) << 6; + + s.src2 = imm; + } + + return s; +} + +static void +emit_alu_bundle(compiler_context *ctx, + midgard_bundle *bundle, + struct util_dynarray *emission, + unsigned lookahead) +{ + /* Emit the control word */ + util_dynarray_append(emission, uint32_t, bundle->control | lookahead); + + /* Next up, emit register words */ + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + midgard_instruction *ins = bundle->instructions[i]; + + /* Check if this instruction has registers */ + if (ins->compact_branch || ins->prepacked_branch) continue; + + /* Otherwise, just emit the registers */ + uint16_t reg_word = 0; + memcpy(®_word, &ins->registers, sizeof(uint16_t)); + util_dynarray_append(emission, uint16_t, reg_word); + } + + /* Now, we emit the body itself */ + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + midgard_instruction *ins = bundle->instructions[i]; + + /* Where is this body */ + unsigned size = 0; + void *source = NULL; + + /* In case we demote to a scalar */ + midgard_scalar_alu scalarized; + + if (ins->unit & UNITS_ANY_VECTOR) { + size = sizeof(midgard_vector_alu); + source = &ins->alu; + } else if (ins->unit == ALU_ENAB_BR_COMPACT) { + size = sizeof(midgard_branch_cond); + source = &ins->br_compact; + } else if (ins->compact_branch) { /* misnomer */ + size = sizeof(midgard_branch_extended); + source = &ins->branch_extended; + } else { + size = sizeof(midgard_scalar_alu); + scalarized = vector_to_scalar_alu(ins->alu, ins); + source = &scalarized; + } + + memcpy(util_dynarray_grow(emission, size), source, size); + } + + /* Emit padding (all zero) */ + memset(util_dynarray_grow(emission, bundle->padding), 0, bundle->padding); + + /* Tack on constants */ + + if (bundle->has_embedded_constants) { + util_dynarray_append(emission, float, bundle->constants[0]); + util_dynarray_append(emission, float, bundle->constants[1]); + util_dynarray_append(emission, float, bundle->constants[2]); + util_dynarray_append(emission, float, bundle->constants[3]); + } +} + +/* After everything is scheduled, emit whole bundles at a time */ + +void +emit_binary_bundle(compiler_context *ctx, + 
midgard_bundle *bundle, + struct util_dynarray *emission, + int next_tag) +{ + int lookahead = next_tag << 4; + + switch (bundle->tag) { + case TAG_ALU_4: + case TAG_ALU_8: + case TAG_ALU_12: + case TAG_ALU_16: + emit_alu_bundle(ctx, bundle, emission, lookahead); + break; + + case TAG_LOAD_STORE_4: { + /* One or two composing instructions */ + + uint64_t current64, next64 = LDST_NOP; + + memcpy(¤t64, &bundle->instructions[0]->load_store, sizeof(current64)); + + if (bundle->instruction_count == 2) + memcpy(&next64, &bundle->instructions[1]->load_store, sizeof(next64)); + + midgard_load_store instruction = { + .type = bundle->tag, + .next_type = next_tag, + .word1 = current64, + .word2 = next64 + }; + + util_dynarray_append(emission, midgard_load_store, instruction); + + break; + } + + case TAG_TEXTURE_4: { + /* Texture instructions are easy, since there is no pipelining + * nor VLIW to worry about. We may need to set .last flag */ + + midgard_instruction *ins = bundle->instructions[0]; + + ins->texture.type = TAG_TEXTURE_4; + ins->texture.next_type = next_tag; + + ctx->texture_op_count--; + + if (!ctx->texture_op_count) { + ins->texture.cont = 0; + ins->texture.last = 1; + } + + util_dynarray_append(emission, midgard_texture_word, ins->texture); + break; + } + + default: + unreachable("Unknown midgard instruction type\n"); + } +} diff --git a/src/gallium/drivers/panfrost/midgard/midgard_schedule.c b/src/gallium/drivers/panfrost/midgard/midgard_schedule.c new file mode 100644 index 00000000000..385b8bcdbc0 --- /dev/null +++ b/src/gallium/drivers/panfrost/midgard/midgard_schedule.c @@ -0,0 +1,479 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler.h" +#include "midgard_ops.h" +#include "util/u_memory.h" + +/* Create a mask of accessed components from a swizzle to figure out vector + * dependencies */ + +static unsigned +swizzle_to_access_mask(unsigned swizzle) +{ + unsigned component_mask = 0; + + for (int i = 0; i < 4; ++i) { + unsigned c = (swizzle >> (2 * i)) & 3; + component_mask |= (1 << c); + } + + return component_mask; +} + +/* Does the mask cover more than a scalar? 
*/ + +static bool +is_single_component_mask(unsigned mask) +{ + int components = 0; + + for (int c = 0; c < 4; ++c) + if (mask & (3 << (2 * c))) + components++; + + return components == 1; +} + +/* Checks for an SSA data hazard between two adjacent instructions, keeping in + * mind that we are a vector architecture and we can write to different + * components simultaneously */ + +static bool +can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second) +{ + /* Each instruction reads some registers and writes to a register. See + * where the first writes */ + + /* Figure out where exactly we wrote to */ + int source = first->ssa_args.dest; + int source_mask = first->type == TAG_ALU_4 ? squeeze_writemask(first->alu.mask) : 0xF; + + /* As long as the second doesn't read from the first, we're okay */ + if (second->ssa_args.src0 == source) { + if (first->type == TAG_ALU_4) { + /* Figure out which components we just read from */ + + int q = second->alu.src1; + midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; + + /* Check if there are components in common, and fail if so */ + if (swizzle_to_access_mask(m->swizzle) & source_mask) + return false; + } else + return false; + + } + + if (second->ssa_args.src1 == source) + return false; + + /* Otherwise, it's safe in that regard. Another data hazard is both + * writing to the same place, of course */ + + if (second->ssa_args.dest == source) { + /* ...but only if the components overlap */ + int dest_mask = second->type == TAG_ALU_4 ? squeeze_writemask(second->alu.mask) : 0xF; + + if (dest_mask & source_mask) + return false; + } + + /* ...That's it */ + return true; +} + +static bool +midgard_has_hazard( + midgard_instruction **segment, unsigned segment_size, + midgard_instruction *ains) +{ + for (int s = 0; s < segment_size; ++s) + if (!can_run_concurrent_ssa(segment[s], ains)) + return true; + + return false; + + +} + +/* Schedules, but does not emit, a single basic block. After scheduling, the + * final tag and size of the block are known, which are necessary for branching + * */ + +static midgard_bundle +schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip) +{ + int instructions_emitted = 0, packed_idx = 0; + midgard_bundle bundle = { 0 }; + + uint8_t tag = ins->type; + + /* Default to the instruction's tag */ + bundle.tag = tag; + + switch (ins->type) { + case TAG_ALU_4: { + uint32_t control = 0; + size_t bytes_emitted = sizeof(control); + + /* TODO: Constant combining */ + int index = 0, last_unit = 0; + + /* Previous instructions, for the purpose of parallelism */ + midgard_instruction *segment[4] = {0}; + int segment_size = 0; + + instructions_emitted = -1; + midgard_instruction *pins = ins; + + for (;;) { + midgard_instruction *ains = pins; + + /* Advance instruction pointer */ + if (index) { + ains = mir_next_op(pins); + pins = ains; + } + + /* Out-of-work condition */ + if ((struct list_head *) ains == &block->instructions) + break; + + /* Ensure that the chain can continue */ + if (ains->type != TAG_ALU_4) break; + + /* If there's already something in the bundle and we + * have weird scheduler constraints, break now */ + if (ains->precede_break && index) break; + + /* According to the presentation "The ARM + * Mali-T880 Mobile GPU" from HotChips 27, + * there are two pipeline stages. Branching + * position determined experimentally. 
Lines + * are executed in parallel: + * + * [ VMUL ] [ SADD ] + * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ] + * + * Verify that there are no ordering dependencies here. + * + * TODO: Allow for parallelism!!! + */ + + /* Pick a unit for it if it doesn't force a particular unit */ + + int unit = ains->unit; + + if (!unit) { + int op = ains->alu.op; + int units = alu_opcode_props[op].props; + + bool vectorable = units & UNITS_ANY_VECTOR; + bool scalarable = units & UNITS_SCALAR; + bool could_scalar = is_single_component_mask(ains->alu.mask); + bool vector = vectorable && !(could_scalar && scalarable); + + if (!vector) + assert(units & UNITS_SCALAR); + + if (vector) { + if (last_unit >= UNIT_VADD) { + if (units & UNIT_VLUT) + unit = UNIT_VLUT; + else + break; + } else { + if ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) + unit = UNIT_VMUL; + else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) + unit = UNIT_VADD; + else if (units & UNIT_VLUT) + unit = UNIT_VLUT; + else + break; + } + } else { + if (last_unit >= UNIT_VADD) { + if ((units & UNIT_SMUL) && !(control & UNIT_SMUL)) + unit = UNIT_SMUL; + else if (units & UNIT_VLUT) + unit = UNIT_VLUT; + else + break; + } else { + if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains)) + unit = UNIT_SADD; + else if (units & UNIT_SMUL) + unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL; + else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) + unit = UNIT_VADD; + else + break; + } + } + + assert(unit & units); + } + + /* Late unit check, this time for encoding (not parallelism) */ + if (unit <= last_unit) break; + + /* Clear the segment */ + if (last_unit < UNIT_VADD && unit >= UNIT_VADD) + segment_size = 0; + + if (midgard_has_hazard(segment, segment_size, ains)) + break; + + /* We're good to go -- emit the instruction */ + ains->unit = unit; + + segment[segment_size++] = ains; + + /* Only one set of embedded constants per + * bundle possible; if we have more, we must + * break the chain early, unfortunately */ + + if (ains->has_constants) { + if (bundle.has_embedded_constants) { + /* The blend constant needs to be + * alone, since it conflicts with + * everything by definition */ + + if (ains->has_blend_constant || bundle.has_blend_constant) + break; + + /* ...but if there are already + * constants but these are the + * *same* constants, we let it + * through */ + + if (memcmp(bundle.constants, ains->constants, sizeof(bundle.constants))) + break; + } else { + bundle.has_embedded_constants = true; + memcpy(bundle.constants, ains->constants, sizeof(bundle.constants)); + + /* If this is a blend shader special constant, track it for patching */ + bundle.has_blend_constant |= ains->has_blend_constant; + } + } + + if (ains->unit & UNITS_ANY_VECTOR) { + bytes_emitted += sizeof(midgard_reg_info); + bytes_emitted += sizeof(midgard_vector_alu); + } else if (ains->compact_branch) { + /* All of r0 has to be written out along with + * the branch writeout */ + + if (ains->writeout) { + if (index == 0) { + /* Inject a move */ + midgard_instruction ins = v_fmov(0, blank_alu_src, SSA_FIXED_REGISTER(0)); + ins.unit = UNIT_VMUL; + control |= ins.unit; + + /* TODO don't leak */ + midgard_instruction *move = + mem_dup(&ins, sizeof(midgard_instruction)); + bytes_emitted += sizeof(midgard_reg_info); + bytes_emitted += sizeof(midgard_vector_alu); + bundle.instructions[packed_idx++] = move; + } else { + /* Analyse the group to see if r0 is written in full, on-time, without hanging dependencies */ 
+ bool written_late = false; + bool components[4] = { 0 }; + uint16_t register_dep_mask = 0; + uint16_t written_mask = 0; + + midgard_instruction *qins = ins; + for (int t = 0; t < index; ++t) { + if (qins->registers.out_reg != 0) { + /* Mark down writes */ + + written_mask |= (1 << qins->registers.out_reg); + } else { + /* Mark down the register dependencies for errata check */ + + if (qins->registers.src1_reg < 16) + register_dep_mask |= (1 << qins->registers.src1_reg); + + if (qins->registers.src2_reg < 16) + register_dep_mask |= (1 << qins->registers.src2_reg); + + int mask = qins->alu.mask; + + for (int c = 0; c < 4; ++c) + if (mask & (0x3 << (2 * c))) + components[c] = true; + + /* ..but if the writeout is too late, we have to break up anyway... for some reason */ + + if (qins->unit == UNIT_VLUT) + written_late = true; + } + + /* Advance instruction pointer */ + qins = mir_next_op(qins); + } + + /* Register dependencies of r0 must be out of fragment writeout bundle */ + if (register_dep_mask & written_mask) + break; + + if (written_late) + break; + + /* If even a single component is not written, break it up (conservative check). */ + bool breakup = false; + + for (int c = 0; c < 4; ++c) + if (!components[c]) + breakup = true; + + if (breakup) + break; + + /* Otherwise, we're free to proceed */ + } + } + + if (ains->unit == ALU_ENAB_BRANCH) { + bytes_emitted += sizeof(midgard_branch_extended); + } else { + bytes_emitted += sizeof(ains->br_compact); + } + } else { + bytes_emitted += sizeof(midgard_reg_info); + bytes_emitted += sizeof(midgard_scalar_alu); + } + + /* Defer marking until after writing to allow for break */ + control |= ains->unit; + last_unit = ains->unit; + ++instructions_emitted; + ++index; + } + + int padding = 0; + + /* Pad ALU op to nearest word */ + + if (bytes_emitted & 15) { + padding = 16 - (bytes_emitted & 15); + bytes_emitted += padding; + } + + /* Constants must always be quadwords */ + if (bundle.has_embedded_constants) + bytes_emitted += 16; + + /* Size ALU instruction for tag */ + bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1; + bundle.padding = padding; + bundle.control = bundle.tag | control; + + break; + } + + case TAG_LOAD_STORE_4: { + /* Load store instructions have two words at once. If + * we only have one queued up, we need to NOP pad. + * Otherwise, we store both in succession to save space + * and cycles -- letting them go in parallel -- skip + * the next. The usefulness of this optimisation is + * greatly dependent on the quality of the instruction + * scheduler. + */ + + midgard_instruction *next_op = mir_next_op(ins); + + if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) { + /* TODO: Concurrency check */ + instructions_emitted++; + } + + break; + } + + default: + /* Texture ops default to single-op-per-bundle scheduling */ + break; + } + + /* Copy the instructions into the bundle */ + bundle.instruction_count = instructions_emitted + 1 + packed_idx; + + midgard_instruction *uins = ins; + for (; packed_idx < bundle.instruction_count; ++packed_idx) { + bundle.instructions[packed_idx] = uins; + uins = mir_next_op(uins); + } + + *skip = instructions_emitted; + + return bundle; +} + +/* Schedule a single block by iterating its instruction to create bundles. + * While we go, tally about the bundle sizes to compute the block size. 
*/ + +static void +schedule_block(compiler_context *ctx, midgard_block *block) +{ + util_dynarray_init(&block->bundles, NULL); + + block->quadword_count = 0; + + mir_foreach_instr_in_block(block, ins) { + int skip; + midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip); + util_dynarray_append(&block->bundles, midgard_bundle, bundle); + + if (bundle.has_blend_constant) { + /* TODO: Multiblock? */ + int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1; + ctx->blend_constant_offset = quadwords_within_block * 0x10; + } + + while(skip--) + ins = mir_next_op(ins); + + block->quadword_count += quadword_size(bundle.tag); + } + + block->is_scheduled = true; +} + +void +schedule_program(compiler_context *ctx) +{ + /* We run RA prior to scheduling */ + struct ra_graph *g = allocate_registers(ctx); + install_registers(ctx, g); + + mir_foreach_block(ctx, block) { + schedule_block(ctx, block); + } +} |
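As a worked illustration of the inline-constant path in vector_to_scalar_alu() (moved into midgard_emit.c above), the following standalone sketch repacks the low bits of a vector inline constant into the scalar src2 field using the same bit shuffle as the patch; the sample value is arbitrary.

#include <stdint.h>
#include <stdio.h>

/* Mirrors the shuffle in vector_to_scalar_alu(): the low 11 bits of the
 * vector inline constant are rearranged into the scalar src2 encoding. */
static uint16_t
repack_inline_constant(int inline_constant)
{
        uint16_t imm = 0;
        int lower_11 = inline_constant & ((1 << 12) - 1);

        imm |= (lower_11 >> 9) & 3;
        imm |= (lower_11 >> 6) & 4;
        imm |= (lower_11 >> 2) & 0x38;
        imm |= (lower_11 & 63) << 6;

        return imm;
}

int
main(void)
{
        /* Arbitrary sample constant, just to show the transformation */
        int inline_constant = 0x5A3;

        printf("vector encoding 0x%03x -> scalar src2 0x%03x\n",
               inline_constant, repack_inline_constant(inline_constant));
        return 0;
}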