Diffstat (limited to 'src')
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3.c              |  60
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3.h              |  92
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 428
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_cp.c           |  14
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_depth.c        |  14
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_group.c        |   7
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_legalize.c     | 184
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_print.c        |  38
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_ra.c           | 226
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_sched.c        |  88
10 files changed, 1025 insertions, 126 deletions
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c index ba5851c6c82..a166b67d7cf 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.c +++ b/src/gallium/drivers/freedreno/ir3/ir3.c @@ -80,6 +80,8 @@ struct ir3 * ir3_create(struct ir3_compiler *compiler, shader->noutputs = nout; shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout); + list_inithead(&shader->block_list); + return shader; } @@ -548,7 +550,6 @@ static int (*emit[])(struct ir3_instruction *instr, void *ptr, void * ir3_assemble(struct ir3 *shader, struct ir3_info *info, uint32_t gpu_id) { - struct ir3_block *block = shader->block; uint32_t *ptr, *dwords; info->gpu_id = gpu_id; @@ -558,8 +559,10 @@ void * ir3_assemble(struct ir3 *shader, struct ir3_info *info, info->instrs_count = 0; info->sizedwords = 0; - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - info->sizedwords += 2; + list_for_each_entry (struct ir3_block, block, &shader->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + info->sizedwords += 2; + } } /* need a integer number of instruction "groups" (sets of 16 @@ -574,12 +577,14 @@ void * ir3_assemble(struct ir3 *shader, struct ir3_info *info, ptr = dwords = calloc(4, info->sizedwords); - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - int ret = emit[instr->category](instr, dwords, info); - if (ret) - goto fail; - info->instrs_count += 1 + instr->repeat; - dwords += 2; + list_for_each_entry (struct ir3_block, block, &shader->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + int ret = emit[instr->category](instr, dwords, info); + if (ret) + goto fail; + info->instrs_count += 1 + instr->repeat; + dwords += 2; + } } return ptr; @@ -617,7 +622,12 @@ static void insert_instr(struct ir3_block *block, struct ir3_block * ir3_block_create(struct ir3 *shader) { struct ir3_block *block = ir3_alloc(shader, sizeof(*block)); +#ifdef DEBUG + static uint32_t serialno = 0; + block->serialno = ++serialno; +#endif block->shader = shader; + list_inithead(&block->node); list_inithead(&block->instr_list); return block; } @@ -688,10 +698,40 @@ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr) struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, int num, int flags) { - struct ir3_register *reg = reg_create(instr->block->shader, num, flags); + struct ir3 *shader = instr->block->shader; + struct ir3_register *reg = reg_create(shader, num, flags); #ifdef DEBUG debug_assert(instr->regs_count < instr->regs_max); #endif instr->regs[instr->regs_count++] = reg; return reg; } + +void +ir3_block_clear_mark(struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) + instr->flags &= ~IR3_INSTR_MARK; +} + +void +ir3_clear_mark(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + ir3_block_clear_mark(block); + } +} + +/* note: this will destroy instr->depth, don't do it until after sched! 
*/ +void +ir3_count_instructions(struct ir3 *ir) +{ + unsigned ip = 0; + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + instr->ip = ip++; + } + block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip; + block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip; + } +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h index 95b866988b8..9c35a763d58 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.h +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -83,7 +83,8 @@ struct ir3_register { * before register assignment is done: */ IR3_REG_SSA = 0x2000, /* 'instr' is ptr to assigning instr */ - IR3_REG_IA = 0x4000, /* meta-input dst is "assigned" */ + IR3_REG_PHI_SRC= 0x4000, /* phi src, regs[0]->instr points to phi */ + } flags; union { /* normal registers: @@ -187,6 +188,7 @@ struct ir3_instruction { char inv; char comp; int immed; + struct ir3_block *target; } cat0; struct { type_t src_type, dst_type; @@ -220,14 +222,14 @@ struct ir3_instruction { int aid; } fi; struct { - struct ir3_block *if_block, *else_block; - } flow; + /* used to temporarily hold reference to nir_phi_instr + * until we resolve the phi srcs + */ + void *nphi; + } phi; struct { struct ir3_block *block; } inout; - - /* XXX keep this as big as all other union members! */ - uint32_t info[3]; }; /* transient values used during various algorithms: */ @@ -363,16 +365,40 @@ struct ir3 { unsigned predicates_count, predicates_sz; struct ir3_instruction **predicates; - struct ir3_block *block; + /* List of blocks: */ + struct list_head block_list; + unsigned heap_idx; struct ir3_heap_chunk *chunk; }; +typedef struct nir_block nir_block; + struct ir3_block { + struct list_head node; struct ir3 *shader; - /* only a single address register: */ - struct ir3_instruction *address; - struct list_head instr_list; + + nir_block *nblock; + + struct list_head instr_list; /* list of ir3_instruction */ + + /* each block has either one or two successors.. in case of + * two successors, 'condition' decides which one to follow. + * A block preceding an if/else has two successors. + */ + struct ir3_instruction *condition; + struct ir3_block *successors[2]; + + uint16_t start_ip, end_ip; + + /* used for per-pass extra block data. Mainly used right + * now in RA step to track livein/liveout. + */ + void *bd; + +#ifdef DEBUG + uint32_t serialno; +#endif }; struct ir3 * ir3_create(struct ir3_compiler *compiler, @@ -394,7 +420,6 @@ const char *ir3_instr_name(struct ir3_instruction *instr); struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, int num, int flags); - static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) { if (instr->flags & IR3_INSTR_MARK) @@ -403,19 +428,10 @@ static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) return false; } -static inline void ir3_clear_mark(struct ir3 *shader) -{ - /* TODO would be nice to drop the instruction array.. for - * new compiler, _clear_mark() is all we use it for, and - * we could probably manage a linked list instead.. - * - * Also, we'll probably want to mark instructions within - * a block, so tracking the list of instrs globally is - * unlikely to be what we want. 
- */ - list_for_each_entry (struct ir3_instruction, instr, &shader->block->instr_list, node) - instr->flags &= ~IR3_INSTR_MARK; -} +void ir3_block_clear_mark(struct ir3_block *block); +void ir3_clear_mark(struct ir3 *shader); + +void ir3_count_instructions(struct ir3 *ir); static inline int ir3_instr_regno(struct ir3_instruction *instr, struct ir3_register *reg) @@ -593,6 +609,22 @@ static inline bool reg_gpr(struct ir3_register *r) return true; } +static inline type_t half_type(type_t type) +{ + switch (type) { + case TYPE_F32: return TYPE_F16; + case TYPE_U32: return TYPE_U16; + case TYPE_S32: return TYPE_S16; + case TYPE_F16: + case TYPE_U16: + case TYPE_S16: + return type; + default: + assert(0); + return ~0; + } +} + /* some cat2 instructions (ie. those which are not float) can embed an * immediate: */ @@ -837,6 +869,15 @@ ir3_NOP(struct ir3_block *block) return ir3_instr_create(block, 0, OPC_NOP); } +#define INSTR0(CAT, name) \ +static inline struct ir3_instruction * \ +ir3_##name(struct ir3_block *block) \ +{ \ + struct ir3_instruction *instr = \ + ir3_instr_create(block, CAT, OPC_##name); \ + return instr; \ +} + #define INSTR1(CAT, name) \ static inline struct ir3_instruction * \ ir3_##name(struct ir3_block *block, \ @@ -880,7 +921,10 @@ ir3_##name(struct ir3_block *block, \ } /* cat0 instructions: */ +INSTR0(0, BR); +INSTR0(0, JUMP); INSTR1(0, KILL); +INSTR0(0, END); /* cat2 instructions, most 2 src but some 1 src: */ INSTR2(2, ADD_F) diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index f62a5ec2b26..4165e2d6aa7 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -48,8 +48,6 @@ #include "ir3.h" -static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val); - struct ir3_compile { struct ir3_compiler *compiler; @@ -62,7 +60,10 @@ struct ir3_compile { /* bitmask of which samplers are integer: */ uint16_t integer_s; - struct ir3_block *block; + struct ir3_block *block; /* the current block */ + struct ir3_block *in_block; /* block created for shader inputs */ + + nir_function_impl *impl; /* For fragment shaders, from the hw perspective the only * actual input is r0.xy position register passed to bary.f. 
@@ -94,6 +95,11 @@ struct ir3_compile { */ struct hash_table *addr_ht; + /* maps nir_block to ir3_block, mostly for the purposes of + * figuring out the blocks successors + */ + struct hash_table *block_ht; + /* for calculating input/output positions/linkages: */ unsigned next_inloc; @@ -120,6 +126,9 @@ struct ir3_compile { }; +static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val); +static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock); + static struct nir_shader *to_nir(const struct tgsi_token *tokens) { struct nir_shader_compiler_options options = { @@ -148,6 +157,7 @@ static struct nir_shader *to_nir(const struct tgsi_token *tokens) nir_lower_vars_to_ssa(s); nir_lower_alu_to_scalar(s); + nir_lower_phis_to_scalar(s); progress |= nir_copy_prop(s); progress |= nir_opt_dce(s); @@ -244,6 +254,8 @@ compile_init(struct ir3_compiler *compiler, _mesa_hash_pointer, _mesa_key_pointer_equal); ctx->addr_ht = _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); + ctx->block_ht = _mesa_hash_table_create(ctx, + _mesa_hash_pointer, _mesa_key_pointer_equal); lowered_tokens = lower_tgsi(ctx, tokens, so); if (!lowered_tokens) @@ -287,33 +299,206 @@ compile_free(struct ir3_compile *ctx) ralloc_free(ctx); } - +/* global per-array information: */ struct ir3_array { unsigned length, aid; +}; + +/* per-block array state: */ +struct ir3_array_value { + /* TODO drop length/aid, and just have ptr back to ir3_array */ + unsigned length, aid; + /* initial array element values are phi's, other than for the + * entry block. The phi src's get added later in a resolve step + * after we have visited all the blocks, to account for back + * edges in the cfg. + */ + struct ir3_instruction **phis; + /* current array element values (as block is processed). When + * the array phi's are resolved, it will contain the array state + * at exit of block, so successor blocks can use it to add their + * phi srcs. + */ struct ir3_instruction *arr[]; }; +/* track array assignments per basic block. When an array is read + * outside of the same basic block, we can use NIR's dominance-frontier + * information to figure out where phi nodes are needed. + */ +struct ir3_nir_block_data { + unsigned foo; + /* indexed by array-id (aid): */ + struct ir3_array_value *arrs[]; +}; + +static struct ir3_nir_block_data * +get_block_data(struct ir3_compile *ctx, struct ir3_block *block) +{ + if (!block->bd) { + struct ir3_nir_block_data *bd = ralloc_size(ctx, sizeof(*bd) + + ((ctx->num_arrays + 1) * sizeof(bd->arrs[0]))); + block->bd = bd; + } + return block->bd; +} + static void declare_var(struct ir3_compile *ctx, nir_variable *var) { unsigned length = glsl_get_length(var->type) * 4; /* always vec4, at least with ttn */ - struct ir3_array *arr = ralloc_size(ctx, sizeof(*arr) + - (length * sizeof(arr->arr[0]))); + struct ir3_array *arr = ralloc(ctx, struct ir3_array); arr->length = length; arr->aid = ++ctx->num_arrays; - /* Some shaders end up reading array elements without first writing.. 
- * so initialize things to prevent null instr ptrs later: - */ - for (unsigned i = 0; i < length; i++) - arr->arr[i] = create_immed(ctx->block, 0); _mesa_hash_table_insert(ctx->var_ht, var, arr); } -static struct ir3_array * +static nir_block * +nir_block_pred(nir_block *block) +{ + assert(block->predecessors->entries < 2); + if (block->predecessors->entries == 0) + return NULL; + return (nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key; +} + +static struct ir3_array_value * get_var(struct ir3_compile *ctx, nir_variable *var) { struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var); - return entry->data; + struct ir3_block *block = ctx->block; + struct ir3_nir_block_data *bd = get_block_data(ctx, block); + struct ir3_array *arr = entry->data; + + if (!bd->arrs[arr->aid]) { + struct ir3_array_value *av = ralloc_size(bd, sizeof(*av) + + (arr->length * sizeof(av->arr[0]))); + struct ir3_array_value *defn = NULL; + nir_block *pred_block; + + av->length = arr->length; + av->aid = arr->aid; + + /* For loops, we have to consider that we have not visited some + * of the blocks who should feed into the phi (ie. back-edges in + * the cfg).. for example: + * + * loop { + * block { load_var; ... } + * if then block {} else block {} + * block { store_var; ... } + * if then block {} else block {} + * block {...} + * } + * + * We can skip the phi if we can chase the block predecessors + * until finding the block previously defining the array without + * crossing a block that has more than one predecessor. + * + * Otherwise create phi's and resolve them as a post-pass after + * all the blocks have been visited (to handle back-edges). + */ + + for (pred_block = block->nblock; + pred_block && (pred_block->predecessors->entries < 2) && !defn; + pred_block = nir_block_pred(pred_block)) { + struct ir3_block *pblock = get_block(ctx, pred_block); + struct ir3_nir_block_data *pbd = pblock->bd; + if (!pbd) + continue; + defn = pbd->arrs[arr->aid]; + } + + if (defn) { + /* only one possible definer: */ + for (unsigned i = 0; i < arr->length; i++) + av->arr[i] = defn->arr[i]; + } else if (pred_block) { + /* not the first block, and multiple potential definers: */ + av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0])); + + for (unsigned i = 0; i < arr->length; i++) { + struct ir3_instruction *phi; + + phi = ir3_instr_create2(block, -1, OPC_META_PHI, + 1 + ctx->impl->num_blocks); + ir3_reg_create(phi, 0, 0); /* dst */ + + /* phi's should go at head of block: */ + list_delinit(&phi->node); + list_add(&phi->node, &block->instr_list); + + av->phis[i] = av->arr[i] = phi; + } + } else { + /* Some shaders end up reading array elements without + * first writing.. 
so initialize things to prevent null + * instr ptrs later: + */ + for (unsigned i = 0; i < arr->length; i++) + av->arr[i] = create_immed(block, 0); + } + + bd->arrs[arr->aid] = av; + } + + return bd->arrs[arr->aid]; +} + +static void +add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock, + struct ir3_array_value *av, BITSET_WORD *visited) +{ + struct ir3_block *block; + struct ir3_nir_block_data *bd; + + if (BITSET_TEST(visited, nblock->index)) + return; + + BITSET_SET(visited, nblock->index); + + block = get_block(ctx, nblock); + bd = block->bd; + + if (bd && bd->arrs[av->aid]) { + struct ir3_array_value *dav = bd->arrs[av->aid]; + for (unsigned i = 0; i < av->length; i++) { + ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr = + dav->arr[i]; + } + } else { + /* didn't find defn, recurse predecessors: */ + struct set_entry *entry; + set_foreach(nblock->predecessors, entry) { + add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited); + } + } +} + +static void +resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block) +{ + struct ir3_nir_block_data *bd = block->bd; + unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks); + + if (!bd) + return; + + /* TODO use nir dom_frontier to help us with this? */ + + for (unsigned i = 1; i <= ctx->num_arrays; i++) { + struct ir3_array_value *av = bd->arrs[i]; + BITSET_WORD visited[bitset_words]; + struct set_entry *entry; + + if (!(av && av->phis)) + continue; + + memset(visited, 0, sizeof(visited)); + set_foreach(block->nblock->predecessors, entry) { + add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited); + } + } } /* allocate a n element value array (to be populated by caller) and @@ -417,6 +602,22 @@ get_addr(struct ir3_compile *ctx, struct ir3_instruction *src) } static struct ir3_instruction * +get_predicate(struct ir3_compile *ctx, struct ir3_instruction *src) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *cond; + + /* NOTE: only cmps.*.* can write p0.x: */ + cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0); + cond->cat2.condition = IR3_COND_NE; + + /* condition always goes in predicate register: */ + cond->regs[0]->num = regid(REG_P0, 0); + + return cond; +} + +static struct ir3_instruction * create_uniform(struct ir3_compile *ctx, unsigned n) { struct ir3_instruction *mov; @@ -1029,7 +1230,7 @@ emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, { nir_deref_var *dvar = intr->variables[0]; nir_deref_array *darr = nir_deref_as_array(dvar->deref.child); - struct ir3_array *arr = get_var(ctx, dvar->var); + struct ir3_array_value *arr = get_var(ctx, dvar->var); compile_assert(ctx, dvar->deref.child && (dvar->deref.child->deref_type == nir_deref_type_array)); @@ -1069,7 +1270,7 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) { nir_deref_var *dvar = intr->variables[0]; nir_deref_array *darr = nir_deref_as_array(dvar->deref.child); - struct ir3_array *arr = get_var(ctx, dvar->var); + struct ir3_array_value *arr = get_var(ctx, dvar->var); struct ir3_instruction **src; compile_assert(ctx, dvar->deref.child && @@ -1245,6 +1446,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) cond = create_immed(b, 1); } + /* NOTE: only cmps.*.* can write p0.x: */ cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0); cond->cat2.condition = IR3_COND_NE; @@ -1558,6 +1760,71 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex) } static void +emit_phi(struct ir3_compile *ctx, nir_phi_instr *nphi) +{ + struct ir3_instruction 
*phi, **dst; + + /* NOTE: phi's should be lowered to scalar at this point */ + compile_assert(ctx, nphi->dest.ssa.num_components == 1); + + dst = get_dst(ctx, &nphi->dest, 1); + + phi = ir3_instr_create2(ctx->block, -1, OPC_META_PHI, + 1 + exec_list_length(&nphi->srcs)); + ir3_reg_create(phi, 0, 0); /* dst */ + phi->phi.nphi = nphi; + + dst[0] = phi; +} + +/* phi instructions are left partially constructed. We don't resolve + * their srcs until the end of the block, since (eg. loops) one of + * the phi's srcs might be defined after the phi due to back edges in + * the CFG. + */ +static void +resolve_phis(struct ir3_compile *ctx, struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + nir_phi_instr *nphi; + + /* phi's only come at start of block: */ + if (!(is_meta(instr) && (instr->opc == OPC_META_PHI))) + break; + + if (!instr->phi.nphi) + break; + + nphi = instr->phi.nphi; + instr->phi.nphi = NULL; + + foreach_list_typed(nir_phi_src, nsrc, node, &nphi->srcs) { + struct ir3_instruction *src = get_src(ctx, &nsrc->src)[0]; + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + } + } + + resolve_array_phis(ctx, block); +} + +static void +emit_jump(struct ir3_compile *ctx, nir_jump_instr *jump) +{ + switch (jump->type) { + case nir_jump_break: + case nir_jump_continue: + /* I *think* we can simply just ignore this, and use the + * successor block link to figure out where we need to + * jump to for break/continue + */ + break; + default: + compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type); + break; + } +} + +static void emit_instr(struct ir3_compile *ctx, nir_instr *instr) { switch (instr->type) { @@ -1590,45 +1857,112 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr) } break; } - case nir_instr_type_call: - case nir_instr_type_jump: case nir_instr_type_phi: + emit_phi(ctx, nir_instr_as_phi(instr)); + break; + case nir_instr_type_jump: + emit_jump(ctx, nir_instr_as_jump(instr)); + break; + case nir_instr_type_call: case nir_instr_type_parallel_copy: compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type); break; } } +static struct ir3_block * +get_block(struct ir3_compile *ctx, nir_block *nblock) +{ + struct ir3_block *block; + struct hash_entry *entry; + entry = _mesa_hash_table_search(ctx->block_ht, nblock); + if (entry) + return entry->data; + + block = ir3_block_create(ctx->ir); + block->nblock = nblock; + _mesa_hash_table_insert(ctx->block_ht, nblock, block); + + return block; +} + static void -emit_block(struct ir3_compile *ctx, nir_block *block) +emit_block(struct ir3_compile *ctx, nir_block *nblock) { - nir_foreach_instr(block, instr) { + struct ir3_block *block = get_block(ctx, nblock); + + for (int i = 0; i < ARRAY_SIZE(block->successors); i++) { + if (nblock->successors[i]) { + block->successors[i] = + get_block(ctx, nblock->successors[i]); + } + } + + ctx->block = block; + list_addtail(&block->node, &ctx->ir->block_list); + + nir_foreach_instr(nblock, instr) { emit_instr(ctx, instr); if (ctx->error) return; } } +static void emit_cf_list(struct ir3_compile *ctx, struct exec_list *list); + static void -emit_function(struct ir3_compile *ctx, nir_function_impl *impl) +emit_if(struct ir3_compile *ctx, nir_if *nif) +{ + struct ir3_instruction *condition = get_src(ctx, &nif->condition)[0]; + + ctx->block->condition = + get_predicate(ctx, ir3_b2n(condition->block, condition)); + + emit_cf_list(ctx, &nif->then_list); + emit_cf_list(ctx, &nif->else_list); +} + +static void +emit_loop(struct 
ir3_compile *ctx, nir_loop *nloop) +{ + emit_cf_list(ctx, &nloop->body); +} + +static void +emit_cf_list(struct ir3_compile *ctx, struct exec_list *list) { - foreach_list_typed(nir_cf_node, node, node, &impl->body) { + foreach_list_typed(nir_cf_node, node, node, list) { switch (node->type) { case nir_cf_node_block: emit_block(ctx, nir_cf_node_as_block(node)); break; case nir_cf_node_if: + emit_if(ctx, nir_cf_node_as_if(node)); + break; case nir_cf_node_loop: + emit_loop(ctx, nir_cf_node_as_loop(node)); + break; case nir_cf_node_function: compile_error(ctx, "TODO\n"); break; } - if (ctx->error) - return; } } static void +emit_function(struct ir3_compile *ctx, nir_function_impl *impl) +{ + emit_cf_list(ctx, &impl->body); + emit_block(ctx, impl->end_block); + + /* at this point, we should have a single empty block, + * into which we emit the 'end' instruction. + */ + compile_assert(ctx, list_empty(&ctx->block->instr_list)); + ir3_END(ctx->block); +} + +static void setup_input(struct ir3_compile *ctx, nir_variable *in) { struct ir3_shader_variant *so = ctx->so; @@ -1787,8 +2121,19 @@ setup_output(struct ir3_compile *ctx, nir_variable *out) static void emit_instructions(struct ir3_compile *ctx) { - unsigned ninputs = exec_list_length(&ctx->s->inputs) * 4; - unsigned noutputs = exec_list_length(&ctx->s->outputs) * 4; + unsigned ninputs, noutputs; + nir_function_impl *fxn = NULL; + + /* Find the main function: */ + nir_foreach_overload(ctx->s, overload) { + compile_assert(ctx, strcmp(overload->function->name, "main") == 0); + compile_assert(ctx, overload->impl); + fxn = overload->impl; + break; + } + + ninputs = exec_list_length(&ctx->s->inputs) * 4; + noutputs = exec_list_length(&ctx->s->outputs) * 4; /* we need to allocate big enough outputs array so that * we can stuff the kill's at the end. 
Likewise for vtx @@ -1801,8 +2146,11 @@ emit_instructions(struct ir3_compile *ctx) } ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs); - ctx->block = ir3_block_create(ctx->ir); - ctx->ir->block = ctx->block; + + /* Create inputs in first block: */ + ctx->block = get_block(ctx, fxn->start_block); + ctx->in_block = ctx->block; + list_addtail(&ctx->block->node, &ctx->ir->block_list); if (ctx->so->type == SHADER_FRAGMENT) { ctx->ir->noutputs -= ARRAY_SIZE(ctx->kill); @@ -1838,13 +2186,12 @@ emit_instructions(struct ir3_compile *ctx) declare_var(ctx, var); } - /* Find the main function and emit the body: */ - nir_foreach_overload(ctx->s, overload) { - compile_assert(ctx, strcmp(overload->function->name, "main") == 0); - compile_assert(ctx, overload->impl); - emit_function(ctx, overload->impl); - if (ctx->error) - return; + /* And emit the body: */ + ctx->impl = fxn; + emit_function(ctx, fxn); + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + resolve_phis(ctx, block); } } @@ -1906,13 +2253,13 @@ fixup_frag_inputs(struct ir3_compile *ctx) so->pos_regid = regid; /* r0.x */ - instr = create_input(ctx->block, NULL, ir->ninputs); + instr = create_input(ctx->in_block, NULL, ir->ninputs); instr->regs[0]->num = regid++; inputs[ir->ninputs++] = instr; ctx->frag_pos->regs[1]->instr = instr; /* r0.y */ - instr = create_input(ctx->block, NULL, ir->ninputs); + instr = create_input(ctx->in_block, NULL, ir->ninputs); instr->regs[0]->num = regid++; inputs[ir->ninputs++] = instr; ctx->frag_pos->regs[2]->instr = instr; @@ -1998,6 +2345,10 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, out = out->regs[1]->instr; out->regs[0]->flags |= IR3_REG_HALF; } + + if (out->category == 1) { + out->cat1.dst_type = half_type(out->cat1.dst_type); + } } } @@ -2058,6 +2409,11 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_legalize(ir, &so->has_samp, &max_bary); + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("AFTER LEGALIZE:\n"); + ir3_print(ir); + } + /* fixup input/outputs: */ for (i = 0; i < so->outputs_count; i++) { so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c index a477bd4b237..8c7c80f7aae 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -54,6 +54,13 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags) /* TODO: remove this hack: */ if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO)) return false; + /* TODO: we currently don't handle left/right neighbors + * very well when inserting parallel-copies into phi.. + * to avoid problems don't eliminate a mov coming out + * of phi.. 
+ */ + if (is_meta(src_instr) && (src_instr->opc == OPC_META_PHI)) + return false; return true; } return false; @@ -390,7 +397,7 @@ instr_cp(struct ir3_instruction *instr, unsigned *flags) void ir3_cp(struct ir3 *ir) { - ir3_clear_mark(ir->block->shader); + ir3_clear_mark(ir); for (unsigned i = 0; i < ir->noutputs; i++) { if (ir->outputs[i]) { @@ -400,4 +407,9 @@ ir3_cp(struct ir3 *ir) ir->outputs[i] = out; } } + + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + if (block->condition) + block->condition = instr_cp(block->condition, NULL); + } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c index 6fc8b1762ff..3a108243479 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c @@ -134,6 +134,8 @@ remove_unused_by_block(struct ir3_block *block) { list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) { if (!ir3_instr_check_mark(instr)) { + if (is_flow(instr) && (instr->opc == OPC_END)) + continue; /* mark it, in case it is input, so we can * remove unused inputs: */ @@ -149,13 +151,21 @@ ir3_depth(struct ir3 *ir) { unsigned i; - ir3_clear_mark(ir->block->shader); + ir3_clear_mark(ir); for (i = 0; i < ir->noutputs; i++) if (ir->outputs[i]) ir3_instr_depth(ir->outputs[i]); + /* We also need to account for if-condition: */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + if (block->condition) + ir3_instr_depth(block->condition); + } + /* mark un-used instructions: */ - remove_unused_by_block(ir->block); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + remove_unused_by_block(block); + } /* cleanup unused inputs: */ for (i = 0; i < ir->ninputs; i++) { diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c index 1fe09cc11e5..70d9b08e019 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_group.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_group.c @@ -116,6 +116,10 @@ restart: conflict = conflicts(instr->cp.left, left) || conflicts(instr->cp.right, right); + /* RA can't yet deal very well w/ group'd phi's: */ + if (is_meta(instr) && (instr->opc == OPC_META_PHI)) + conflict = true; + /* we also can't have an instr twice in the group: */ for (j = i + 1; (j < n) && !conflict; j++) if (ops->get(arr, j) == instr) @@ -226,7 +230,6 @@ find_neighbors(struct ir3 *ir) for (i = 0; i < ir->noutputs; i += 4) group_n(&arr_ops_out, &ir->outputs[i], 4); - for (i = 0; i < ir->noutputs; i++) { if (ir->outputs[i]) { struct ir3_instruction *instr = ir->outputs[i]; @@ -238,6 +241,6 @@ find_neighbors(struct ir3 *ir) void ir3_group(struct ir3 *ir) { - ir3_clear_mark(ir->block->shader); + ir3_clear_mark(ir); find_neighbors(ir); } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c index 34055f4c612..f4a4223ae17 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c @@ -42,15 +42,28 @@ */ struct ir3_legalize_ctx { - struct ir3_block *block; bool has_samp; int max_bary; }; +/* We want to evaluate each block from the position of any other + * predecessor block, in order that the flags set are the union + * of all possible program paths. For stopping condition, we + * want to stop when the pair of <pred-block, current-block> has + * been visited already. + * + * XXX is that completely true? 
We could have different needs_xyz + * flags set depending on path leading to pred-block.. we could + * do *most* of this based on chasing src instructions ptrs (and + * following all phi srcs).. except the write-after-read hazzard. + * + * For now we just set ss/sy flag on first instruction on block, + * and handle everything within the block as before. + */ + static void -legalize(struct ir3_legalize_ctx *ctx) +legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) { - struct ir3_block *block = ctx->block; struct ir3_instruction *last_input = NULL; struct ir3_instruction *last_rel = NULL; struct list_head instr_list; @@ -203,6 +216,7 @@ legalize(struct ir3_legalize_ctx *ctx) ir3_reg_create(baryf, regid(0, 0), 0); /* insert the dummy bary.f after last_input: */ + list_delinit(&baryf->node); list_add(&baryf->node, &last_input->node); last_input = baryf; @@ -213,23 +227,177 @@ legalize(struct ir3_legalize_ctx *ctx) if (last_rel) last_rel->flags |= IR3_INSTR_UL; - /* create/add 'end' instruction: */ - ir3_instr_create(block, 0, OPC_END); - list_first_entry(&block->instr_list, struct ir3_instruction, node) ->flags |= IR3_INSTR_SS | IR3_INSTR_SY; } +/* NOTE: branch instructions are always the last instruction(s) + * in the block. We take advantage of this as we resolve the + * branches, since "if (foo) break;" constructs turn into + * something like: + * + * block3 { + * ... + * 0029:021: mov.s32s32 r62.x, r1.y + * 0082:022: br !p0.x, target=block5 + * 0083:023: br p0.x, target=block4 + * // succs: if _[0029:021: mov.s32s32] block4; else block5; + * } + * block4 { + * 0084:024: jump, target=block6 + * // succs: block6; + * } + * block5 { + * 0085:025: jump, target=block7 + * // succs: block7; + * } + * + * ie. only instruction in block4/block5 is a jump, so when + * resolving branches we can easily detect this by checking + * that the first instruction in the target block is itself + * a jump, and setup the br directly to the jump's target + * (and strip back out the now unreached jump) + * + * TODO sometimes we end up with things like: + * + * br !p0.x, #2 + * br p0.x, #12 + * add.u r0.y, r0.y, 1 + * + * If we swapped the order of the branches, we could drop one. 
+ */ +static struct ir3_block * +resolve_dest_block(struct ir3_block *block) +{ + /* special case for last block: */ + if (!block->successors[0]) + return block; + + /* NOTE that we may or may not have inserted the jump + * in the target block yet, so conditions to resolve + * the dest to the dest block's successor are: + * + * (1) successor[1] == NULL && + * (2) (block-is-empty || only-instr-is-jump) + */ + if (block->successors[1] == NULL) { + if (list_empty(&block->instr_list)) { + return block->successors[0]; + } else if (list_length(&block->instr_list) == 1) { + struct ir3_instruction *instr = list_first_entry( + &block->instr_list, struct ir3_instruction, node); + if (is_flow(instr) && (instr->opc == OPC_JUMP)) + return block->successors[0]; + } + } + return block; +} + +static bool +resolve_jump(struct ir3_instruction *instr) +{ + struct ir3_block *tblock = + resolve_dest_block(instr->cat0.target); + struct ir3_instruction *target; + + if (tblock != instr->cat0.target) { + list_delinit(&instr->cat0.target->node); + instr->cat0.target = tblock; + return true; + } + + target = list_first_entry(&tblock->instr_list, + struct ir3_instruction, node); + + if ((!target) || (target->ip == (instr->ip + 1))) { + list_delinit(&instr->node); + return true; + } else { + instr->cat0.immed = + (int)target->ip - (int)instr->ip; + } + return false; +} + +/* resolve jumps, removing jumps/branches to immediately following + * instruction which we end up with from earlier stages. Since + * removing an instruction can invalidate earlier instruction's + * branch offsets, we need to do this iteratively until no more + * branches are removed. + */ +static bool +resolve_jumps(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) + if (is_flow(instr) && instr->cat0.target) + if (resolve_jump(instr)) + return true; + + return false; +} + +/* we want to mark points where divergent flow control re-converges + * with (jp) flags. For now, since we don't do any optimization for + * things that start out as a 'do {} while()', re-convergence points + * will always be a branch or jump target. Note that this is overly + * conservative, since unconditional jump targets are not convergence + * points, we are just assuming that the other path to reach the jump + * target was divergent. If we were clever enough to optimize the + * jump at end of a loop back to a conditional branch into a single + * conditional branch, ie. like: + * + * add.f r1.w, r0.x, (neg)(r)c2.x <= loop start + * mul.f r1.z, r1.z, r0.x + * mul.f r1.y, r1.y, r0.x + * mul.f r0.z, r1.x, r0.x + * mul.f r0.w, r0.y, r0.x + * cmps.f.ge r0.x, (r)c2.y, (r)r1.w + * add.s r0.x, (r)r0.x, (r)-1 + * sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x + * cmps.f.eq p0.x, r0.x, c3.y + * mov.f32f32 r0.x, r1.w + * mov.f32f32 r0.y, r0.w + * mov.f32f32 r1.x, r0.z + * (rpt2)nop + * br !p0.x, #-13 + * (jp)mul.f r0.x, c263.y, r1.y + * + * Then we'd have to be more clever, as the convergence point is no + * longer a branch or jump target. 
+ */ +static void +mark_convergence_points(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + if (is_flow(instr) && instr->cat0.target) { + struct ir3_instruction *target = + list_first_entry(&instr->cat0.target->instr_list, + struct ir3_instruction, node); + target->flags |= IR3_INSTR_JP; + } + } + } +} + void ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary) { struct ir3_legalize_ctx ctx = { - .block = ir->block, .max_bary = -1, }; - legalize(&ctx); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + legalize_block(&ctx, block); + } *has_samp = ctx.has_samp; *max_bary = ctx.max_bary; + + do { + ir3_count_instructions(ir); + } while(resolve_jumps(ir)); + + mark_convergence_points(ir); } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c index 965c834b8aa..f377982dd5e 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_print.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c @@ -137,6 +137,16 @@ tab(int lvl) printf("\t"); } +static uint32_t +block_id(struct ir3_block *block) +{ +#ifdef DEBUG + return block->serialno; +#else + return (uint32_t)(uint64_t)block; +#endif +} + static void print_instr(struct ir3_instruction *instr, int lvl) { @@ -173,6 +183,14 @@ print_instr(struct ir3_instruction *instr, int lvl) } } + if (is_flow(instr) && instr->cat0.target) { + /* the predicate register src is implied: */ + if (instr->opc == OPC_BR) { + printf(" %sp0.x", instr->cat0.inv ? "!" : ""); + } + printf(", target=block%u", block_id(instr->cat0.target)); + } + printf("\n"); } @@ -184,19 +202,31 @@ void ir3_print_instr(struct ir3_instruction *instr) static void print_block(struct ir3_block *block, int lvl) { - tab(lvl); printf("block {\n"); + tab(lvl); printf("block%u {\n", block_id(block)); list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { print_instr(instr, lvl+1); } + if (block->successors[1]) { + /* leading into if/else: */ + tab(lvl+1); + printf("/* succs: if _["); + print_instr_name(block->condition); + printf("] block%u; else block%u; */\n", + block_id(block->successors[0]), + block_id(block->successors[1])); + } else if (block->successors[0]) { + tab(lvl+1); + printf("/* succs: block%u; */\n", + block_id(block->successors[0])); + } tab(lvl); printf("}\n"); } void ir3_print(struct ir3 *ir) { - struct ir3_block *block = ir->block; - - print_block(block, 0); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) + print_block(block, 0); for (unsigned i = 0; i < ir->noutputs; i++) { if (!ir->outputs[i]) diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c index 394c63f646d..359cd9a0d5d 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c @@ -29,6 +29,7 @@ #include "util/u_math.h" #include "util/register_allocate.h" #include "util/ralloc.h" +#include "util/bitset.h" #include "ir3.h" #include "ir3_compiler.h" @@ -255,6 +256,14 @@ struct ir3_ra_ctx { unsigned *def, *use; /* def/use table */ }; +/* additional block-data (per-block) */ +struct ir3_ra_block_data { + BITSET_WORD *def; /* variables defined before used in block */ + BITSET_WORD *use; /* variables used before defined in block */ + BITSET_WORD *livein; /* which defs reach entry point of block */ + BITSET_WORD *liveout; /* which defs reach exit point of block */ +}; + static bool 
is_half(struct ir3_instruction *instr) { @@ -369,7 +378,39 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off) *sz = util_last_bit(instr->regs[0]->wrmask); } *off = 0; - return instr; + d = instr; + } + + if (d->regs[0]->flags & IR3_REG_PHI_SRC) { + struct ir3_instruction *phi = d->regs[0]->instr; + struct ir3_instruction *dd; + int dsz, doff; + + dd = get_definer(phi, &dsz, &doff); + + *sz = MAX2(*sz, dsz); + *off = doff; + + if (dd->ip < d->ip) { + d = dd; + } + } + + if (is_meta(d) && (d->opc == OPC_META_PHI)) { + /* we have already inserted parallel-copies into + * the phi, so we don't need to chase definers + */ + struct ir3_register *src; + + /* note: don't use foreach_ssa_src as this gets called once + * while assigning regs (which clears SSA flag) + */ + foreach_src(src, d) { + if (!src->instr) + continue; + if (src->instr->ip < d->ip) + d = src->instr; + } } if (is_meta(d) && (d->opc == OPC_META_FO)) { @@ -396,13 +437,11 @@ static void ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) { list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - instr->ip = ctx->instr_cnt++; - } - - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { struct ir3_instruction *defn; int cls, sz, off; + ctx->instr_cnt++; + if (instr->regs_count == 0) continue; @@ -431,8 +470,11 @@ static void ra_init(struct ir3_ra_ctx *ctx) { ir3_clear_mark(ctx->ir); + ir3_count_instructions(ctx->ir); - ra_block_name_instructions(ctx, ctx->ir->block); + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + ra_block_name_instructions(ctx, block); + } /* figure out the base register name for each class. The * actual ra name is class_base[cls] + instr->name; @@ -448,6 +490,16 @@ ra_init(struct ir3_ra_ctx *ctx) ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); } +static unsigned +ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn) +{ + unsigned name; + debug_assert(cls >= 0); + name = ctx->class_base[cls] + defn->name; + debug_assert(name < ctx->alloc_count); + return name; +} + static void ra_destroy(struct ir3_ra_ctx *ctx) { @@ -457,6 +509,18 @@ ra_destroy(struct ir3_ra_ctx *ctx) static void ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) { + struct ir3_ra_block_data *bd; + unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + + bd = rzalloc(ctx->g, struct ir3_ra_block_data); + + bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words); + + block->bd = bd; + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { struct ir3_instruction *src; @@ -474,7 +538,15 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) * fanin: used to collect values from lower class and assemble * them together into a higher class, for example arguments * to texture sample instructions; We consider these to be - * defined at the fanin node. + * defined at the earliest fanin source. + * + * phi: used to merge values from different flow control paths + * to the same reg. Consider defined at earliest phi src, + * and update all the other phi src's (which may come later + * in the program) as users to extend the var's live range. + * + * Most of this, other than phi, is completely handled in the + * get_definer() helper. 
* * In either case, we trace the instruction back to the original * definer and consider that as the def/use ip. @@ -491,11 +563,15 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) */ cls = size_to_class(sz, is_half(defn)); if (cls >= 0) { - unsigned name = ctx->class_base[cls] + defn->name; + unsigned name = ra_name(ctx, cls, defn); + ctx->def[name] = defn->ip; ctx->use[name] = defn->ip; - debug_assert(name < ctx->alloc_count); + /* since we are in SSA at this point: */ + debug_assert(!BITSET_TEST(bd->use, name)); + + BITSET_SET(bd->def, name); if (is_half(defn)) { ra_set_node_class(ctx->g, name, @@ -504,6 +580,24 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) ra_set_node_class(ctx->g, name, ctx->set->classes[cls]); } + + /* extend the live range for phi srcs, which may come + * from the bottom of the loop + */ + if (defn->regs[0]->flags & IR3_REG_PHI_SRC) { + struct ir3_instruction *phi = defn->regs[0]->instr; + foreach_ssa_src(src, phi) { + /* if src is after phi, then we need to extend + * the liverange to the end of src's block: + */ + if (src->ip > phi->ip) { + struct ir3_instruction *last = + list_last_entry(&src->block->instr_list, + struct ir3_instruction, node); + ctx->use[name] = MAX2(ctx->use[name], last->ip); + } + } + } } } } @@ -516,12 +610,59 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) srcdefn = get_definer(src, &sz, &off); cls = size_to_class(sz, is_half(srcdefn)); if (cls >= 0) { - unsigned name = ctx->class_base[cls] + srcdefn->name; - ctx->use[name] = instr->ip; + unsigned name = ra_name(ctx, cls, srcdefn); + ctx->use[name] = MAX2(ctx->use[name], instr->ip); + if (!BITSET_TEST(bd->def, name)) + BITSET_SET(bd->use, name); + } + } + } + } +} + +static bool +ra_compute_livein_liveout(struct ir3_ra_ctx *ctx) +{ + unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + bool progress = false; + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + struct ir3_ra_block_data *bd = block->bd; + + /* update livein: */ + for (unsigned i = 0; i < bitset_words; i++) { + BITSET_WORD new_livein = + (bd->use[i] | (bd->liveout[i] & ~bd->def[i])); + + if (new_livein & ~bd->livein[i]) { + bd->livein[i] |= new_livein; + progress = true; + } + } + + /* update liveout: */ + for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) { + struct ir3_block *succ = block->successors[j]; + struct ir3_ra_block_data *succ_bd; + + if (!succ) + continue; + + succ_bd = succ->bd; + + for (unsigned i = 0; i < bitset_words; i++) { + BITSET_WORD new_liveout = + (succ_bd->livein[i] & ~bd->liveout[i]); + + if (new_liveout) { + bd->liveout[i] |= new_liveout; + progress = true; } } } } + + return progress; } static void @@ -529,7 +670,34 @@ ra_add_interference(struct ir3_ra_ctx *ctx) { struct ir3 *ir = ctx->ir; - ra_block_compute_live_ranges(ctx, ctx->ir->block); + /* compute live ranges (use/def) on a block level, also updating + * block's def/use bitmasks (used below to calculate per-block + * livein/liveout): + */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + ra_block_compute_live_ranges(ctx, block); + } + + /* update per-block livein/liveout: */ + while (ra_compute_livein_liveout(ctx)) {} + + /* extend start/end ranges based on livein/liveout info from cfg: */ + unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + struct ir3_ra_block_data *bd = block->bd; + + for 
(unsigned i = 0; i < bitset_words; i++) { + if (BITSET_TEST(bd->livein, i)) { + ctx->def[i] = MIN2(ctx->def[i], block->start_ip); + ctx->use[i] = MAX2(ctx->use[i], block->start_ip); + } + + if (BITSET_TEST(bd->liveout, i)) { + ctx->def[i] = MIN2(ctx->def[i], block->end_ip); + ctx->use[i] = MAX2(ctx->use[i], block->end_ip); + } + } + } /* need to fix things up to keep outputs live: */ for (unsigned i = 0; i < ir->noutputs; i++) { @@ -540,7 +708,7 @@ ra_add_interference(struct ir3_ra_ctx *ctx) defn = get_definer(instr, &sz, &off); cls = size_to_class(sz, is_half(defn)); if (cls >= 0) { - unsigned name = ctx->class_base[cls] + defn->name; + unsigned name = ra_name(ctx, cls, defn); ctx->use[name] = ctx->instr_cnt; } } @@ -555,23 +723,6 @@ ra_add_interference(struct ir3_ra_ctx *ctx) } } -static type_t half_type(type_t type) -{ - switch (type) { - case TYPE_F32: return TYPE_F16; - case TYPE_U32: return TYPE_U16; - case TYPE_S32: return TYPE_S16; - /* instructions may already be fixed up: */ - case TYPE_F16: - case TYPE_U16: - case TYPE_S16: - return type; - default: - assert(0); - return ~0; - } -} - /* some instructions need fix-up if dst register is half precision: */ static void fixup_half_instr_dst(struct ir3_instruction *instr) { @@ -633,7 +784,7 @@ reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, defn = get_definer(instr, &sz, &off); cls = size_to_class(sz, is_half(defn)); if (cls >= 0) { - unsigned name = ctx->class_base[cls] + defn->name; + unsigned name = ra_name(ctx, cls, defn); unsigned r = ra_get_node_reg(ctx->g, name); unsigned num = ctx->set->ra_reg_to_gpr[r] + off; @@ -641,7 +792,7 @@ reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, num += reg->offset; reg->num = num; - reg->flags &= ~IR3_REG_SSA; + reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC); if (is_half(defn)) reg->flags |= IR3_REG_HALF; @@ -686,8 +837,8 @@ ra_alloc(struct ir3_ra_ctx *ctx) unsigned i = 0, j; if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) { struct ir3_instruction *instr = ir->inputs[i]; - unsigned cls = size_to_class(1, true); - unsigned name = ctx->class_base[cls] + instr->name; + int cls = size_to_class(1, true); + unsigned name = ra_name(ctx, cls, instr); unsigned reg = ctx->set->gpr_to_ra_reg[cls][0]; /* if we have frag_face, it gets hr0.x */ @@ -706,8 +857,7 @@ ra_alloc(struct ir3_ra_ctx *ctx) unsigned name, reg; cls = size_to_class(sz, is_half(defn)); - debug_assert(cls >= 0); - name = ctx->class_base[cls] + defn->name; + name = ra_name(ctx, cls, defn); reg = ctx->set->gpr_to_ra_reg[cls][j]; ra_set_node_reg(ctx->g, name, reg); @@ -720,7 +870,9 @@ ra_alloc(struct ir3_ra_ctx *ctx) if (!ra_allocate(ctx->g)) return -1; - ra_block_alloc(ctx, ctx->ir->block); + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + ra_block_alloc(ctx, block); + } return 0; } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c index 0d404a83583..49a4426d163 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -205,6 +205,16 @@ instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, struct ir3_instruction *src; unsigned delay = 0; + /* Phi instructions can have a dependency on something not + * scheduled yet (for ex, loops). But OTOH we don't really + * care. 
By definition phi's should appear at the top of + * the block, and it's sources should be values from the + * previously executing block, so they are always ready to + * be scheduled: + */ + if (is_meta(instr) && (instr->opc == OPC_META_PHI)) + return 0; + foreach_ssa_src(src, instr) { /* if dependency not scheduled, we aren't ready yet: */ if (!is_scheduled(src)) @@ -422,13 +432,87 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) } } } + + /* And lastly, insert branch/jump instructions to take us to + * the next block. Later we'll strip back out the branches + * that simply jump to next instruction. + */ + if (block->successors[1]) { + /* if/else, conditional branches to "then" or "else": */ + struct ir3_instruction *br; + unsigned delay = 6; + + debug_assert(ctx->pred); + debug_assert(block->condition); + + delay -= distance(ctx, ctx->pred, delay); + + while (delay > 0) { + ir3_NOP(block); + delay--; + } + + /* create "else" branch first (since "then" block should + * frequently/always end up being a fall-thru): + */ + br = ir3_BR(block); + br->cat0.inv = true; + br->cat0.target = block->successors[1]; + + /* NOTE: we have to hard code delay of 6 above, since + * we want to insert the nop's before constructing the + * branch. Throw in an assert so we notice if this + * ever breaks on future generation: + */ + debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6); + + br = ir3_BR(block); + br->cat0.target = block->successors[0]; + + } else if (block->successors[0]) { + /* otherwise unconditional jump to next block: */ + struct ir3_instruction *jmp; + + jmp = ir3_JUMP(block); + jmp->cat0.target = block->successors[0]; + } + + /* NOTE: if we kept track of the predecessors, we could do a better + * job w/ (jp) flags.. every node w/ > predecessor is a join point. + * Note that as we eliminate blocks which contain only an unconditional + * jump we probably need to propagate (jp) flag.. + */ +} + +/* this is needed to ensure later RA stage succeeds: */ +static void +sched_insert_parallel_copies(struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + if (is_meta(instr) && (instr->opc == OPC_META_PHI)) { + struct ir3_register *reg; + foreach_src(reg, instr) { + struct ir3_instruction *src = reg->instr; + struct ir3_instruction *mov = + ir3_MOV(src->block, src, TYPE_U32); + mov->regs[0]->flags |= IR3_REG_PHI_SRC; + mov->regs[0]->instr = instr; + reg->instr = mov; + } + } + } } int ir3_sched(struct ir3 *ir) { struct ir3_sched_ctx ctx = {0}; - ir3_clear_mark(ir->block->shader); - sched_block(&ctx, ir->block); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + sched_insert_parallel_copies(block); + } + ir3_clear_mark(ir); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + sched_block(&ctx, block); + } if (ctx.error) return -1; return 0; |
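
Notes on the new algorithms in this patch, with simplified sketches. All code below is illustrative only: toy types and names (toy_block, toy_instr, and friends) are invented for the sketches and are not part of ir3 or of the patch.

ir3_count_instructions() (new in ir3.c) linearizes the block list, giving every instruction a global ip and recording each block's start_ip/end_ip; later passes only compare these positions (liveness in ir3_ra.c, branch offsets in ir3_legalize.c). A minimal sketch of the numbering, assuming non-empty blocks:

#include <stdio.h>

/* toy stand-in for ir3_block; "ninstrs" replaces the instr_list: */
struct toy_block {
	unsigned ninstrs;
	unsigned start_ip, end_ip;
};

/* one monotonically increasing ip across all blocks, with first/last
 * ip recorded per block (assumes every block has at least one instr):
 */
static void count_instructions(struct toy_block *blocks, unsigned nblocks)
{
	unsigned ip = 0;
	for (unsigned b = 0; b < nblocks; b++) {
		blocks[b].start_ip = ip;
		ip += blocks[b].ninstrs;
		blocks[b].end_ip = ip - 1;
	}
}

int main(void)
{
	struct toy_block blocks[3] = {
		{ .ninstrs = 4 }, { .ninstrs = 2 }, { .ninstrs = 3 },
	};
	count_instructions(blocks, 3);
	for (unsigned b = 0; b < 3; b++)
		printf("block%u: ip %u..%u\n", b, blocks[b].start_ip, blocks[b].end_ip);
	return 0;
}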
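
half_type() moves from ir3_ra.c into ir3.h because the NIR frontend now needs it too: when an output mov is demoted to half precision (IR3_REG_HALF on its dst), its cat1.dst_type has to be narrowed to match, which is the small new hunk in ir3_compile_shader_nir(). Restated standalone, with a sanity check:

#include <assert.h>
#include <stdio.h>

typedef enum { TYPE_F16, TYPE_F32, TYPE_U16, TYPE_U32, TYPE_S16, TYPE_S32 } type_t;

/* map a 32-bit type to its 16-bit counterpart; idempotent for types
 * that are already half precision:
 */
static type_t half_type(type_t type)
{
	switch (type) {
	case TYPE_F32: return TYPE_F16;
	case TYPE_U32: return TYPE_U16;
	case TYPE_S32: return TYPE_S16;
	case TYPE_F16:
	case TYPE_U16:
	case TYPE_S16: return type;
	}
	assert(0);
	return TYPE_F32;
}

int main(void)
{
	assert(half_type(TYPE_F32) == TYPE_F16);
	assert(half_type(TYPE_F16) == TYPE_F16);
	printf("half_type ok\n");
	return 0;
}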
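
In ir3_compiler_nir.c, get_var() now builds a per-block view of each array (ir3_array_value). To avoid a phi per element for every cross-block read, it first chases predecessors while the chain is unambiguous: stop at the first block that already holds values for the array, give up and create phis at the first join point (a block with more than one predecessor), and fall back to zero-initialization if the walk runs off the top of the CFG (a read before any write). A sketch of that three-way decision; toy_block and the enum are invented:

#include <stdio.h>

struct toy_block {
	struct toy_block *preds[2];
	unsigned npreds;
	int has_def;          /* block already holds values for the array? */
};

enum resolve { DEF_FOUND, NEED_PHIS, UNDEFINED };

/* chase up the cfg while the chain is unambiguous; stop at the first
 * defining block, at a join point (>1 preds means phis are required),
 * or at the top of the cfg (read before any write):
 */
static enum resolve
chase_preds(struct toy_block *block, struct toy_block **def_out)
{
	struct toy_block *b = block;
	while (b && b->npreds < 2) {
		if (b != block && b->has_def) {
			*def_out = b;
			return DEF_FOUND;
		}
		b = (b->npreds == 1) ? b->preds[0] : NULL;
	}
	return b ? NEED_PHIS : UNDEFINED;
}

int main(void)
{
	struct toy_block entry = { .has_def = 1 };
	struct toy_block body  = { .preds = { &entry }, .npreds = 1 };
	struct toy_block *def = NULL;

	/* a straight-line read in 'body' finds the def in 'entry': */
	printf("result: %d\n", chase_preds(&body, &def));   /* DEF_FOUND */
	return 0;
}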
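
The phi srcs themselves (both for arrays and for NIR's own phis, via resolve_phis()) are filled in only after every block has been emitted, because with loops a src can be produced by a block that is visited after the phi. add_array_phi_srcs() walks each predecessor path until it finds a block defining the array, and the visited bitset is what makes the recursion terminate on a cyclic CFG. A toy version over a two-block loop:

#include <stdbool.h>
#include <stdio.h>

#define MAX_BLOCKS 8

struct toy_block {
	int id;
	struct toy_block *preds[2];
	unsigned npreds;
	bool has_def;
};

/* the first def found on each path up from the phi's predecessors
 * becomes a phi src; 'visited' keeps back-edges from looping forever:
 */
static void add_phi_srcs(struct toy_block *b, bool *visited)
{
	if (visited[b->id])
		return;
	visited[b->id] = true;

	if (b->has_def) {
		printf("  phi src from block%d\n", b->id);
		return;
	}
	for (unsigned i = 0; i < b->npreds; i++)
		add_phi_srcs(b->preds[i], visited);
}

int main(void)
{
	/* block0 (entry, defines) -> block1 (loop head) <-> block2 (body, defines) */
	struct toy_block b0 = { .id = 0, .has_def = true };
	struct toy_block b1 = { .id = 1 };
	struct toy_block b2 = { .id = 2, .has_def = true };
	bool visited[MAX_BLOCKS] = { false };

	b1.preds[0] = &b0; b1.preds[1] = &b2; b1.npreds = 2;
	b2.preds[0] = &b1; b2.npreds = 1;

	printf("resolving phi in block1:\n");
	for (unsigned i = 0; i < b1.npreds; i++)
		add_phi_srcs(b1.preds[i], visited);
	return 0;
}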
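
emit_function() is rebuilt around emit_cf_list(), a recursion over NIR's control-flow tree: blocks are emitted in program order, and if/loop nodes just recurse into their bodies. Notably, no branch instructions are emitted at this stage; only the block order and the successors[]/condition links are established, and the scheduler materializes br/jump from those later. The shape of the walk on a toy CF tree (types invented; NIR's exec lists simplified to next pointers):

#include <stdio.h>

enum cf_type { CF_BLOCK, CF_IF, CF_LOOP };

struct cf_node {
	enum cf_type type;
	const char *name;             /* CF_BLOCK only */
	struct cf_node *body[2];      /* CF_IF: then/else lists; CF_LOOP: body[0] */
	struct cf_node *next;         /* next node in the same list */
};

/* mirrors emit_cf_list()/emit_if()/emit_loop(): emit blocks in program
 * order, recursing into if/loop bodies:
 */
static void emit_cf_list(struct cf_node *n)
{
	for (; n; n = n->next) {
		switch (n->type) {
		case CF_BLOCK:
			printf("emit %s\n", n->name);
			break;
		case CF_IF:
			emit_cf_list(n->body[0]);   /* then_list */
			emit_cf_list(n->body[1]);   /* else_list */
			break;
		case CF_LOOP:
			emit_cf_list(n->body[0]);
			break;
		}
	}
}

int main(void)
{
	struct cf_node then_b = { CF_BLOCK, "then-block" };
	struct cf_node else_b = { CF_BLOCK, "else-block" };
	struct cf_node merge  = { CF_BLOCK, "merge-block" };
	struct cf_node ifn    = { CF_IF, 0, { &then_b, &else_b }, &merge };
	struct cf_node start  = { CF_BLOCK, "start-block", { 0 }, &ifn };

	emit_cf_list(&start);
	return 0;
}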
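
In ir3_legalize.c, resolve_jumps() has a subtle fixed-point structure: removing a branch that targets the very next instruction renumbers every later instruction, which can in turn make other branches become fall-throughs, so the pass re-runs ir3_count_instructions() and retries until nothing is removed. It terminates because each round removes at least one instruction. A toy model using absolute instruction indices instead of blocks and relative offsets; it also assumes no jump targets a removed jump itself (the real pass redirects those first, via resolve_dest_block()):

#include <stdbool.h>
#include <stdio.h>

struct toy_instr {
	bool is_jump;
	int target;          /* absolute index (ip) of the destination */
	const char *name;
};

/* remove one jump that lands on the immediately following instruction,
 * compacting the array and renumbering targets; returns true if
 * something was removed so the caller can retry:
 */
static bool resolve_jumps(struct toy_instr *instrs, int *count)
{
	for (int ip = 0; ip < *count; ip++) {
		if (!instrs[ip].is_jump || instrs[ip].target != ip + 1)
			continue;
		for (int j = ip; j < *count - 1; j++)
			instrs[j] = instrs[j + 1];
		(*count)--;
		/* fix up targets that pointed past the removed slot: */
		for (int j = 0; j < *count; j++)
			if (instrs[j].is_jump && instrs[j].target > ip)
				instrs[j].target--;
		return true;
	}
	return false;
}

int main(void)
{
	struct toy_instr instrs[] = {
		{ false, 0, "add"  },
		{ true,  2, "jump" },    /* targets next instr: removable */
		{ false, 0, "mul"  },
		{ true,  0, "jump" },    /* backwards: stays */
	};
	int count = 4;

	while (resolve_jumps(instrs, &count)) {}

	for (int ip = 0; ip < count; ip++) {
		printf("%d: %s", ip, instrs[ip].name);
		if (instrs[ip].is_jump)
			printf(" -> %d", instrs[ip].target);
		printf("\n");
	}
	return 0;
}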
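
The ir3_ra.c side is a textbook backward liveness problem layered over the existing linear def/use table: per block, 'use' collects names read before being written in that block and 'def' names written before being read; then livein = use | (liveout & ~def) and liveout = union of the successors' livein are iterated to a fixed point (ra_compute_livein_liveout() reports progress, and ra_add_interference() spins on it). Afterwards, any name live into a block has its range widened to the block's start_ip, and any name live out of it to end_ip. A compact, runnable version of the fixed point, using one plain bitmask where the patch uses BITSET_WORD arrays:

#include <stdbool.h>
#include <stdio.h>

#define NBLOCKS 3

struct toy_block {
	unsigned def, use;       /* one bit per variable: bit0 = a, bit1 = b */
	int succs[2];            /* successor indices, -1 = none */
	unsigned livein, liveout;
};

/* one pass of:  livein  = use | (liveout & ~def)
 *               liveout |= livein of each successor
 * returns whether anything changed; the caller loops to a fixed point:
 */
static bool compute_live(struct toy_block *blocks)
{
	bool progress = false;
	for (int b = 0; b < NBLOCKS; b++) {
		unsigned new_livein =
			blocks[b].use | (blocks[b].liveout & ~blocks[b].def);
		if (new_livein & ~blocks[b].livein) {
			blocks[b].livein |= new_livein;
			progress = true;
		}
		for (int s = 0; s < 2; s++) {
			int succ = blocks[b].succs[s];
			if (succ < 0)
				continue;
			unsigned new_liveout =
				blocks[succ].livein & ~blocks[b].liveout;
			if (new_liveout) {
				blocks[b].liveout |= new_liveout;
				progress = true;
			}
		}
	}
	return progress;
}

int main(void)
{
	/* block0: a=..   block1 (loop): use a, b=.., loops to itself or
	 * falls through   block2: use b
	 */
	struct toy_block blocks[NBLOCKS] = {
		{ .def = 1, .use = 0, .succs = {  1, -1 } },
		{ .def = 2, .use = 1, .succs = {  1,  2 } },
		{ .def = 0, .use = 2, .succs = { -1, -1 } },
	};

	while (compute_live(blocks)) {}

	for (int b = 0; b < NBLOCKS; b++)
		printf("block%d: livein=0x%x liveout=0x%x\n",
				b, blocks[b].livein, blocks[b].liveout);
	return 0;
}

Note that liveout of block1 ends up as 0x3: 'a' stays live around the back-edge, which is the same situation the phi-src range extension in ra_block_compute_live_ranges() has to handle explicitly.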
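
One scheduler detail worth calling out: the hard-coded delay of 6 in the branch-emission hunk is the write-to-read latency between the cmps.* producing p0.x and the br consuming it, and the debug_assert(ir3_delayslots(...) == 6) is there to catch a future generation where that assumption breaks. The nop padding then reduces to the following, where the distance argument means slots already elapsed since the predicate write, as with distance() in the patch:

/* nops needed before the conditional branch, given how many slots ago
 * p0.x was written (6 is the value the patch asserts via
 * ir3_delayslots()):
 */
static unsigned branch_nops(unsigned distance_to_pred_write)
{
	const unsigned delay = 6;
	return (distance_to_pred_write >= delay) ?
			0 : delay - distance_to_pred_write;
}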
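
Finally, sched_insert_parallel_copies() exists to keep RA manageable: every phi src gets a dedicated mov in the block that produces the value, with the mov's dst flagged IR3_REG_PHI_SRC pointing back at the phi. get_definer() can then treat the phi plus all of its copies as a single name, so the whole web coalesces into one register, and the matching ir3_cp.c hunk stops copy-propagation from eliminating those movs again. Roughly, in the dump style ir3_print now produces (SSA names and block numbers invented):

before:
	block1 { ssa_8 = phi(ssa_5, ssa_7); ... }        ; srcs defined in block0/block2

after:
	block0 { ...; mov.u32u32 ssa_9, ssa_5 }          ; dst flagged PHI_SRC -> phi
	block2 { ...; mov.u32u32 ssa_10, ssa_7 }         ; dst flagged PHI_SRC -> phi
	block1 { ssa_8 = phi(ssa_9, ssa_10); ... }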