Diffstat (limited to 'src/gallium/drivers')
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3.c               |  60
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3.h               |  92
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c  | 428
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_cp.c            |  14
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_depth.c         |  14
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_group.c         |   7
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_legalize.c      | 184
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_print.c         |  38
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_ra.c            | 226
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_sched.c         |  88
10 files changed, 1025 insertions(+), 126 deletions(-)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index ba5851c6c82..a166b67d7cf 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -80,6 +80,8 @@ struct ir3 * ir3_create(struct ir3_compiler *compiler,
shader->noutputs = nout;
shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout);
+ list_inithead(&shader->block_list);
+
return shader;
}
@@ -548,7 +550,6 @@ static int (*emit[])(struct ir3_instruction *instr, void *ptr,
void * ir3_assemble(struct ir3 *shader, struct ir3_info *info,
uint32_t gpu_id)
{
- struct ir3_block *block = shader->block;
uint32_t *ptr, *dwords;
info->gpu_id = gpu_id;
@@ -558,8 +559,10 @@ void * ir3_assemble(struct ir3 *shader, struct ir3_info *info,
info->instrs_count = 0;
info->sizedwords = 0;
- list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
- info->sizedwords += 2;
+ list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ info->sizedwords += 2;
+ }
}
/* need an integer number of instruction "groups" (sets of 16
@@ -574,12 +577,14 @@ void * ir3_assemble(struct ir3 *shader, struct ir3_info *info,
ptr = dwords = calloc(4, info->sizedwords);
- list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
- int ret = emit[instr->category](instr, dwords, info);
- if (ret)
- goto fail;
- info->instrs_count += 1 + instr->repeat;
- dwords += 2;
+ list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ int ret = emit[instr->category](instr, dwords, info);
+ if (ret)
+ goto fail;
+ info->instrs_count += 1 + instr->repeat;
+ dwords += 2;
+ }
}
return ptr;
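Since each instruction encodes to 64 bits, the sizing loop above counts 2 dwords per instruction; a quick worked example of the group padding mentioned in the context line above (a sketch, assuming the 16-instruction groups named there):

	unsigned ninstr = 10;                     /* instructions emitted */
	unsigned sizedwords = 2 * ninstr;         /* 20 dwords, unpadded */
	sizedwords = 2 * ((ninstr + 15) & ~15);   /* padded to a whole group: 32 */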
@@ -617,7 +622,12 @@ static void insert_instr(struct ir3_block *block,
struct ir3_block * ir3_block_create(struct ir3 *shader)
{
struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
+#ifdef DEBUG
+ static uint32_t serialno = 0;
+ block->serialno = ++serialno;
+#endif
block->shader = shader;
+ list_inithead(&block->node);
list_inithead(&block->instr_list);
return block;
}
@@ -688,10 +698,40 @@ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
int num, int flags)
{
- struct ir3_register *reg = reg_create(instr->block->shader, num, flags);
+ struct ir3 *shader = instr->block->shader;
+ struct ir3_register *reg = reg_create(shader, num, flags);
#ifdef DEBUG
debug_assert(instr->regs_count < instr->regs_max);
#endif
instr->regs[instr->regs_count++] = reg;
return reg;
}
+
+void
+ir3_block_clear_mark(struct ir3_block *block)
+{
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
+ instr->flags &= ~IR3_INSTR_MARK;
+}
+
+void
+ir3_clear_mark(struct ir3 *ir)
+{
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ ir3_block_clear_mark(block);
+ }
+}
+
+/* note: this will destroy instr->depth, don't do it until after sched! */
+void
+ir3_count_instructions(struct ir3 *ir)
+{
+ unsigned ip = 0;
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ instr->ip = ip++;
+ }
+ block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip;
+ block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip;
+ }
+}
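The ip numbering and per-block start_ip/end_ip computed here give later passes a cheap range test. A minimal sketch, assuming ir3_count_instructions() has just run; instr_in_block() is a hypothetical helper, not part of the patch:

	static inline bool
	instr_in_block(struct ir3_instruction *instr, struct ir3_block *block)
	{
		/* valid only after ir3_count_instructions() numbers the ir: */
		return (instr->ip >= block->start_ip) &&
			(instr->ip <= block->end_ip);
	}

The RA changes further down use exactly these fields to extend live ranges across block boundaries.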
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 95b866988b8..9c35a763d58 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -83,7 +83,8 @@ struct ir3_register {
* before register assignment is done:
*/
IR3_REG_SSA = 0x2000, /* 'instr' is ptr to assigning instr */
- IR3_REG_IA = 0x4000, /* meta-input dst is "assigned" */
+ IR3_REG_PHI_SRC= 0x4000, /* phi src, regs[0]->instr points to phi */
+
} flags;
union {
/* normal registers:
@@ -187,6 +188,7 @@ struct ir3_instruction {
char inv;
char comp;
int immed;
+ struct ir3_block *target;
} cat0;
struct {
type_t src_type, dst_type;
@@ -220,14 +222,14 @@ struct ir3_instruction {
int aid;
} fi;
struct {
- struct ir3_block *if_block, *else_block;
- } flow;
+ /* used to temporarily hold reference to nir_phi_instr
+ * until we resolve the phi srcs
+ */
+ void *nphi;
+ } phi;
struct {
struct ir3_block *block;
} inout;
-
- /* XXX keep this as big as all other union members! */
- uint32_t info[3];
};
/* transient values used during various algorithms: */
@@ -363,16 +365,40 @@ struct ir3 {
unsigned predicates_count, predicates_sz;
struct ir3_instruction **predicates;
- struct ir3_block *block;
+ /* List of blocks: */
+ struct list_head block_list;
+
unsigned heap_idx;
struct ir3_heap_chunk *chunk;
};
+typedef struct nir_block nir_block;
+
struct ir3_block {
+ struct list_head node;
struct ir3 *shader;
- /* only a single address register: */
- struct ir3_instruction *address;
- struct list_head instr_list;
+
+ nir_block *nblock;
+
+ struct list_head instr_list; /* list of ir3_instruction */
+
+ /* each block has either one or two successors.. in case of
+ * two successors, 'condition' decides which one to follow.
+ * A block preceding an if/else has two successors.
+ */
+ struct ir3_instruction *condition;
+ struct ir3_block *successors[2];
+
+ uint16_t start_ip, end_ip;
+
+ /* used for per-pass extra block data. Mainly used right
+ * now in RA step to track livein/liveout.
+ */
+ void *bd;
+
+#ifdef DEBUG
+ uint32_t serialno;
+#endif
};
struct ir3 * ir3_create(struct ir3_compiler *compiler,
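The invariant encoded in the new struct: a block with successors[1] set ends in a conditional split decided by block->condition, otherwise successors[0] (if present) is an unconditional edge. A minimal sketch of a pass walking the edges (visit_edges() is hypothetical; iterating block_list avoids needing a visited set, which chasing successors would require since loops create back edges):

	static void
	visit_edges(struct ir3 *ir)
	{
		list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
			if (block->successors[1]) {
				/* conditional: block->condition picks between
				 * successors[0] (then) and successors[1] (else)
				 */
			} else if (block->successors[0]) {
				/* unconditional edge to the next block */
			}
		}
	}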
@@ -394,7 +420,6 @@ const char *ir3_instr_name(struct ir3_instruction *instr);
struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
int num, int flags);
-
static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
{
if (instr->flags & IR3_INSTR_MARK)
@@ -403,19 +428,10 @@ static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
return false;
}
-static inline void ir3_clear_mark(struct ir3 *shader)
-{
- /* TODO would be nice to drop the instruction array.. for
- * new compiler, _clear_mark() is all we use it for, and
- * we could probably manage a linked list instead..
- *
- * Also, we'll probably want to mark instructions within
- * a block, so tracking the list of instrs globally is
- * unlikely to be what we want.
- */
- list_for_each_entry (struct ir3_instruction, instr, &shader->block->instr_list, node)
- instr->flags &= ~IR3_INSTR_MARK;
-}
+void ir3_block_clear_mark(struct ir3_block *block);
+void ir3_clear_mark(struct ir3 *shader);
+
+void ir3_count_instructions(struct ir3 *ir);
static inline int ir3_instr_regno(struct ir3_instruction *instr,
struct ir3_register *reg)
@@ -593,6 +609,22 @@ static inline bool reg_gpr(struct ir3_register *r)
return true;
}
+static inline type_t half_type(type_t type)
+{
+ switch (type) {
+ case TYPE_F32: return TYPE_F16;
+ case TYPE_U32: return TYPE_U16;
+ case TYPE_S32: return TYPE_S16;
+ case TYPE_F16:
+ case TYPE_U16:
+ case TYPE_S16:
+ return type;
+ default:
+ assert(0);
+ return ~0;
+ }
+}
+
/* some cat2 instructions (ie. those which are not float) can embed an
* immediate:
*/
@@ -837,6 +869,15 @@ ir3_NOP(struct ir3_block *block)
return ir3_instr_create(block, 0, OPC_NOP);
}
+#define INSTR0(CAT, name) \
+static inline struct ir3_instruction * \
+ir3_##name(struct ir3_block *block) \
+{ \
+ struct ir3_instruction *instr = \
+ ir3_instr_create(block, CAT, OPC_##name); \
+ return instr; \
+}
+
#define INSTR1(CAT, name) \
static inline struct ir3_instruction * \
ir3_##name(struct ir3_block *block, \
@@ -880,7 +921,10 @@ ir3_##name(struct ir3_block *block, \
}
/* cat0 instructions: */
+INSTR0(0, BR);
+INSTR0(0, JUMP);
INSTR1(0, KILL);
+INSTR0(0, END);
/* cat2 instructions, most 2 src but some 1 src: */
INSTR2(2, ADD_F)
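For reference, INSTR0(0, BR) above expands to a zero-source helper along these lines; callers then fill in the cat0 fields, as the scheduler does later in this patch:

	static inline struct ir3_instruction *
	ir3_BR(struct ir3_block *block)
	{
		struct ir3_instruction *instr =
			ir3_instr_create(block, 0, OPC_BR);
		return instr;
	}

	/* typical use: br = ir3_BR(block); br->cat0.target = succ_block; */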
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index f62a5ec2b26..4165e2d6aa7 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -48,8 +48,6 @@
#include "ir3.h"
-static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
-
struct ir3_compile {
struct ir3_compiler *compiler;
@@ -62,7 +60,10 @@ struct ir3_compile {
/* bitmask of which samplers are integer: */
uint16_t integer_s;
- struct ir3_block *block;
+ struct ir3_block *block; /* the current block */
+ struct ir3_block *in_block; /* block created for shader inputs */
+
+ nir_function_impl *impl;
/* For fragment shaders, from the hw perspective the only
* actual input is r0.xy position register passed to bary.f.
@@ -94,6 +95,11 @@ struct ir3_compile {
*/
struct hash_table *addr_ht;
+ /* maps nir_block to ir3_block, mostly for the purposes of
+ * figuring out the block's successors
+ */
+ struct hash_table *block_ht;
+
/* for calculating input/output positions/linkages: */
unsigned next_inloc;
@@ -120,6 +126,9 @@ struct ir3_compile {
};
+static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
+static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock);
+
static struct nir_shader *to_nir(const struct tgsi_token *tokens)
{
struct nir_shader_compiler_options options = {
@@ -148,6 +157,7 @@ static struct nir_shader *to_nir(const struct tgsi_token *tokens)
nir_lower_vars_to_ssa(s);
nir_lower_alu_to_scalar(s);
+ nir_lower_phis_to_scalar(s);
progress |= nir_copy_prop(s);
progress |= nir_opt_dce(s);
@@ -244,6 +254,8 @@ compile_init(struct ir3_compiler *compiler,
_mesa_hash_pointer, _mesa_key_pointer_equal);
ctx->addr_ht = _mesa_hash_table_create(ctx,
_mesa_hash_pointer, _mesa_key_pointer_equal);
+ ctx->block_ht = _mesa_hash_table_create(ctx,
+ _mesa_hash_pointer, _mesa_key_pointer_equal);
lowered_tokens = lower_tgsi(ctx, tokens, so);
if (!lowered_tokens)
@@ -287,33 +299,206 @@ compile_free(struct ir3_compile *ctx)
ralloc_free(ctx);
}
-
+/* global per-array information: */
struct ir3_array {
unsigned length, aid;
+};
+
+/* per-block array state: */
+struct ir3_array_value {
+ /* TODO drop length/aid, and just have ptr back to ir3_array */
+ unsigned length, aid;
+ /* initial array element values are phi's, other than for the
+ * entry block. The phi src's get added later in a resolve step
+ * after we have visited all the blocks, to account for back
+ * edges in the cfg.
+ */
+ struct ir3_instruction **phis;
+ /* current array element values (as block is processed). When
+ * the array phi's are resolved, it will contain the array state
+ * at exit of block, so successor blocks can use it to add their
+ * phi srcs.
+ */
struct ir3_instruction *arr[];
};
+/* track array assignments per basic block. When an array is read
+ * outside of the same basic block, we can use NIR's dominance-frontier
+ * information to figure out where phi nodes are needed.
+ */
+struct ir3_nir_block_data {
+ unsigned foo;
+ /* indexed by array-id (aid): */
+ struct ir3_array_value *arrs[];
+};
+
+static struct ir3_nir_block_data *
+get_block_data(struct ir3_compile *ctx, struct ir3_block *block)
+{
+ if (!block->bd) {
+ struct ir3_nir_block_data *bd = ralloc_size(ctx, sizeof(*bd) +
+ ((ctx->num_arrays + 1) * sizeof(bd->arrs[0])));
+ block->bd = bd;
+ }
+ return block->bd;
+}
+
static void
declare_var(struct ir3_compile *ctx, nir_variable *var)
{
unsigned length = glsl_get_length(var->type) * 4; /* always vec4, at least with ttn */
- struct ir3_array *arr = ralloc_size(ctx, sizeof(*arr) +
- (length * sizeof(arr->arr[0])));
+ struct ir3_array *arr = ralloc(ctx, struct ir3_array);
arr->length = length;
arr->aid = ++ctx->num_arrays;
- /* Some shaders end up reading array elements without first writing..
- * so initialize things to prevent null instr ptrs later:
- */
- for (unsigned i = 0; i < length; i++)
- arr->arr[i] = create_immed(ctx->block, 0);
_mesa_hash_table_insert(ctx->var_ht, var, arr);
}
-static struct ir3_array *
+static nir_block *
+nir_block_pred(nir_block *block)
+{
+ assert(block->predecessors->entries < 2);
+ if (block->predecessors->entries == 0)
+ return NULL;
+ return (nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key;
+}
+
+static struct ir3_array_value *
get_var(struct ir3_compile *ctx, nir_variable *var)
{
struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var);
- return entry->data;
+ struct ir3_block *block = ctx->block;
+ struct ir3_nir_block_data *bd = get_block_data(ctx, block);
+ struct ir3_array *arr = entry->data;
+
+ if (!bd->arrs[arr->aid]) {
+ struct ir3_array_value *av = ralloc_size(bd, sizeof(*av) +
+ (arr->length * sizeof(av->arr[0])));
+ struct ir3_array_value *defn = NULL;
+ nir_block *pred_block;
+
+ av->length = arr->length;
+ av->aid = arr->aid;
+
+ /* For loops, we have to consider that we have not visited some
+ * of the blocks who should feed into the phi (ie. back-edges in
+ * the cfg).. for example:
+ *
+ * loop {
+ * block { load_var; ... }
+ * if then block {} else block {}
+ * block { store_var; ... }
+ * if then block {} else block {}
+ * block {...}
+ * }
+ *
+ * We can skip the phi if we can chase the block predecessors
+ * until finding the block previously defining the array without
+ * crossing a block that has more than one predecessor.
+ *
+ * Otherwise create phi's and resolve them as a post-pass after
+ * all the blocks have been visited (to handle back-edges).
+ */
+
+ for (pred_block = block->nblock;
+ pred_block && (pred_block->predecessors->entries < 2) && !defn;
+ pred_block = nir_block_pred(pred_block)) {
+ struct ir3_block *pblock = get_block(ctx, pred_block);
+ struct ir3_nir_block_data *pbd = pblock->bd;
+ if (!pbd)
+ continue;
+ defn = pbd->arrs[arr->aid];
+ }
+
+ if (defn) {
+ /* only one possible definer: */
+ for (unsigned i = 0; i < arr->length; i++)
+ av->arr[i] = defn->arr[i];
+ } else if (pred_block) {
+ /* not the first block, and multiple potential definers: */
+ av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0]));
+
+ for (unsigned i = 0; i < arr->length; i++) {
+ struct ir3_instruction *phi;
+
+ phi = ir3_instr_create2(block, -1, OPC_META_PHI,
+ 1 + ctx->impl->num_blocks);
+ ir3_reg_create(phi, 0, 0); /* dst */
+
+ /* phi's should go at head of block: */
+ list_delinit(&phi->node);
+ list_add(&phi->node, &block->instr_list);
+
+ av->phis[i] = av->arr[i] = phi;
+ }
+ } else {
+ /* Some shaders end up reading array elements without
+ * first writing.. so initialize things to prevent null
+ * instr ptrs later:
+ */
+ for (unsigned i = 0; i < arr->length; i++)
+ av->arr[i] = create_immed(block, 0);
+ }
+
+ bd->arrs[arr->aid] = av;
+ }
+
+ return bd->arrs[arr->aid];
+}
+
+static void
+add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock,
+ struct ir3_array_value *av, BITSET_WORD *visited)
+{
+ struct ir3_block *block;
+ struct ir3_nir_block_data *bd;
+
+ if (BITSET_TEST(visited, nblock->index))
+ return;
+
+ BITSET_SET(visited, nblock->index);
+
+ block = get_block(ctx, nblock);
+ bd = block->bd;
+
+ if (bd && bd->arrs[av->aid]) {
+ struct ir3_array_value *dav = bd->arrs[av->aid];
+ for (unsigned i = 0; i < av->length; i++) {
+ ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr =
+ dav->arr[i];
+ }
+ } else {
+ /* didn't find defn, recurse predecessors: */
+ struct set_entry *entry;
+ set_foreach(nblock->predecessors, entry) {
+ add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
+ }
+ }
+}
+
+static void
+resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block)
+{
+ struct ir3_nir_block_data *bd = block->bd;
+ unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks);
+
+ if (!bd)
+ return;
+
+ /* TODO use nir dom_frontier to help us with this? */
+
+ for (unsigned i = 1; i <= ctx->num_arrays; i++) {
+ struct ir3_array_value *av = bd->arrs[i];
+ BITSET_WORD visited[bitset_words];
+ struct set_entry *entry;
+
+ if (!(av && av->phis))
+ continue;
+
+ memset(visited, 0, sizeof(visited));
+ set_foreach(block->nblock->predecessors, entry) {
+ add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
+ }
+ }
}
/* allocate an n-element value array (to be populated by caller) and
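A worked example of the chase-vs-phi rule implemented above (block shapes illustrative, not from the patch):

	/* straight-line: the single-predecessor chase finds the defining
	 * block, so the value is reused with no phi:
	 *
	 *   block0 { store_var v[0]; }    <- defines the array
	 *   block1 { load_var v[0]; }     <- one pred; chase reaches block0
	 *
	 * but after an if/else both arms may define the array; the join
	 * block has two predecessors, the chase stops, and one
	 * OPC_META_PHI is created per array element, with srcs filled in
	 * by the resolve step once all blocks have been visited:
	 *
	 *   block1 { store_var v[0]; }   block2 { store_var v[0]; }
	 *   block3 { load_var v[0]; }    <- two preds -> phi per element
	 */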
@@ -417,6 +602,22 @@ get_addr(struct ir3_compile *ctx, struct ir3_instruction *src)
}
static struct ir3_instruction *
+get_predicate(struct ir3_compile *ctx, struct ir3_instruction *src)
+{
+ struct ir3_block *b = ctx->block;
+ struct ir3_instruction *cond;
+
+ /* NOTE: only cmps.*.* can write p0.x: */
+ cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0);
+ cond->cat2.condition = IR3_COND_NE;
+
+ /* condition always goes in predicate register: */
+ cond->regs[0]->num = regid(REG_P0, 0);
+
+ return cond;
+}
+
+static struct ir3_instruction *
create_uniform(struct ir3_compile *ctx, unsigned n)
{
struct ir3_instruction *mov;
@@ -1029,7 +1230,7 @@ emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
{
nir_deref_var *dvar = intr->variables[0];
nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
- struct ir3_array *arr = get_var(ctx, dvar->var);
+ struct ir3_array_value *arr = get_var(ctx, dvar->var);
compile_assert(ctx, dvar->deref.child &&
(dvar->deref.child->deref_type == nir_deref_type_array));
@@ -1069,7 +1270,7 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
{
nir_deref_var *dvar = intr->variables[0];
nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
- struct ir3_array *arr = get_var(ctx, dvar->var);
+ struct ir3_array_value *arr = get_var(ctx, dvar->var);
struct ir3_instruction **src;
compile_assert(ctx, dvar->deref.child &&
@@ -1245,6 +1446,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
cond = create_immed(b, 1);
}
+ /* NOTE: only cmps.*.* can write p0.x: */
cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
cond->cat2.condition = IR3_COND_NE;
@@ -1558,6 +1760,71 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
}
static void
+emit_phi(struct ir3_compile *ctx, nir_phi_instr *nphi)
+{
+ struct ir3_instruction *phi, **dst;
+
+ /* NOTE: phi's should be lowered to scalar at this point */
+ compile_assert(ctx, nphi->dest.ssa.num_components == 1);
+
+ dst = get_dst(ctx, &nphi->dest, 1);
+
+ phi = ir3_instr_create2(ctx->block, -1, OPC_META_PHI,
+ 1 + exec_list_length(&nphi->srcs));
+ ir3_reg_create(phi, 0, 0); /* dst */
+ phi->phi.nphi = nphi;
+
+ dst[0] = phi;
+}
+
+/* phi instructions are left partially constructed. We don't resolve
+ * their srcs until the end of the block, since (eg. loops) one of
+ * the phi's srcs might be defined after the phi due to back edges in
+ * the CFG.
+ */
+static void
+resolve_phis(struct ir3_compile *ctx, struct ir3_block *block)
+{
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ nir_phi_instr *nphi;
+
+ /* phi's only come at start of block: */
+ if (!(is_meta(instr) && (instr->opc == OPC_META_PHI)))
+ break;
+
+ if (!instr->phi.nphi)
+ break;
+
+ nphi = instr->phi.nphi;
+ instr->phi.nphi = NULL;
+
+ foreach_list_typed(nir_phi_src, nsrc, node, &nphi->srcs) {
+ struct ir3_instruction *src = get_src(ctx, &nsrc->src)[0];
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+ }
+ }
+
+ resolve_array_phis(ctx, block);
+}
+
+static void
+emit_jump(struct ir3_compile *ctx, nir_jump_instr *jump)
+{
+ switch (jump->type) {
+ case nir_jump_break:
+ case nir_jump_continue:
+ /* I *think* we can simply ignore this, and use the
+ * successor block link to figure out where we need to
+ * jump to for break/continue
+ */
+ break;
+ default:
+ compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
+ break;
+ }
+}
+
+static void
emit_instr(struct ir3_compile *ctx, nir_instr *instr)
{
switch (instr->type) {
@@ -1590,45 +1857,112 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr)
}
break;
}
- case nir_instr_type_call:
- case nir_instr_type_jump:
case nir_instr_type_phi:
+ emit_phi(ctx, nir_instr_as_phi(instr));
+ break;
+ case nir_instr_type_jump:
+ emit_jump(ctx, nir_instr_as_jump(instr));
+ break;
+ case nir_instr_type_call:
case nir_instr_type_parallel_copy:
compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
break;
}
}
+static struct ir3_block *
+get_block(struct ir3_compile *ctx, nir_block *nblock)
+{
+ struct ir3_block *block;
+ struct hash_entry *entry;
+ entry = _mesa_hash_table_search(ctx->block_ht, nblock);
+ if (entry)
+ return entry->data;
+
+ block = ir3_block_create(ctx->ir);
+ block->nblock = nblock;
+ _mesa_hash_table_insert(ctx->block_ht, nblock, block);
+
+ return block;
+}
+
static void
-emit_block(struct ir3_compile *ctx, nir_block *block)
+emit_block(struct ir3_compile *ctx, nir_block *nblock)
{
- nir_foreach_instr(block, instr) {
+ struct ir3_block *block = get_block(ctx, nblock);
+
+ for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
+ if (nblock->successors[i]) {
+ block->successors[i] =
+ get_block(ctx, nblock->successors[i]);
+ }
+ }
+
+ ctx->block = block;
+ list_addtail(&block->node, &ctx->ir->block_list);
+
+ nir_foreach_instr(nblock, instr) {
emit_instr(ctx, instr);
if (ctx->error)
return;
}
}
+static void emit_cf_list(struct ir3_compile *ctx, struct exec_list *list);
+
static void
-emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
+emit_if(struct ir3_compile *ctx, nir_if *nif)
+{
+ struct ir3_instruction *condition = get_src(ctx, &nif->condition)[0];
+
+ ctx->block->condition =
+ get_predicate(ctx, ir3_b2n(condition->block, condition));
+
+ emit_cf_list(ctx, &nif->then_list);
+ emit_cf_list(ctx, &nif->else_list);
+}
+
+static void
+emit_loop(struct ir3_compile *ctx, nir_loop *nloop)
+{
+ emit_cf_list(ctx, &nloop->body);
+}
+
+static void
+emit_cf_list(struct ir3_compile *ctx, struct exec_list *list)
{
- foreach_list_typed(nir_cf_node, node, node, &impl->body) {
+ foreach_list_typed(nir_cf_node, node, node, list) {
switch (node->type) {
case nir_cf_node_block:
emit_block(ctx, nir_cf_node_as_block(node));
break;
case nir_cf_node_if:
+ emit_if(ctx, nir_cf_node_as_if(node));
+ break;
case nir_cf_node_loop:
+ emit_loop(ctx, nir_cf_node_as_loop(node));
+ break;
case nir_cf_node_function:
compile_error(ctx, "TODO\n");
break;
}
- if (ctx->error)
- return;
}
}
static void
+emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
+{
+ emit_cf_list(ctx, &impl->body);
+ emit_block(ctx, impl->end_block);
+
+ /* at this point, we should have a single empty block,
+ * into which we emit the 'end' instruction.
+ */
+ compile_assert(ctx, list_empty(&ctx->block->instr_list));
+ ir3_END(ctx->block);
+}
+
+static void
setup_input(struct ir3_compile *ctx, nir_variable *in)
{
struct ir3_shader_variant *so = ctx->so;
@@ -1787,8 +2121,19 @@ setup_output(struct ir3_compile *ctx, nir_variable *out)
static void
emit_instructions(struct ir3_compile *ctx)
{
- unsigned ninputs = exec_list_length(&ctx->s->inputs) * 4;
- unsigned noutputs = exec_list_length(&ctx->s->outputs) * 4;
+ unsigned ninputs, noutputs;
+ nir_function_impl *fxn = NULL;
+
+ /* Find the main function: */
+ nir_foreach_overload(ctx->s, overload) {
+ compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
+ compile_assert(ctx, overload->impl);
+ fxn = overload->impl;
+ break;
+ }
+
+ ninputs = exec_list_length(&ctx->s->inputs) * 4;
+ noutputs = exec_list_length(&ctx->s->outputs) * 4;
/* we need to allocate big enough outputs array so that
* we can stuff the kill's at the end. Likewise for vtx
@@ -1801,8 +2146,11 @@ emit_instructions(struct ir3_compile *ctx)
}
ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
- ctx->block = ir3_block_create(ctx->ir);
- ctx->ir->block = ctx->block;
+
+ /* Create inputs in first block: */
+ ctx->block = get_block(ctx, fxn->start_block);
+ ctx->in_block = ctx->block;
+ list_addtail(&ctx->block->node, &ctx->ir->block_list);
if (ctx->so->type == SHADER_FRAGMENT) {
ctx->ir->noutputs -= ARRAY_SIZE(ctx->kill);
@@ -1838,13 +2186,12 @@ emit_instructions(struct ir3_compile *ctx)
declare_var(ctx, var);
}
- /* Find the main function and emit the body: */
- nir_foreach_overload(ctx->s, overload) {
- compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
- compile_assert(ctx, overload->impl);
- emit_function(ctx, overload->impl);
- if (ctx->error)
- return;
+ /* And emit the body: */
+ ctx->impl = fxn;
+ emit_function(ctx, fxn);
+
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ resolve_phis(ctx, block);
}
}
@@ -1906,13 +2253,13 @@ fixup_frag_inputs(struct ir3_compile *ctx)
so->pos_regid = regid;
/* r0.x */
- instr = create_input(ctx->block, NULL, ir->ninputs);
+ instr = create_input(ctx->in_block, NULL, ir->ninputs);
instr->regs[0]->num = regid++;
inputs[ir->ninputs++] = instr;
ctx->frag_pos->regs[1]->instr = instr;
/* r0.y */
- instr = create_input(ctx->block, NULL, ir->ninputs);
+ instr = create_input(ctx->in_block, NULL, ir->ninputs);
instr->regs[0]->num = regid++;
inputs[ir->ninputs++] = instr;
ctx->frag_pos->regs[2]->instr = instr;
@@ -1998,6 +2345,10 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
out = out->regs[1]->instr;
out->regs[0]->flags |= IR3_REG_HALF;
}
+
+ if (out->category == 1) {
+ out->cat1.dst_type = half_type(out->cat1.dst_type);
+ }
}
}
@@ -2058,6 +2409,11 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
ir3_legalize(ir, &so->has_samp, &max_bary);
+ if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+ printf("AFTER LEGALIZE:\n");
+ ir3_print(ir);
+ }
+
/* fixup input/outputs: */
for (i = 0; i < so->outputs_count; i++) {
so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num;
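Putting the control-flow pieces together, an illustrative sketch of what emit_cf_list()/emit_function() produce for a simple if/else (block numbering hypothetical):

	/*   if (c) { x = foo(); }        block0 { ...; condition = p0.x }
	 *   else   { x = bar(); }        block0 succs: {block1, block2}
	 *   use(x);                      block1 { foo } -> block3
	 *                                block2 { bar } -> block3
	 *                                block3 { phi x; use } -> block4
	 *                                block4 { end }  (impl->end_block)
	 */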
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index a477bd4b237..8c7c80f7aae 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -54,6 +54,13 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
/* TODO: remove this hack: */
if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO))
return false;
+ /* TODO: we currently don't handle left/right neighbors
+ * very well when inserting parallel-copies into phi..
+ * to avoid problems don't eliminate a mov coming out
+ * of phi..
+ */
+ if (is_meta(src_instr) && (src_instr->opc == OPC_META_PHI))
+ return false;
return true;
}
return false;
@@ -390,7 +397,7 @@ instr_cp(struct ir3_instruction *instr, unsigned *flags)
void
ir3_cp(struct ir3 *ir)
{
- ir3_clear_mark(ir->block->shader);
+ ir3_clear_mark(ir);
for (unsigned i = 0; i < ir->noutputs; i++) {
if (ir->outputs[i]) {
@@ -400,4 +407,9 @@ ir3_cp(struct ir3 *ir)
ir->outputs[i] = out;
}
}
+
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ if (block->condition)
+ block->condition = instr_cp(block->condition, NULL);
+ }
}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
index 6fc8b1762ff..3a108243479 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -134,6 +134,8 @@ remove_unused_by_block(struct ir3_block *block)
{
list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
if (!ir3_instr_check_mark(instr)) {
+ if (is_flow(instr) && (instr->opc == OPC_END))
+ continue;
/* mark it, in case it is input, so we can
* remove unused inputs:
*/
@@ -149,13 +151,21 @@ ir3_depth(struct ir3 *ir)
{
unsigned i;
- ir3_clear_mark(ir->block->shader);
+ ir3_clear_mark(ir);
for (i = 0; i < ir->noutputs; i++)
if (ir->outputs[i])
ir3_instr_depth(ir->outputs[i]);
+ /* We also need to account for the if-condition: */
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ if (block->condition)
+ ir3_instr_depth(block->condition);
+ }
+
/* mark un-used instructions: */
- remove_unused_by_block(ir->block);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ remove_unused_by_block(block);
+ }
/* cleanup unused inputs: */
for (i = 0; i < ir->ninputs; i++) {
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c
index 1fe09cc11e5..70d9b08e019 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_group.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_group.c
@@ -116,6 +116,10 @@ restart:
conflict = conflicts(instr->cp.left, left) ||
conflicts(instr->cp.right, right);
+ /* RA can't yet deal very well w/ group'd phi's: */
+ if (is_meta(instr) && (instr->opc == OPC_META_PHI))
+ conflict = true;
+
/* we also can't have an instr twice in the group: */
for (j = i + 1; (j < n) && !conflict; j++)
if (ops->get(arr, j) == instr)
@@ -226,7 +230,6 @@ find_neighbors(struct ir3 *ir)
for (i = 0; i < ir->noutputs; i += 4)
group_n(&arr_ops_out, &ir->outputs[i], 4);
-
for (i = 0; i < ir->noutputs; i++) {
if (ir->outputs[i]) {
struct ir3_instruction *instr = ir->outputs[i];
@@ -238,6 +241,6 @@ find_neighbors(struct ir3 *ir)
void
ir3_group(struct ir3 *ir)
{
- ir3_clear_mark(ir->block->shader);
+ ir3_clear_mark(ir);
find_neighbors(ir);
}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index 34055f4c612..f4a4223ae17 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -42,15 +42,28 @@
*/
struct ir3_legalize_ctx {
- struct ir3_block *block;
bool has_samp;
int max_bary;
};
+/* We want to evaluate each block from the position of each of its
+ * predecessor blocks, so that the flags we set are the union of all
+ * possible program paths. As a stopping condition, we stop once the
+ * pair <pred-block, current-block> has already been visited.
+ *
+ * XXX is that completely true? We could have different needs_xyz
+ * flags set depending on path leading to pred-block.. we could
+ * do *most* of this based on chasing src instructions ptrs (and
+ * following all phi srcs).. except the write-after-read hazard.
+ *
+ * For now we just set ss/sy flag on first instruction on block,
+ * and handle everything within the block as before.
+ */
+
static void
-legalize(struct ir3_legalize_ctx *ctx)
+legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
{
- struct ir3_block *block = ctx->block;
struct ir3_instruction *last_input = NULL;
struct ir3_instruction *last_rel = NULL;
struct list_head instr_list;
@@ -203,6 +216,7 @@ legalize(struct ir3_legalize_ctx *ctx)
ir3_reg_create(baryf, regid(0, 0), 0);
/* insert the dummy bary.f after last_input: */
+ list_delinit(&baryf->node);
list_add(&baryf->node, &last_input->node);
last_input = baryf;
@@ -213,23 +227,177 @@ legalize(struct ir3_legalize_ctx *ctx)
if (last_rel)
last_rel->flags |= IR3_INSTR_UL;
- /* create/add 'end' instruction: */
- ir3_instr_create(block, 0, OPC_END);
-
list_first_entry(&block->instr_list, struct ir3_instruction, node)
->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
}
+/* NOTE: branch instructions are always the last instruction(s)
+ * in the block. We take advantage of this as we resolve the
+ * branches, since "if (foo) break;" constructs turn into
+ * something like:
+ *
+ * block3 {
+ * ...
+ * 0029:021: mov.s32s32 r62.x, r1.y
+ * 0082:022: br !p0.x, target=block5
+ * 0083:023: br p0.x, target=block4
+ * // succs: if _[0029:021: mov.s32s32] block4; else block5;
+ * }
+ * block4 {
+ * 0084:024: jump, target=block6
+ * // succs: block6;
+ * }
+ * block5 {
+ * 0085:025: jump, target=block7
+ * // succs: block7;
+ * }
+ *
+ * ie. the only instruction in block4/block5 is a jump, so when
+ * resolving branches we can easily detect this by checking
+ * that the first instruction in the target block is itself
+ * a jump, and setup the br directly to the jump's target
+ * (and strip back out the now unreached jump)
+ *
+ * TODO sometimes we end up with things like:
+ *
+ * br !p0.x, #2
+ * br p0.x, #12
+ * add.u r0.y, r0.y, 1
+ *
+ * If we swapped the order of the branches, we could drop one.
+ */
+static struct ir3_block *
+resolve_dest_block(struct ir3_block *block)
+{
+ /* special case for last block: */
+ if (!block->successors[0])
+ return block;
+
+ /* NOTE that we may or may not have inserted the jump
+ * in the target block yet, so conditions to resolve
+ * the dest to the dest block's successor are:
+ *
+ * (1) successor[1] == NULL &&
+ * (2) (block-is-empty || only-instr-is-jump)
+ */
+ if (block->successors[1] == NULL) {
+ if (list_empty(&block->instr_list)) {
+ return block->successors[0];
+ } else if (list_length(&block->instr_list) == 1) {
+ struct ir3_instruction *instr = list_first_entry(
+ &block->instr_list, struct ir3_instruction, node);
+ if (is_flow(instr) && (instr->opc == OPC_JUMP))
+ return block->successors[0];
+ }
+ }
+ return block;
+}
+
+static bool
+resolve_jump(struct ir3_instruction *instr)
+{
+ struct ir3_block *tblock =
+ resolve_dest_block(instr->cat0.target);
+ struct ir3_instruction *target;
+
+ if (tblock != instr->cat0.target) {
+ list_delinit(&instr->cat0.target->node);
+ instr->cat0.target = tblock;
+ return true;
+ }
+
+ target = list_first_entry(&tblock->instr_list,
+ struct ir3_instruction, node);
+
+ if ((!target) || (target->ip == (instr->ip + 1))) {
+ list_delinit(&instr->node);
+ return true;
+ } else {
+ instr->cat0.immed =
+ (int)target->ip - (int)instr->ip;
+ }
+ return false;
+}
+
+/* resolve jumps, removing jumps/branches to the immediately
+ * following instruction, which we end up with from earlier stages.
+ * removing an instruction can invalidate earlier instruction's
+ * branch offsets, we need to do this iteratively until no more
+ * branches are removed.
+ */
+static bool
+resolve_jumps(struct ir3 *ir)
+{
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
+ if (is_flow(instr) && instr->cat0.target)
+ if (resolve_jump(instr))
+ return true;
+
+ return false;
+}
+
+/* we want to mark points where divergent flow control re-converges
+ * with (jp) flags. For now, since we don't do any optimization for
+ * things that start out as a 'do {} while()', re-convergence points
+ * will always be a branch or jump target. Note that this is overly
+ * conservative, since unconditional jump targets are not convergence
+ * points; we are just assuming that the other path to reach the jump
+ * target was divergent. If we were clever enough to optimize the
+ * jump at end of a loop back to a conditional branch into a single
+ * conditional branch, ie. like:
+ *
+ * add.f r1.w, r0.x, (neg)(r)c2.x <= loop start
+ * mul.f r1.z, r1.z, r0.x
+ * mul.f r1.y, r1.y, r0.x
+ * mul.f r0.z, r1.x, r0.x
+ * mul.f r0.w, r0.y, r0.x
+ * cmps.f.ge r0.x, (r)c2.y, (r)r1.w
+ * add.s r0.x, (r)r0.x, (r)-1
+ * sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x
+ * cmps.f.eq p0.x, r0.x, c3.y
+ * mov.f32f32 r0.x, r1.w
+ * mov.f32f32 r0.y, r0.w
+ * mov.f32f32 r1.x, r0.z
+ * (rpt2)nop
+ * br !p0.x, #-13
+ * (jp)mul.f r0.x, c263.y, r1.y
+ *
+ * Then we'd have to be more clever, as the convergence point is no
+ * longer a branch or jump target.
+ */
+static void
+mark_convergence_points(struct ir3 *ir)
+{
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ if (is_flow(instr) && instr->cat0.target) {
+ struct ir3_instruction *target =
+ list_first_entry(&instr->cat0.target->instr_list,
+ struct ir3_instruction, node);
+ target->flags |= IR3_INSTR_JP;
+ }
+ }
+ }
+}
+
void
ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary)
{
struct ir3_legalize_ctx ctx = {
- .block = ir->block,
.max_bary = -1,
};
- legalize(&ctx);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ legalize_block(&ctx, block);
+ }
*has_samp = ctx.has_samp;
*max_bary = ctx.max_bary;
+
+ do {
+ ir3_count_instructions(ir);
+ } while(resolve_jumps(ir));
+
+ mark_convergence_points(ir);
}
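A short worked example (illustrative ips) of why the count/resolve loop above iterates to a fixpoint:

	/* pass 1:  0010: jump #2         <- target two instrs ahead
	 *          0011: br p0.x, #1     <- branch to next instr: removed
	 *          0012: add.f ...
	 *
	 * pass 2: with the br gone, the add renumbers to ip 0011, so the
	 * jump now targets the next instruction and is removed as well;
	 * hence ir3_count_instructions() + resolve_jumps() repeat until a
	 * pass removes nothing.
	 */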
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c
index 965c834b8aa..f377982dd5e 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_print.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c
@@ -137,6 +137,16 @@ tab(int lvl)
printf("\t");
}
+static uint32_t
+block_id(struct ir3_block *block)
+{
+#ifdef DEBUG
+ return block->serialno;
+#else
+ return (uint32_t)(uint64_t)block;
+#endif
+}
+
static void
print_instr(struct ir3_instruction *instr, int lvl)
{
@@ -173,6 +183,14 @@ print_instr(struct ir3_instruction *instr, int lvl)
}
}
+ if (is_flow(instr) && instr->cat0.target) {
+ /* the predicate register src is implied: */
+ if (instr->opc == OPC_BR) {
+ printf(" %sp0.x", instr->cat0.inv ? "!" : "");
+ }
+ printf(", target=block%u", block_id(instr->cat0.target));
+ }
+
printf("\n");
}
@@ -184,19 +202,31 @@ void ir3_print_instr(struct ir3_instruction *instr)
static void
print_block(struct ir3_block *block, int lvl)
{
- tab(lvl); printf("block {\n");
+ tab(lvl); printf("block%u {\n", block_id(block));
list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
print_instr(instr, lvl+1);
}
+ if (block->successors[1]) {
+ /* leading into if/else: */
+ tab(lvl+1);
+ printf("/* succs: if _[");
+ print_instr_name(block->condition);
+ printf("] block%u; else block%u; */\n",
+ block_id(block->successors[0]),
+ block_id(block->successors[1]));
+ } else if (block->successors[0]) {
+ tab(lvl+1);
+ printf("/* succs: block%u; */\n",
+ block_id(block->successors[0]));
+ }
tab(lvl); printf("}\n");
}
void
ir3_print(struct ir3 *ir)
{
- struct ir3_block *block = ir->block;
-
- print_block(block, 0);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
+ print_block(block, 0);
for (unsigned i = 0; i < ir->noutputs; i++) {
if (!ir->outputs[i])
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index 394c63f646d..359cd9a0d5d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -29,6 +29,7 @@
#include "util/u_math.h"
#include "util/register_allocate.h"
#include "util/ralloc.h"
+#include "util/bitset.h"
#include "ir3.h"
#include "ir3_compiler.h"
@@ -255,6 +256,14 @@ struct ir3_ra_ctx {
unsigned *def, *use; /* def/use table */
};
+/* additional block-data (per-block) */
+struct ir3_ra_block_data {
+ BITSET_WORD *def; /* variables defined before used in block */
+ BITSET_WORD *use; /* variables used before defined in block */
+ BITSET_WORD *livein; /* which defs reach entry point of block */
+ BITSET_WORD *liveout; /* which defs reach exit point of block */
+};
+
static bool
is_half(struct ir3_instruction *instr)
{
@@ -369,7 +378,39 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off)
*sz = util_last_bit(instr->regs[0]->wrmask);
}
*off = 0;
- return instr;
+ d = instr;
+ }
+
+ if (d->regs[0]->flags & IR3_REG_PHI_SRC) {
+ struct ir3_instruction *phi = d->regs[0]->instr;
+ struct ir3_instruction *dd;
+ int dsz, doff;
+
+ dd = get_definer(phi, &dsz, &doff);
+
+ *sz = MAX2(*sz, dsz);
+ *off = doff;
+
+ if (dd->ip < d->ip) {
+ d = dd;
+ }
+ }
+
+ if (is_meta(d) && (d->opc == OPC_META_PHI)) {
+ /* we have already inserted parallel-copies into
+ * the phi, so we don't need to chase definers
+ */
+ struct ir3_register *src;
+
+ /* note: don't use foreach_ssa_src as this gets called once
+ * while assigning regs (which clears SSA flag)
+ */
+ foreach_src(src, d) {
+ if (!src->instr)
+ continue;
+ if (src->instr->ip < d->ip)
+ d = src->instr;
+ }
}
if (is_meta(d) && (d->opc == OPC_META_FO)) {
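A worked example of the phi handling in get_definer() above (value names illustrative): with the parallel-copies inserted by the scheduler,

	/*   a' = mov a       <- dst has IR3_REG_PHI_SRC, points at the phi
	 *   b' = mov b       <- likewise
	 *   x  = phi(a', b')
	 *
	 * get_definer(b') hops to the phi via the PHI_SRC link, then scans
	 * the phi srcs for the earliest ip (a'), so x, a' and b' all share
	 * one RA name and end up assigned the same register.
	 */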
@@ -396,13 +437,11 @@ static void
ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
- instr->ip = ctx->instr_cnt++;
- }
-
- list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
struct ir3_instruction *defn;
int cls, sz, off;
+ ctx->instr_cnt++;
+
if (instr->regs_count == 0)
continue;
@@ -431,8 +470,11 @@ static void
ra_init(struct ir3_ra_ctx *ctx)
{
ir3_clear_mark(ctx->ir);
+ ir3_count_instructions(ctx->ir);
- ra_block_name_instructions(ctx, ctx->ir->block);
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ ra_block_name_instructions(ctx, block);
+ }
/* figure out the base register name for each class. The
* actual ra name is class_base[cls] + instr->name;
@@ -448,6 +490,16 @@ ra_init(struct ir3_ra_ctx *ctx)
ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
}
+static unsigned
+ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
+{
+ unsigned name;
+ debug_assert(cls >= 0);
+ name = ctx->class_base[cls] + defn->name;
+ debug_assert(name < ctx->alloc_count);
+ return name;
+}
+
static void
ra_destroy(struct ir3_ra_ctx *ctx)
{
@@ -457,6 +509,18 @@ ra_destroy(struct ir3_ra_ctx *ctx)
static void
ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
+ struct ir3_ra_block_data *bd;
+ unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+
+ bd = rzalloc(ctx->g, struct ir3_ra_block_data);
+
+ bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words);
+ bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words);
+ bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words);
+ bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
+
+ block->bd = bd;
+
list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
struct ir3_instruction *src;
@@ -474,7 +538,15 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
* fanin: used to collect values from lower class and assemble
* them together into a higher class, for example arguments
* to texture sample instructions; We consider these to be
- * defined at the fanin node.
+ * defined at the earliest fanin source.
+ *
+ * phi: used to merge values from different flow control paths
+ * to the same reg. Consider defined at earliest phi src,
+ * and update all the other phi src's (which may come later
+ * in the program) as users to extend the var's live range.
+ *
+ * Most of this, other than phi, is completely handled in the
+ * get_definer() helper.
*
* In either case, we trace the instruction back to the original
* definer and consider that as the def/use ip.
@@ -491,11 +563,15 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
*/
cls = size_to_class(sz, is_half(defn));
if (cls >= 0) {
- unsigned name = ctx->class_base[cls] + defn->name;
+ unsigned name = ra_name(ctx, cls, defn);
+
ctx->def[name] = defn->ip;
ctx->use[name] = defn->ip;
- debug_assert(name < ctx->alloc_count);
+ /* since we are in SSA at this point: */
+ debug_assert(!BITSET_TEST(bd->use, name));
+
+ BITSET_SET(bd->def, name);
if (is_half(defn)) {
ra_set_node_class(ctx->g, name,
@@ -504,6 +580,24 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
ra_set_node_class(ctx->g, name,
ctx->set->classes[cls]);
}
+
+ /* extend the live range for phi srcs, which may come
+ * from the bottom of the loop
+ */
+ if (defn->regs[0]->flags & IR3_REG_PHI_SRC) {
+ struct ir3_instruction *phi = defn->regs[0]->instr;
+ foreach_ssa_src(src, phi) {
+ /* if src is after phi, then we need to extend
+ * the liverange to the end of src's block:
+ */
+ if (src->ip > phi->ip) {
+ struct ir3_instruction *last =
+ list_last_entry(&src->block->instr_list,
+ struct ir3_instruction, node);
+ ctx->use[name] = MAX2(ctx->use[name], last->ip);
+ }
+ }
+ }
}
}
}
@@ -516,12 +610,59 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
srcdefn = get_definer(src, &sz, &off);
cls = size_to_class(sz, is_half(srcdefn));
if (cls >= 0) {
- unsigned name = ctx->class_base[cls] + srcdefn->name;
- ctx->use[name] = instr->ip;
+ unsigned name = ra_name(ctx, cls, srcdefn);
+ ctx->use[name] = MAX2(ctx->use[name], instr->ip);
+ if (!BITSET_TEST(bd->def, name))
+ BITSET_SET(bd->use, name);
+ }
+ }
+ }
+ }
+}
+
+static bool
+ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
+{
+ unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+ bool progress = false;
+
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ struct ir3_ra_block_data *bd = block->bd;
+
+ /* update livein: */
+ for (unsigned i = 0; i < bitset_words; i++) {
+ BITSET_WORD new_livein =
+ (bd->use[i] | (bd->liveout[i] & ~bd->def[i]));
+
+ if (new_livein & ~bd->livein[i]) {
+ bd->livein[i] |= new_livein;
+ progress = true;
+ }
+ }
+
+ /* update liveout: */
+ for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
+ struct ir3_block *succ = block->successors[j];
+ struct ir3_ra_block_data *succ_bd;
+
+ if (!succ)
+ continue;
+
+ succ_bd = succ->bd;
+
+ for (unsigned i = 0; i < bitset_words; i++) {
+ BITSET_WORD new_liveout =
+ (succ_bd->livein[i] & ~bd->liveout[i]);
+
+ if (new_liveout) {
+ bd->liveout[i] |= new_liveout;
+ progress = true;
}
}
}
}
+
+ return progress;
}
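The loop above is the textbook backward liveness fixpoint, evaluated a bitset word at a time; in equation form (matching the C operators used):

	livein(b)  = use(b) | (liveout(b) & ~def(b))
	liveout(b) = union of livein(s) for each s in succ(b)

iterated by the caller until no block's sets change.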
static void
@@ -529,7 +670,34 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
{
struct ir3 *ir = ctx->ir;
- ra_block_compute_live_ranges(ctx, ctx->ir->block);
+ /* compute live ranges (use/def) on a block level, also updating
+ * block's def/use bitmasks (used below to calculate per-block
+ * livein/liveout):
+ */
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ ra_block_compute_live_ranges(ctx, block);
+ }
+
+ /* update per-block livein/liveout: */
+ while (ra_compute_livein_liveout(ctx)) {}
+
+ /* extend start/end ranges based on livein/liveout info from cfg: */
+ unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ struct ir3_ra_block_data *bd = block->bd;
+
+ for (unsigned i = 0; i < bitset_words; i++) {
+ if (BITSET_TEST(bd->livein, i)) {
+ ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
+ ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
+ }
+
+ if (BITSET_TEST(bd->liveout, i)) {
+ ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
+ ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
+ }
+ }
+ }
/* need to fix things up to keep outputs live: */
for (unsigned i = 0; i < ir->noutputs; i++) {
@@ -540,7 +708,7 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
defn = get_definer(instr, &sz, &off);
cls = size_to_class(sz, is_half(defn));
if (cls >= 0) {
- unsigned name = ctx->class_base[cls] + defn->name;
+ unsigned name = ra_name(ctx, cls, defn);
ctx->use[name] = ctx->instr_cnt;
}
}
@@ -555,23 +723,6 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
}
}
-static type_t half_type(type_t type)
-{
- switch (type) {
- case TYPE_F32: return TYPE_F16;
- case TYPE_U32: return TYPE_U16;
- case TYPE_S32: return TYPE_S16;
- /* instructions may already be fixed up: */
- case TYPE_F16:
- case TYPE_U16:
- case TYPE_S16:
- return type;
- default:
- assert(0);
- return ~0;
- }
-}
-
/* some instructions need fix-up if dst register is half precision: */
static void fixup_half_instr_dst(struct ir3_instruction *instr)
{
@@ -633,7 +784,7 @@ reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
defn = get_definer(instr, &sz, &off);
cls = size_to_class(sz, is_half(defn));
if (cls >= 0) {
- unsigned name = ctx->class_base[cls] + defn->name;
+ unsigned name = ra_name(ctx, cls, defn);
unsigned r = ra_get_node_reg(ctx->g, name);
unsigned num = ctx->set->ra_reg_to_gpr[r] + off;
@@ -641,7 +792,7 @@ reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
num += reg->offset;
reg->num = num;
- reg->flags &= ~IR3_REG_SSA;
+ reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
if (is_half(defn))
reg->flags |= IR3_REG_HALF;
@@ -686,8 +837,8 @@ ra_alloc(struct ir3_ra_ctx *ctx)
unsigned i = 0, j;
if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
struct ir3_instruction *instr = ir->inputs[i];
- unsigned cls = size_to_class(1, true);
- unsigned name = ctx->class_base[cls] + instr->name;
+ int cls = size_to_class(1, true);
+ unsigned name = ra_name(ctx, cls, instr);
unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
/* if we have frag_face, it gets hr0.x */
@@ -706,8 +857,7 @@ ra_alloc(struct ir3_ra_ctx *ctx)
unsigned name, reg;
cls = size_to_class(sz, is_half(defn));
- debug_assert(cls >= 0);
- name = ctx->class_base[cls] + defn->name;
+ name = ra_name(ctx, cls, defn);
reg = ctx->set->gpr_to_ra_reg[cls][j];
ra_set_node_reg(ctx->g, name, reg);
@@ -720,7 +870,9 @@ ra_alloc(struct ir3_ra_ctx *ctx)
if (!ra_allocate(ctx->g))
return -1;
- ra_block_alloc(ctx, ctx->ir->block);
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ ra_block_alloc(ctx, block);
+ }
return 0;
}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 0d404a83583..49a4426d163 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -205,6 +205,16 @@ instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
struct ir3_instruction *src;
unsigned delay = 0;
+ /* Phi instructions can have a dependency on something not
+ * scheduled yet (for example, loops). But OTOH we don't really
+ * care. By definition phi's should appear at the top of
+ * the block, and their sources should be values from the
+ * previously executing block, so they are always ready to
+ * be scheduled:
+ */
+ if (is_meta(instr) && (instr->opc == OPC_META_PHI))
+ return 0;
+
foreach_ssa_src(src, instr) {
/* if dependency not scheduled, we aren't ready yet: */
if (!is_scheduled(src))
@@ -422,13 +432,87 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
}
}
}
+
+ /* And lastly, insert branch/jump instructions to take us to
+ * the next block. Later we'll strip back out the branches
+ * that simply jump to next instruction.
+ */
+ if (block->successors[1]) {
+ /* if/else, conditional branches to "then" or "else": */
+ struct ir3_instruction *br;
+ unsigned delay = 6;
+
+ debug_assert(ctx->pred);
+ debug_assert(block->condition);
+
+ delay -= distance(ctx, ctx->pred, delay);
+
+ while (delay > 0) {
+ ir3_NOP(block);
+ delay--;
+ }
+
+ /* create "else" branch first (since "then" block should
+ * frequently/always end up being a fall-thru):
+ */
+ br = ir3_BR(block);
+ br->cat0.inv = true;
+ br->cat0.target = block->successors[1];
+
+ /* NOTE: we have to hard code delay of 6 above, since
+ * we want to insert the nop's before constructing the
+ * branch. Throw in an assert so we notice if this
+ * ever breaks on future generation:
+ */
+ debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
+
+ br = ir3_BR(block);
+ br->cat0.target = block->successors[0];
+
+ } else if (block->successors[0]) {
+ /* otherwise unconditional jump to next block: */
+ struct ir3_instruction *jmp;
+
+ jmp = ir3_JUMP(block);
+ jmp->cat0.target = block->successors[0];
+ }
+
+ /* NOTE: if we kept track of the predecessors, we could do a better
+ * job w/ (jp) flags.. every node w/ > predecessor is a join point.
+ * Note that as we eliminate blocks which contain only an unconditional
+ * jump we probably need to propagate (jp) flag..
+ */
+}
+
+/* this is needed to ensure later RA stage succeeds: */
+static void
+sched_insert_parallel_copies(struct ir3_block *block)
+{
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ if (is_meta(instr) && (instr->opc == OPC_META_PHI)) {
+ struct ir3_register *reg;
+ foreach_src(reg, instr) {
+ struct ir3_instruction *src = reg->instr;
+ struct ir3_instruction *mov =
+ ir3_MOV(src->block, src, TYPE_U32);
+ mov->regs[0]->flags |= IR3_REG_PHI_SRC;
+ mov->regs[0]->instr = instr;
+ reg->instr = mov;
+ }
+ }
+ }
}
int ir3_sched(struct ir3 *ir)
{
struct ir3_sched_ctx ctx = {0};
- ir3_clear_mark(ir->block->shader);
- sched_block(&ctx, ir->block);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ sched_insert_parallel_copies(block);
+ }
+ ir3_clear_mark(ir);
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ sched_block(&ctx, block);
+ }
if (ctx.error)
return -1;
return 0;
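For reference, an illustrative before/after of the parallel-copy insertion (value names hypothetical):

	/* before:                        after:
	 *
	 *   block1 { a = ... }             block1 { a = ...; a' = mov a }
	 *   block2 { b = ... }             block2 { b = ...; b' = mov b }
	 *   block3 { x = phi(a, b) }       block3 { x = phi(a', b') }
	 *
	 * each mov is appended to its source's block with IR3_REG_PHI_SRC
	 * set on its dst (pointing back at the phi), giving RA a
	 * per-predecessor copy point: the conventional way out of SSA.
	 */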