aboutsummaryrefslogtreecommitdiffstats
path: root/src/gallium
diff options
context:
space:
mode:
author: Eric Anholt <[email protected]> 2014-12-01 11:48:20 -0800
committer: Eric Anholt <[email protected]> 2014-12-01 22:29:42 -0800
commit: 29c7cf2b2ba850cf467167548d53383e1338fd5c (patch)
tree: ff978d0d716844b991b5e235483efbc53f2c9e73 /src/gallium
parent: 7b0067d23a6f64cf83c42e7f11b2cd4100c569fe (diff)
vc4: Pair up QPU instructions when scheduling.
We've got two mostly-independent operations in each QPU instruction, so try to pack two operations together. This is fairly naive (doesn't track read and write separately in instructions, doesn't convert ADD-based MOVs into MUL-based MOVs, doesn't reorder across uniform loads), but does show a decent improvement on shader-db-2.

total instructions in shared programs: 59583 -> 57651 (-3.24%)
instructions in affected programs: 47361 -> 45429 (-4.08%)
Diffstat (limited to 'src/gallium')
-rw-r--r-- src/gallium/drivers/vc4/vc4_qpu.c | 62
-rw-r--r-- src/gallium/drivers/vc4/vc4_qpu.h | 2
-rw-r--r-- src/gallium/drivers/vc4/vc4_qpu_schedule.c | 79
3 files changed, 105 insertions, 38 deletions
diff --git a/src/gallium/drivers/vc4/vc4_qpu.c b/src/gallium/drivers/vc4/vc4_qpu.c
index 723b3613665..54c79e9d4f1 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.c
+++ b/src/gallium/drivers/vc4/vc4_qpu.c
@@ -192,36 +192,58 @@ qpu_m_alu2(enum qpu_op_mul op,
return inst;
}
-static uint64_t
-merge_fields(uint64_t merge,
- uint64_t add, uint64_t mul,
+static bool
+merge_fields(uint64_t *merge,
+ uint64_t a, uint64_t b,
uint64_t mask, uint64_t ignore)
{
- if ((add & mask) == ignore)
- return (merge & ~mask) | (mul & mask);
- else if ((mul & mask) == ignore)
- return (merge & ~mask) | (add & mask);
- else {
- assert((add & mask) == (mul & mask));
- return merge;
+ if ((a & mask) == ignore) {
+ *merge = (*merge & ~mask) | (b & mask);
+ } else if ((b & mask) == ignore) {
+ *merge = (*merge & ~mask) | (a & mask);
+ } else {
+ if ((a & mask) != (b & mask))
+ return false;
}
+
+ return true;
}
uint64_t
-qpu_inst(uint64_t add, uint64_t mul)
+qpu_merge_inst(uint64_t a, uint64_t b)
{
- uint64_t merge = ((add & ~QPU_WADDR_MUL_MASK) |
- (mul & ~QPU_WADDR_ADD_MASK));
+ uint64_t merge = a | b;
+ bool ok = true;
+
+ if (QPU_GET_FIELD(a, QPU_OP_ADD) != QPU_A_NOP &&
+ QPU_GET_FIELD(b, QPU_OP_ADD) != QPU_A_NOP)
+ return 0;
- merge = merge_fields(merge, add, mul, QPU_SIG_MASK,
- QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG));
+ if (QPU_GET_FIELD(a, QPU_OP_MUL) != QPU_M_NOP &&
+ QPU_GET_FIELD(b, QPU_OP_MUL) != QPU_M_NOP)
+ return 0;
- merge = merge_fields(merge, add, mul, QPU_RADDR_A_MASK,
- QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A));
- merge = merge_fields(merge, add, mul, QPU_RADDR_B_MASK,
- QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B));
+ ok = ok && merge_fields(&merge, a, b, QPU_SIG_MASK,
+ QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG));
- return merge;
+ /* Misc fields that have to match exactly. */
+ ok = ok && merge_fields(&merge, a, b, QPU_SF | QPU_WS | QPU_PM,
+ ~0);
+
+ ok = ok && merge_fields(&merge, a, b, QPU_RADDR_A_MASK,
+ QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A));
+ ok = ok && merge_fields(&merge, a, b, QPU_RADDR_B_MASK,
+ QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B));
+
+ ok = ok && merge_fields(&merge, a, b, QPU_WADDR_ADD_MASK,
+ QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_ADD));
+ ok = ok && merge_fields(&merge, a, b, QPU_WADDR_MUL_MASK,
+ QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_MUL));
+
+ if (ok)
+ return merge;
+ else
+ return 0;
}
uint64_t
diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h
index bf41f72c34b..eb06d1a0720 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.h
+++ b/src/gallium/drivers/vc4/vc4_qpu.h
@@ -129,7 +129,7 @@ uint64_t qpu_a_alu2(enum qpu_op_add op, struct qpu_reg dst,
struct qpu_reg src0, struct qpu_reg src1);
uint64_t qpu_m_alu2(enum qpu_op_mul op, struct qpu_reg dst,
struct qpu_reg src0, struct qpu_reg src1);
-uint64_t qpu_inst(uint64_t add, uint64_t mul);
+uint64_t qpu_merge_inst(uint64_t a, uint64_t b);
uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val);
uint64_t qpu_set_sig(uint64_t inst, uint32_t sig);
uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond);
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index f309034fba7..8aa83741ff5 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -465,7 +465,8 @@ get_instruction_priority(uint64_t inst)
static struct schedule_node *
choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
- struct simple_node *schedule_list)
+ struct simple_node *schedule_list,
+ uint64_t prev_inst)
{
struct schedule_node *chosen = NULL;
struct simple_node *node;
@@ -490,6 +491,15 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
if (pixel_scoreboard_too_soon(scoreboard, inst))
continue;
+ /* If we're trying to pair with another instruction, check
+ * that they're compatible.
+ */
+ if (prev_inst != 0) {
+ inst = qpu_merge_inst(prev_inst, inst);
+ if (!inst)
+ continue;
+ }
+
int prio = get_instruction_priority(inst);
/* Found a valid instruction. If nothing better comes along,
@@ -571,6 +581,23 @@ compute_delay(struct schedule_node *n)
}
static void
+mark_instruction_scheduled(struct simple_node *schedule_list,
+ struct schedule_node *node)
+{
+ if (!node)
+ return;
+
+ for (int i = node->child_count - 1; i >= 0; i--) {
+ struct schedule_node *child =
+ node->children[i];
+
+ child->parent_count--;
+ if (child->parent_count == 0)
+ insert_at_head(schedule_list, &child->link);
+ }
+}
+
+static void
schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
{
struct simple_node *node, *t;
@@ -598,7 +625,9 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
while (!is_empty_list(schedule_list)) {
struct schedule_node *chosen =
choose_instruction_to_schedule(&scoreboard,
- schedule_list);
+ schedule_list,
+ 0);
+ struct schedule_node *merge = NULL;
/* If there are no valid instructions to schedule, drop a NOP
* in.
@@ -610,12 +639,38 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
dump_state(schedule_list);
fprintf(stderr, "chose: ");
vc4_qpu_disasm(&inst, 1);
- fprintf(stderr, "\n\n");
+ fprintf(stderr, "\n");
}
- /* Schedule this instruction onto the QPU list. */
- if (chosen)
+ /* Schedule this instruction onto the QPU list. Also try to
+ * find an instruction to pair with it.
+ */
+ if (chosen) {
remove_from_list(&chosen->link);
+
+ merge = choose_instruction_to_schedule(&scoreboard,
+ schedule_list,
+ inst);
+ if (merge) {
+ remove_from_list(&merge->link);
+ inst = qpu_merge_inst(inst, merge->inst->inst);
+ assert(inst != 0);
+
+ if (debug) {
+ fprintf(stderr, "merging: ");
+ vc4_qpu_disasm(&merge->inst->inst, 1);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "resulting in: ");
+ vc4_qpu_disasm(&inst, 1);
+ fprintf(stderr, "\n");
+ }
+ }
+ }
+
+ if (debug) {
+ fprintf(stderr, "\n");
+ }
+
qpu_serialize_one_inst(c, inst);
update_scoreboard_for_chosen(&scoreboard, inst);
@@ -625,18 +680,8 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
* be scheduled. Update the children's unblocked time for this
* DAG edge as we do so.
*/
- if (chosen) {
- for (int i = chosen->child_count - 1; i >= 0; i--) {
- struct schedule_node *child =
- chosen->children[i];
-
- child->parent_count--;
- if (child->parent_count == 0) {
- insert_at_head(schedule_list,
- &child->link);
- }
- }
- }
+ mark_instruction_scheduled(schedule_list, chosen);
+ mark_instruction_scheduled(schedule_list, merge);
scoreboard.tick++;
}