diff options
author | Jose Maria Casanova Crespo <[email protected]> | 2019-07-02 18:31:09 +0200 |
---|---|---|
committer | Jose Maria Casanova Crespo <[email protected]> | 2019-07-22 03:00:50 +0200 |
commit | c341ab7ffbac822d3d3cbb3d3ae9d2a19ea3cc9a (patch) | |
tree | 70eba195aec80c8ba4d6f0f721fd957b386e933f | |
parent | f7224014df0d366453739356b9968ca94ad43979 (diff) |
v3d: add shader-db stat to count SFU stalls
SFU operations have a latency of 2 cycles, so if their results
are used in the cycle immediately following an SFU instruction,
the GPU stalls for an extra cycle until the result is available.
This adds the number of stalls to the shader-db debug output, along
with the sum of instructions + stalls, to help evaluate scheduling
optimizations that avoid generating sfu-stalls.
v2: Rename v3d_qpu_generates_sfu_stalls to v3d_qpu_instr_is_sfu (Eric)
Reviewed-by: Eric Anholt <[email protected]>
-rw-r--r-- | src/broadcom/compiler/qpu_schedule.c | 45 | ||||
-rw-r--r-- | src/broadcom/compiler/v3d_compiler.h | 1 | ||||
-rw-r--r-- | src/broadcom/compiler/vir.c | 7 | ||||
-rw-r--r-- | src/broadcom/qpu/qpu_instr.c | 34 | ||||
-rw-r--r-- | src/broadcom/qpu/qpu_instr.h | 1 |
5 files changed, 74 insertions, 14 deletions
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index b8e04f6ea13..370881b00ad 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -440,6 +440,8 @@ struct choose_scoreboard { struct dag *dag; int tick; int last_magic_sfu_write_tick; + int last_stallable_sfu_reg; + int last_stallable_sfu_tick; int last_ldvary_tick; int last_uniforms_reset_tick; int last_thrsw_tick; @@ -531,6 +533,33 @@ pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst)); } +static bool +qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, + uint32_t waddr) { + + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return false; + + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && + inst->raddr_a == waddr) + return true; + + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && + !inst->sig.small_imm && (inst->raddr_b == waddr)) + return true; + + return false; +} + +static bool +mux_read_stalls(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) +{ + return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 && + qpu_instruction_uses_rf(inst, + scoreboard->last_stallable_sfu_reg); +} + static int get_instruction_priority(const struct v3d_qpu_instr *inst) { @@ -852,6 +881,16 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard, } static void +update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) +{ + if (v3d_qpu_instr_is_sfu(inst)) { + scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr; + scoreboard->last_stallable_sfu_tick = scoreboard->tick; + } +} + +static void update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst) { @@ -864,6 +903,9 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, if (inst->alu.add.magic_write) { update_scoreboard_for_magic_waddr(scoreboard, inst->alu.add.waddr); + } 
else { + update_scoreboard_for_sfu_stall_waddr(scoreboard, + inst); } } @@ -1298,6 +1340,8 @@ schedule_instructions(struct v3d_compile *c, fprintf(stderr, "\n"); } } + if (mux_read_stalls(scoreboard, inst)) + c->qpu_inst_stalled_count++; } /* Update the uniform index for the rewritten location -- @@ -1481,6 +1525,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c) scoreboard.last_magic_sfu_write_tick = -10; scoreboard.last_uniforms_reset_tick = -10; scoreboard.last_thrsw_tick = -10; + scoreboard.last_stallable_sfu_tick = -10; if (debug) { fprintf(stderr, "Pre-schedule instructions\n"); diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index da32d47a28d..b61119f5615 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -613,6 +613,7 @@ struct v3d_compile { uint64_t *qpu_insts; uint32_t qpu_inst_count; uint32_t qpu_inst_size; + uint32_t qpu_inst_stalled_count; /* For the FS, the number of varying inputs not counting the * point/line varyings payload diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 04129fa522e..eed3fc18b12 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -947,7 +947,8 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, char *shaderdb; int ret = asprintf(&shaderdb, "%s shader: %d inst, %d threads, %d loops, " - "%d uniforms, %d max-temps, %d:%d spills:fills", + "%d uniforms, %d max-temps, %d:%d spills:fills, " + "%d sfu-stalls, %d inst-and-stalls", vir_get_stage_name(c), c->qpu_inst_count, c->threads, @@ -955,7 +956,9 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, c->num_uniforms, vir_get_max_temps(c), c->spills, - c->fills); + c->fills, + c->qpu_inst_stalled_count, + c->qpu_inst_count + c->qpu_inst_stalled_count); if (ret >= 0) { if (V3D_DEBUG & V3D_DEBUG_SHADERDB) fprintf(stderr, "SHADER-DB: %s\n", shaderdb); diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c 
index 66e53a6accd..09d06b3fa3e 100644 --- a/src/broadcom/qpu/qpu_instr.c +++ b/src/broadcom/qpu/qpu_instr.c @@ -645,19 +645,10 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) { - if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { - switch (inst->alu.add.op) { - case V3D_QPU_A_RECIP: - case V3D_QPU_A_RSQRT: - case V3D_QPU_A_EXP: - case V3D_QPU_A_LOG: - case V3D_QPU_A_SIN: - case V3D_QPU_A_RSQRT2: - return true; - default: - break; - } + if (v3d_qpu_instr_is_sfu(inst)) + return true; + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { if (inst->alu.add.magic_write && v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) { return true; @@ -673,6 +664,25 @@ v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) } bool +v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) +{ + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + switch (inst->alu.add.op) { + case V3D_QPU_A_RECIP: + case V3D_QPU_A_RSQRT: + case V3D_QPU_A_EXP: + case V3D_QPU_A_LOG: + case V3D_QPU_A_SIN: + case V3D_QPU_A_RSQRT2: + return true; + default: + return false; + } + } + return false; +} + +bool v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst) { return (inst->type == V3D_QPU_INSTR_TYPE_ALU && diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h index 968d0f6fd65..ad2d37b6051 100644 --- a/src/broadcom/qpu/qpu_instr.h +++ b/src/broadcom/qpu/qpu_instr.h @@ -447,6 +447,7 @@ bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_writes_tmu_not_tmuc(const struct 
v3d_qpu_instr *inst) ATTRIBUTE_CONST; |