author    | Eric Anholt <[email protected]> | 2018-12-14 14:46:48 -0800
committer | Eric Anholt <[email protected]> | 2018-12-14 17:48:01 -0800
commit    | 29927e7524b07d491c555b8ed06c9b89cd0856f8 (patch)
tree      | d56ee7ab4e0a9067a7f9daa65fb2ba1559847e51 /src/broadcom/compiler/nir_to_vir.c
parent    | 248a7fb392ba9ed0f3d25b599e214b456cefa910 (diff)
v3d: Drop in a bunch of notes about performance improvement opportunities.
These have all been floating in my head, and while I've thought about
encoding them in issues on gitlab once they're enabled, they also make
sense to just have in the area of the code you'll need to work in.
Diffstat (limited to 'src/broadcom/compiler/nir_to_vir.c')
-rw-r--r-- | src/broadcom/compiler/nir_to_vir.c | 35
1 file changed, 34 insertions, 1 deletion
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 446ac53e95f..484dc050368 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -850,6 +850,9 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
                 break;
 
         case nir_op_unpack_half_2x16_split_x:
+                /* XXX perf: It would be good to be able to merge this unpack
+                 * with whatever uses our result.
+                 */
                 result = vir_FMOV(c, src[0]);
                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
                 break;
@@ -1489,6 +1492,10 @@ ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
 static void
 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
 {
+        /* XXX perf: Experiment with using immediate loads to avoid having
+         * these end up in the uniform stream.  Watch out for breaking the
+         * small immediates optimization in the process!
+         */
         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
         for (int i = 0; i < instr->def.num_components; i++)
                 qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
@@ -1535,6 +1542,11 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 for (int i = 0; i < instr->num_components; i++) {
                         int ubo = nir_src_as_uint(instr->src[0]);
 
+                        /* XXX perf: On V3D 4.x with uniform offsets, we
+                         * should probably try setting UBOs up in the A
+                         * register file and doing a sequence of loads that
+                         * way.
+                         */
                         /* Adjust for where we stored the TGSI register base. */
                         vir_ADD_dest(c,
                                      vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
@@ -1669,6 +1681,12 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 
 /* Clears (activates) the execute flags for any channels whose jump target
  * matches this block.
+ *
+ * XXX perf: Could we be using flpush/flpop somehow for our execution channel
+ * enabling?
+ *
+ * XXX perf: For uniform control flow, we should be able to skip c->execute
+ * handling entirely.
  */
 static void
 ntq_activate_execute_for_block(struct v3d_compile *c)
@@ -1704,6 +1722,10 @@ ntq_emit_if(struct v3d_compile *c, nir_if *if_stmt)
         /* Set A for executing (execute == 0) and jumping (if->condition ==
          * 0) channels, and then update execute flags for those to point to
         * the ELSE block.
+         *
+         * XXX perf: we could reuse ntq_emit_comparison() to generate our if
+         * condition, and the .uf field to ignore non-executing channels, to
+         * reduce the overhead of if statements.
          */
         vir_PF(c, vir_OR(c,
                          c->execute,
@@ -1925,6 +1947,10 @@ nir_to_vir(struct v3d_compile *c)
                 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
                 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
 
+                /* XXX perf: We could set the "disable implicit point/line
+                 * varyings" field in the shader record and not emit these, if
+                 * they're not going to be used.
+                 */
                 if (c->fs_key->is_points) {
                         c->point_x = emit_fragment_varying(c, NULL, 0);
                         c->point_y = emit_fragment_varying(c, NULL, 0);
@@ -2119,7 +2145,14 @@ v3d_nir_to_vir(struct v3d_compile *c)
 
         vir_check_payload_w(c);
 
-        /* XXX: vir_schedule_instructions(c); */
+        /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
+         * We used that on that platform to pipeline TMU writes and reduce the
+         * number of thread switches, as well as try (mostly successfully) to
+         * reduce maximum register pressure to allow more threads.  We should
+         * do something of that sort for V3D -- either instruction scheduling
+         * here, or delay the THRSW and LDTMUs from our texture
+         * instructions until the results are needed.
+         */
 
         if (V3D_DEBUG & (V3D_DEBUG_VIR |
                          v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
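
As an illustration of the first note above (merging the unpack_half_2x16_split_x unpack with whatever uses its result), one possible shape for such a peephole is sketched here. This is only a hypothetical sketch, not code from this commit or the driver: it assumes the internal struct qinst / struct qreg types from v3d_compiler.h, try_fold_unpack_into_use() is an invented helper, and vir_can_set_unpack() is an assumed capability check that does not exist in VIR.

#include "v3d_compiler.h"

/* Hypothetical peephole sketch: if an instruction consuming the FMOV that
 * was emitted for nir_op_unpack_half_2x16_split_x can take an input unpack
 * on that source, fold the unpack onto the consumer so the standalone MOV
 * becomes dead.
 */
static bool
try_fold_unpack_into_use(struct qinst *mov, struct qinst *use, int src_slot)
{
        /* vir_can_set_unpack() is an assumed helper, not a real VIR API: it
         * would report whether this instruction/source supports an input
         * unpack modifier.
         */
        if (!vir_can_set_unpack(use, src_slot))
                return false;

        /* Read the packed f16 value directly and unpack the low half at the
         * consumer, instead of going through the intermediate FMOV.
         */
        use->src[src_slot] = mov->src[0];
        vir_set_unpack(use, src_slot, V3D_QPU_UNPACK_L);
        return true;
}

A pass built around this would still need to verify that the MOV's destination has no other uses before deleting it; the sketch only shows the folding step itself.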