From 4a1c8a3037cd29938b2a6e2c680c341e9903cfbe Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Sun, 27 Dec 2015 17:26:30 -0800 Subject: i965: Push most TES inputs in SIMD8 mode. Using the push model for inputs is much more efficient than pulling inputs - the hardware can simply copy a large chunk into URB registers at thread creation time, rather than having the thread send messages to request data from the L3 cache. Unfortunately, it's possible to have more TES inputs than fit in registers, so we have to fall back to the pull model in some cases. However, it turns out that most tessellation evaluation shaders are fairly simple, and don't use many inputs. An arbitrary cut-off of 32 vec4 slots (16 registers) is more than sufficient to ensure that 100% of TES inputs are pushed for Shadow of Mordor, Unigine Heaven, GPUTest/TessMark, and SynMark. Note that unlike most SIMD8 stages, this actually reads packed vec4 data, since that is what our vec4 TCS programs write. Improves performance in GPUTest's tessmark_x64 microbenchmark by 93.4426% +/- 5.35541% (n = 25) on my Lenovo X250 at 1024x768. Improves performance in Synmark's Gl40TerrainFlyTess microbenchmark by 22.74% +/- 0.309394% (n = 5). Improves performance in Shadow of Mordor at low settings with tessellation enabled at 1280x720 by 2.12197% +/- 0.478553% (n = 4). shader-db statistics for files containing tessellation shaders: total instructions in shared programs: 184358 -> 181181 (-1.72%) instructions in affected programs: 27971 -> 24794 (-11.36%) helped: 226 Signed-off-by: Kenneth Graunke Reviewed-by: Matt Turner --- src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 42 +++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 12 deletions(-) (limited to 'src') diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 788315f6c52..ad347fcdbaf 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -1849,15 +1849,33 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, fs_inst *inst; if (indirect_offset.file == BAD_FILE) { - /* Replicate the patch handle to all enabled channels */ - const fs_reg srcs[] = { - retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD) - }; - fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); - bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0); - - inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, patch_handle); - inst->mlen = 1; + /* Arbitrarily only push up to 32 vec4 slots worth of data, + * which is 16 registers (since each holds 2 vec4 slots). + */ + const unsigned max_push_slots = 32; + if (imm_offset < max_push_slots) { + fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type); + for (int i = 0; i < instr->num_components; i++) { + bld.MOV(offset(dest, bld, i), + component(src, 4 * (imm_offset % 2) + i)); + } + tes_prog_data->base.urb_read_length = + MAX2(tes_prog_data->base.urb_read_length, + DIV_ROUND_UP(imm_offset + 1, 2)); + } else { + /* Replicate the patch handle to all enabled channels */ + const fs_reg srcs[] = { + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD) + }; + fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0); + + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, patch_handle); + inst->mlen = 1; + inst->offset = imm_offset; + inst->base_mrf = -1; + inst->regs_written = instr->num_components; + } } else { /* Indirect indexing - use per-slot offsets as well. */ const fs_reg srcs[] = { @@ -1869,10 +1887,10 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, payload); inst->mlen = 2; + inst->offset = imm_offset; + inst->base_mrf = -1; + inst->regs_written = instr->num_components; } - inst->offset = imm_offset; - inst->base_mrf = -1; - inst->regs_written = instr->num_components; break; } default: -- cgit v1.2.3