diff options
author | Kenneth Graunke <[email protected]> | 2014-02-10 17:40:24 -0800 |
---|---|---|
committer | Kenneth Graunke <[email protected]> | 2014-02-19 15:42:54 -0800 |
commit | 0c5873c9b9cfe1e7e689b2ef92a205aeee03763b (patch) | |
tree | 5f37ec498d504cf1941c8992bcccc1ad5b145b78 /src/mesa/drivers/dri/i965/gen8_ps_state.c | |
parent | 61d7ea4b16b1d5effd273027c21cf64841b67b78 (diff) |
i965: Only use the SIMD16 program for per-sample shading on Broadwell.
This restriction carries forward from earlier platforms. The code is
ported straight from gen7_wm_state.c.
v2: Actually do it right.
v3: Add missing _NEW_MULTISAMPLE bit (caught by Eric).
Signed-off-by: Kenneth Graunke <[email protected]>
Reviewed-by: Eric Anholt <[email protected]>
Reviewed-by: Anuj Phogat <[email protected]>
Diffstat (limited to 'src/mesa/drivers/dri/i965/gen8_ps_state.c')
-rw-r--r-- | src/mesa/drivers/dri/i965/gen8_ps_state.c | 41 |
1 files changed, 32 insertions, 9 deletions
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index c25b82757c1..c2810bb21b9 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -180,10 +180,6 @@ upload_ps_state(struct brw_context *brw) if (brw->wm.prog_data->base.nr_params > 0) dw6 |= GEN7_PS_PUSH_CONSTANT_ENABLE; - dw6 |= GEN7_PS_8_DISPATCH_ENABLE; - if (brw->wm.prog_data->prog_offset_16) - dw6 |= GEN7_PS_16_DISPATCH_ENABLE; - /* From the documentation for this packet: * "If the PS kernel does not need the Position XY Offsets to * compute a Position Value, then this field should be programmed @@ -202,13 +198,40 @@ upload_ps_state(struct brw_context *brw) else dw6 |= GEN7_PS_POSOFFSET_NONE; - dw7 |= - brw->wm.prog_data->first_curbe_grf << GEN7_PS_DISPATCH_START_GRF_SHIFT_0 | - brw->wm.prog_data->first_curbe_grf_16<< GEN7_PS_DISPATCH_START_GRF_SHIFT_2; + /* _NEW_MULTISAMPLE + * In case of non 1x per sample shading, only one of SIMD8 and SIMD16 + * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader + * is successfully compiled. In the majority of cases that brings us + * better performance than 'SIMD8 only' dispatch. 
+ */ + int min_invocations_per_fragment = + _mesa_get_min_invocations_per_fragment(ctx, brw->fragment_program, false); + assert(min_invocations_per_fragment >= 1); + + if (brw->wm.prog_data->prog_offset_16) { + dw6 |= GEN7_PS_16_DISPATCH_ENABLE; + if (min_invocations_per_fragment == 1) { + dw6 |= GEN7_PS_8_DISPATCH_ENABLE; + dw7 |= (brw->wm.prog_data->first_curbe_grf << + GEN7_PS_DISPATCH_START_GRF_SHIFT_0); + dw7 |= (brw->wm.prog_data->first_curbe_grf_16 << + GEN7_PS_DISPATCH_START_GRF_SHIFT_2); + } else { + dw7 |= (brw->wm.prog_data->first_curbe_grf_16 << + GEN7_PS_DISPATCH_START_GRF_SHIFT_0); + } + } else { + dw6 |= GEN7_PS_8_DISPATCH_ENABLE; + dw7 |= (brw->wm.prog_data->first_curbe_grf << + GEN7_PS_DISPATCH_START_GRF_SHIFT_0); + } BEGIN_BATCH(12); OUT_BATCH(_3DSTATE_PS << 16 | (12 - 2)); - OUT_BATCH(brw->wm.base.prog_offset); + if (brw->wm.prog_data->prog_offset_16 && min_invocations_per_fragment > 1) + OUT_BATCH(brw->wm.base.prog_offset + brw->wm.prog_data->prog_offset_16); + else + OUT_BATCH(brw->wm.base.prog_offset); OUT_BATCH(0); OUT_BATCH(dw3); if (brw->wm.prog_data->total_scratch) { @@ -230,7 +253,7 @@ upload_ps_state(struct brw_context *brw) const struct brw_tracked_state gen8_ps_state = { .dirty = { - .mesa = _NEW_PROGRAM_CONSTANTS, + .mesa = _NEW_PROGRAM_CONSTANTS | _NEW_MULTISAMPLE, .brw = BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_PS_BINDING_TABLE | BRW_NEW_BATCH | |