diff options
Diffstat (limited to 'src/mesa/drivers')
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_context.h | 5 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_defines.h | 5 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_misc_state.c | 90 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_state_upload.c | 9 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/genX_blorp_exec.c | 6 |
5 files changed, 109 insertions, 6 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 2ac443bf032..17639bf5995 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1219,6 +1219,9 @@ struct brw_context enum gen9_astc5x5_wa_tex_type gen9_astc5x5_wa_tex_mask; + /** Last rendering scale argument provided to brw_emit_hashing_mode(). */ + unsigned current_hash_scale; + __DRIcontext *driContext; struct intel_screen *screen; }; @@ -1265,6 +1268,8 @@ GLboolean brwCreateContext(gl_api api, */ void brw_workaround_depthstencil_alignment(struct brw_context *brw, GLbitfield clear_mask); +void brw_emit_hashing_mode(struct brw_context *brw, unsigned width, + unsigned height, unsigned scale); /* brw_object_purgeable.c */ void brw_init_object_purgeable_functions(struct dd_function_table *functions); diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 425f5534110..33d042be869 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -1570,6 +1570,11 @@ enum brw_pixel_shader_coverage_mask_mode { # define GEN9_SUBSLICE_HASHING_8x4 (2 << 8) # define GEN9_SUBSLICE_HASHING_16x16 (3 << 8) # define GEN9_SUBSLICE_HASHING_MASK_BITS REG_MASK(3 << 8) +# define GEN9_SLICE_HASHING_NORMAL (0 << 11) +# define GEN9_SLICE_HASHING_DISABLED (1 << 11) +# define GEN9_SLICE_HASHING_32x16 (2 << 11) +# define GEN9_SLICE_HASHING_32x32 (3 << 11) +# define GEN9_SLICE_HASHING_MASK_BITS REG_MASK(3 << 11) /* Predicate registers */ #define MI_PREDICATE_SRC0 0x2400 diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c index e73cadc5d3e..1291470d479 100644 --- a/src/mesa/drivers/dri/i965/brw_misc_state.c +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c @@ -602,6 +602,96 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline) } /** + * Update the pixel hashing modes that determine the balancing of PS threads + * across subslices and slices. + * + * \param width Width bound of the rendering area (already scaled down if \p + * scale is greater than 1). + * \param height Height bound of the rendering area (already scaled down if \p + * scale is greater than 1). + * \param scale The number of framebuffer samples that could potentially be + * affected by an individual channel of the PS thread. This is + * typically one for single-sampled rendering, but for operations + * like CCS resolves and fast clears a single PS invocation may + * update a huge number of pixels, in which case a finer + * balancing is desirable in order to maximally utilize the + * bandwidth available. UINT_MAX can be used as shorthand for + * "finest hashing mode available". + */ +void +brw_emit_hashing_mode(struct brw_context *brw, unsigned width, + unsigned height, unsigned scale) +{ + const struct gen_device_info *devinfo = &brw->screen->devinfo; + + if (devinfo->gen == 9) { + const uint32_t slice_hashing[] = { + /* Because all Gen9 platforms with more than one slice require + * three-way subslice hashing, a single "normal" 16x16 slice hashing + * block is guaranteed to suffer from substantial imbalance, with one + * subslice receiving twice as much work as the other two in the + * slice. + * + * The performance impact of that would be particularly severe when + * three-way hashing is also in use for slice balancing (which is the + * case for all Gen9 GT4 platforms), because one of the slices + * receives one every three 16x16 blocks in either direction, which + * is roughly the periodicity of the underlying subslice imbalance + * pattern ("roughly" because in reality the hardware's + * implementation of three-way hashing doesn't do exact modulo 3 + * arithmetic, which somewhat decreases the magnitude of this effect + * in practice). This leads to a systematic subslice imbalance + * within that slice regardless of the size of the primitive. The + * 32x32 hashing mode guarantees that the subslice imbalance within a + * single slice hashing block is minimal, largely eliminating this + * effect. + */ + GEN9_SLICE_HASHING_32x32, + /* Finest slice hashing mode available. */ + GEN9_SLICE_HASHING_NORMAL + }; + const uint32_t subslice_hashing[] = { + /* The 16x16 subslice hashing mode is used on non-LLC platforms to + * match the performance of previous Mesa versions. 16x16 has a + * slight cache locality benefit especially visible in the sampler L1 + * cache efficiency of low-bandwidth platforms, but it comes at the + * cost of greater subslice imbalance for primitives of dimensions + * approximately intermediate between 16x4 and 16x16. + */ + (devinfo->has_llc ? GEN9_SUBSLICE_HASHING_16x4 : + GEN9_SUBSLICE_HASHING_16x16), + /* Finest subslice hashing mode available. */ + GEN9_SUBSLICE_HASHING_8x4 + }; + /* Dimensions of the smallest hashing block of a given hashing mode. If + * the rendering area is smaller than this there can't possibly be any + * benefit from switching to this mode, so we optimize out the + * transition. + */ + const unsigned min_size[][2] = { + { 16, 4 }, + { 8, 4 } + }; + const unsigned idx = scale > 1; + + if (width > min_size[idx][0] || height > min_size[idx][1]) { + const uint32_t gt_mode = + (devinfo->num_slices == 1 ? 0 : + GEN9_SLICE_HASHING_MASK_BITS | slice_hashing[idx]) | + GEN9_SUBSLICE_HASHING_MASK_BITS | subslice_hashing[idx]; + + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_STALL_AT_SCOREBOARD | + PIPE_CONTROL_CS_STALL); + + brw_load_register_imm32(brw, GEN7_GT_MODE, gt_mode); + + brw->current_hash_scale = scale; + } + } +} + +/** * Misc invariant state packets */ void diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c index ac9ee2dabf1..7a2daf4a533 100644 --- a/src/mesa/drivers/dri/i965/brw_state_upload.c +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c @@ -133,12 +133,6 @@ brw_upload_initial_gpu_state(struct brw_context *brw) REG_MASK(GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC) | GEN9_FLOAT_BLEND_OPTIMIZATION_ENABLE | GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC); - - if (gen_device_info_is_9lp(devinfo)) { - brw_load_register_imm32(brw, GEN7_GT_MODE, - GEN9_SUBSLICE_HASHING_MASK_BITS | - GEN9_SUBSLICE_HASHING_16x16); - } } if (devinfo->gen >= 8) { @@ -543,6 +537,9 @@ brw_upload_pipeline_state(struct brw_context *brw, brw_select_pipeline(brw, pipeline); + if (pipeline == BRW_RENDER_PIPELINE && brw->current_hash_scale != 1) + brw_emit_hashing_mode(brw, UINT_MAX, UINT_MAX, 1); + if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) { /* Always re-emit all state. */ brw->NewGLState = ~0; diff --git a/src/mesa/drivers/dri/i965/genX_blorp_exec.c b/src/mesa/drivers/dri/i965/genX_blorp_exec.c index 8f06f5e9ef2..62a8310f68b 100644 --- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c +++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c @@ -343,6 +343,12 @@ retry: gen8_write_pma_stall_bits(brw, 0); #endif + const unsigned scale = params->fast_clear_op ? UINT_MAX : 1; + if (brw->current_hash_scale != scale) { + brw_emit_hashing_mode(brw, params->x1 - params->x0, + params->y1 - params->y0, scale); + } + blorp_emit(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) { rect.ClippedDrawingRectangleXMax = MAX2(params->x1, params->x0) - 1; rect.ClippedDrawingRectangleYMax = MAX2(params->y1, params->y0) - 1; |