diff options
author | Francisco Jerez <[email protected]> | 2016-06-13 14:56:22 -0700 |
---|---|---|
committer | Emil Velikov <[email protected]> | 2016-06-15 09:29:12 +0100 |
commit | 2cf78b485137a74ed3a7184f48928f3990883928 (patch) | |
tree | 54ad3a3195eb1e1e2411d8f34ab94c0abce6bdc4 | |
parent | b9f69df93dc6680b2f25477d26f462aec2a76241 (diff) |
i965: Keep track of the per-thread scratch allocation in brw_stage_state.
This will be used to determine the per-thread slot size with which a
previously allocated scratch BO was used, so that a hardware race
condition can be fixed without introducing additional stalls or memory allocations.
Instead of calling brw_get_scratch_bo() manually from the various
codegen functions, call a new helper function that keeps track of the
per-thread scratch size and conditionally allocates a larger scratch
BO.
v2: Handle BO allocation manually instead of relying on
brw_get_scratch_bo (Ken).
Cc: <[email protected]>
Reviewed-by: Kenneth Graunke <[email protected]>
(cherry picked from commit d960284e447df9b1563deef0ce950617decfba63)
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_context.h | 10 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_cs.c | 48 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_gs.c | 8 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_program.c | 22 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_tcs.c | 8 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_tes.c | 8 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_vs.c | 8 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_wm.c | 7 |
8 files changed, 70 insertions, 49 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index daa9ed2c7e1..9618b4a8182 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -677,6 +677,12 @@ struct brw_stage_state */ drm_intel_bo *scratch_bo; + /** + * Scratch slot size allocated for each thread in the buffer object given + * by \c scratch_bo. + */ + uint32_t per_thread_scratch; + /** Offset in the program cache to the program */ uint32_t prog_offset; @@ -1481,6 +1487,10 @@ brw_get_scratch_size(int size) } void brw_get_scratch_bo(struct brw_context *brw, drm_intel_bo **scratch_bo, int size); +void brw_alloc_stage_scratch(struct brw_context *brw, + struct brw_stage_state *stage_state, + unsigned per_thread_size, + unsigned thread_count); void brw_init_shader_time(struct brw_context *brw); int brw_get_shader_time_index(struct brw_context *brw, struct gl_shader_program *shader_prog, diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c index 22856b64179..45128bc439a 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.c +++ b/src/mesa/drivers/dri/i965/brw_cs.c @@ -148,31 +148,29 @@ brw_codegen_cs_prog(struct brw_context *brw, } } - if (prog_data.base.total_scratch) { - const unsigned subslices = MAX2(brw->intelScreen->subslice_total, 1); - - /* WaCSScratchSize:hsw - * - * Haswell's scratch space address calculation appears to be sparse - * rather than tightly packed. The Thread ID has bits indicating - * which subslice, EU within a subslice, and thread within an EU - * it is. There's a maximum of two slices and two subslices, so these - * can be stored with a single bit. Even though there are only 10 EUs - * per subslice, this is stored in 4 bits, so there's an effective - * maximum value of 16 EUs. Similarly, although there are only 7 - * threads per EU, this is stored in a 3 bit number, giving an effective - * maximum value of 8 threads per EU. 
- * - * This means that we need to use 16 * 8 instead of 10 * 7 for the - * number of threads per subslice. - */ - const unsigned scratch_ids_per_subslice = - brw->is_haswell ? 16 * 8 : brw->max_cs_threads; - - brw_get_scratch_bo(brw, &brw->cs.base.scratch_bo, - prog_data.base.total_scratch * - scratch_ids_per_subslice * subslices); - } + const unsigned subslices = MAX2(brw->intelScreen->subslice_total, 1); + + /* WaCSScratchSize:hsw + * + * Haswell's scratch space address calculation appears to be sparse + * rather than tightly packed. The Thread ID has bits indicating + * which subslice, EU within a subslice, and thread within an EU + * it is. There's a maximum of two slices and two subslices, so these + * can be stored with a single bit. Even though there are only 10 EUs + * per subslice, this is stored in 4 bits, so there's an effective + * maximum value of 16 EUs. Similarly, although there are only 7 + * threads per EU, this is stored in a 3 bit number, giving an effective + * maximum value of 8 threads per EU. + * + * This means that we need to use 16 * 8 instead of 10 * 7 for the + * number of threads per subslice. + */ + const unsigned scratch_ids_per_subslice = + brw->is_haswell ? 
16 * 8 : brw->max_cs_threads; + + brw_alloc_stage_scratch(brw, &brw->cs.base, + prog_data.base.total_scratch, + scratch_ids_per_subslice * subslices); if (unlikely(INTEL_DEBUG & DEBUG_CS)) fprintf(stderr, "\n"); diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c index 7ead182da6c..4ac1009447c 100644 --- a/src/mesa/drivers/dri/i965/brw_gs.c +++ b/src/mesa/drivers/dri/i965/brw_gs.c @@ -180,11 +180,9 @@ brw_codegen_gs_prog(struct brw_context *brw, } /* Scratch space is used for register spilling */ - if (prog_data.base.base.total_scratch) { - brw_get_scratch_bo(brw, &stage_state->scratch_bo, - prog_data.base.base.total_scratch * - brw->max_gs_threads); - } + brw_alloc_stage_scratch(brw, stage_state, + prog_data.base.base.total_scratch, + brw->max_gs_threads); brw_upload_cache(&brw->cache, BRW_CACHE_GS_PROG, key, sizeof(*key), diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index 792f81b80aa..a1a81165ab1 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -345,6 +345,28 @@ brw_get_scratch_bo(struct brw_context *brw, } } +/** + * Reserve enough scratch space for the given stage to hold \p per_thread_size + * bytes times the given \p thread_count. 
+ */ +void +brw_alloc_stage_scratch(struct brw_context *brw, + struct brw_stage_state *stage_state, + unsigned per_thread_size, + unsigned thread_count) +{ + if (stage_state->per_thread_scratch < per_thread_size) { + stage_state->per_thread_scratch = per_thread_size; + + if (stage_state->scratch_bo) + drm_intel_bo_unreference(stage_state->scratch_bo); + + stage_state->scratch_bo = + drm_intel_bo_alloc(brw->bufmgr, "shader scratch space", + per_thread_size * thread_count, 4096); + } +} + void brwInitFragProgFuncs( struct dd_function_table *functions ) { assert(functions->ProgramStringNotify == _tnl_program_string); diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c index 83fc15754e1..7fc8eb40726 100644 --- a/src/mesa/drivers/dri/i965/brw_tcs.c +++ b/src/mesa/drivers/dri/i965/brw_tcs.c @@ -294,11 +294,9 @@ brw_codegen_tcs_prog(struct brw_context *brw, } /* Scratch space is used for register spilling */ - if (prog_data.base.base.total_scratch) { - brw_get_scratch_bo(brw, &stage_state->scratch_bo, - prog_data.base.base.total_scratch * - brw->max_hs_threads); - } + brw_alloc_stage_scratch(brw, stage_state, + prog_data.base.base.total_scratch, + brw->max_hs_threads); brw_upload_cache(&brw->cache, BRW_CACHE_TCS_PROG, key, sizeof(*key), diff --git a/src/mesa/drivers/dri/i965/brw_tes.c b/src/mesa/drivers/dri/i965/brw_tes.c index a4cd4daadde..d7b3e69a269 100644 --- a/src/mesa/drivers/dri/i965/brw_tes.c +++ b/src/mesa/drivers/dri/i965/brw_tes.c @@ -214,11 +214,9 @@ brw_codegen_tes_prog(struct brw_context *brw, } /* Scratch space is used for register spilling */ - if (prog_data.base.base.total_scratch) { - brw_get_scratch_bo(brw, &stage_state->scratch_bo, - prog_data.base.base.total_scratch * - brw->max_ds_threads); - } + brw_alloc_stage_scratch(brw, stage_state, + prog_data.base.base.total_scratch, + brw->max_ds_threads); brw_upload_cache(&brw->cache, BRW_CACHE_TES_PROG, key, sizeof(*key), diff --git a/src/mesa/drivers/dri/i965/brw_vs.c 
b/src/mesa/drivers/dri/i965/brw_vs.c index abf03b1fb7a..d929f9b403b 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -208,11 +208,9 @@ brw_codegen_vs_prog(struct brw_context *brw, } /* Scratch space is used for register spilling */ - if (prog_data.base.base.total_scratch) { - brw_get_scratch_bo(brw, &brw->vs.base.scratch_bo, - prog_data.base.base.total_scratch * - brw->max_vs_threads); - } + brw_alloc_stage_scratch(brw, &brw->vs.base, + prog_data.base.base.total_scratch, + brw->max_vs_threads); brw_upload_cache(&brw->cache, BRW_CACHE_VS_PROG, key, sizeof(struct brw_vs_prog_key), diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index c9c5d5e443e..46839bc526e 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -163,10 +163,9 @@ brw_codegen_wm_prog(struct brw_context *brw, } } - if (prog_data.base.total_scratch) { - brw_get_scratch_bo(brw, &brw->wm.base.scratch_bo, - prog_data.base.total_scratch * brw->max_wm_threads); - } + brw_alloc_stage_scratch(brw, &brw->wm.base, + prog_data.base.total_scratch, + brw->max_wm_threads); if (unlikely(INTEL_DEBUG & DEBUG_WM)) fprintf(stderr, "\n"); |