diff options
author | Kenneth Graunke <[email protected]> | 2016-11-07 17:12:54 -0800 |
---|---|---|
committer | Kenneth Graunke <[email protected]> | 2016-11-09 15:30:59 -0800 |
commit | aaee3daa90578fb711cc89186a65bc3d2c68022f (patch) | |
tree | 698e68fa8c4ecddb185f20525b27dc51fb006749 /src/intel/common/gen_device_info.c | |
parent | 1d6fe13c138efb836a28052b16260a258d113827 (diff) |
mesa: Fix pixel shader scratch space allocation on Gen9+ platforms.
We had missed a bit of errata - PS scratch needs to be computed as if
there were 4 subslices per slice, rather than 3.
                        |       Skylake       |  Broxton  |          Kabylake
                        | GT1  GT2  GT3  GT4  | 2x6  3x6  | GT1  GT1.5  GT2  GT3  GT4
Actual Slices           |  1    1    2    3   |  1    1   |  1     1     1    2    3
Total Subslices         |  3    3    6    9   |  2    3   |  2     3     3    6    9
Subsl. for PS Scratch   |  4    4    8   12   |  4    4   |  4     4     4    8   12
Note that Skylake GT1-3 already worked because we allocated 64 * 9
(trying to use a value that would work on GT4, with 9 subslices),
and the actual required values were 64 * 4 or 64 * 8. However, all
others (Skylake GT4, Broxton, and Kabylake GT1-4) underallocated,
which can lead to scratch writes trashing random process memory,
and rendering corruption or GPU hangs.
Fixes GPU hangs and rendering corruption on Skylake GT4 in shaders that
spill. Particularly, dEQP-GLES31.functional.ubo.all_per_block_buffers.*
now runs successfully with no hangs and renders correctly. This may
fix problems on Broxton and Kabylake as well.
Cc: "13.0" <[email protected]>
Signed-off-by: Kenneth Graunke <[email protected]>
Reviewed-by: Ben Widawsky <[email protected]>
Diffstat (limited to 'src/intel/common/gen_device_info.c')
-rw-r--r-- | src/intel/common/gen_device_info.c | 33 |
1 files changed, 19 insertions, 14 deletions
diff --git a/src/intel/common/gen_device_info.c b/src/intel/common/gen_device_info.c index 30df0b27dc1..1dc17690dc1 100644 --- a/src/intel/common/gen_device_info.c +++ b/src/intel/common/gen_device_info.c @@ -335,7 +335,6 @@ static const struct gen_device_info gen_device_info_chv = { .max_gs_threads = 336, \ .max_tcs_threads = 336, \ .max_tes_threads = 336, \ - .max_wm_threads = 64 * 9, \ .max_cs_threads = 56, \ .urb = { \ .size = 384, \ @@ -388,7 +387,6 @@ static const struct gen_device_info gen_device_info_bxt = { .max_tcs_threads = 112, .max_tes_threads = 112, .max_gs_threads = 112, - .max_wm_threads = 64 * 3, .max_cs_threads = 6 * 6, .urb = { .size = 192, @@ -411,7 +409,6 @@ static const struct gen_device_info gen_device_info_bxt_2x6 = { .max_tcs_threads = 56, /* XXX: guess */ .max_tes_threads = 56, .max_gs_threads = 56, - .max_wm_threads = 64 * 2, .max_cs_threads = 6 * 6, .urb = { .size = 128, @@ -427,18 +424,11 @@ static const struct gen_device_info gen_device_info_bxt_2x6 = { * There's no KBL entry. Using the default SKL (GEN9) GS entries value. */ -/* - * Both SKL and KBL support a maximum of 64 threads per - * Pixel Shader Dispatch (PSD) unit. 
- */ -#define KBL_MAX_THREADS_PER_PSD 64 - static const struct gen_device_info gen_device_info_kbl_gt1 = { GEN9_FEATURES, .gt = 1, .max_cs_threads = 7 * 6, - .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 2, .urb.size = 192, .num_slices = 1, }; @@ -448,7 +438,6 @@ static const struct gen_device_info gen_device_info_kbl_gt1_5 = { .gt = 1, .max_cs_threads = 7 * 6, - .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3, .num_slices = 1, }; @@ -456,7 +445,6 @@ static const struct gen_device_info gen_device_info_kbl_gt2 = { GEN9_FEATURES, .gt = 2, - .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3, .num_slices = 1, }; @@ -464,7 +452,6 @@ static const struct gen_device_info gen_device_info_kbl_gt3 = { GEN9_FEATURES, .gt = 3, - .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 6, .num_slices = 2, }; @@ -472,7 +459,6 @@ static const struct gen_device_info gen_device_info_kbl_gt4 = { GEN9_FEATURES, .gt = 4, - .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 9, /* * From the "L3 Allocation and Programming" documentation: * @@ -500,6 +486,25 @@ gen_get_device_info(int devid, struct gen_device_info *devinfo) return false; } + /* From the Skylake PRM, 3DSTATE_PS::Scratch Space Base Pointer: + * + * "Scratch Space per slice is computed based on 4 sub-slices. SW must + * allocate scratch space enough so that each slice has 4 slices allowed." + * + * The equivalent internal documentation says that this programming note + * applies to all Gen9+ platforms. + * + * The hardware typically calculates the scratch space pointer by taking + * the base address, and adding per-thread-scratch-space * thread ID. + * Extra padding can be necessary depending how the thread IDs are + * calculated for a particular shader stage. + */ + if (devinfo->gen >= 9) { + devinfo->max_wm_threads = 64 /* threads-per-PSD */ + * devinfo->num_slices + * 4; /* effective subslices per slice */ + } + return true; } |