mesa: Fix pixel shader scratch space allocation on Gen9+ platforms.

We had missed a bit of errata - PS scratch needs to be computed as if there were 4 subslices per slice, rather than 3. Skylake Broxton Kabylake GT1 GT2 GT3 GT4 2x6 3x6 GT1 GT1.5 GT2 GT3 GT4 Actual Slices 1 1 2 3 1 1 1 1 1 2 3 Total Subslices 3 3 6 9 2 3 2 3 3 6 9 Subsl. for PS Scratch 4 4 8 12 4 4 4 4 4 8 12 Note that Skylake GT1-3 already worked because we allocated 64 * 9 (trying to use a value that would work on GT4, with 9 subslices), and the actual required values were 64 * 4 or 64 * 8. However, all others (Skylake GT4, Broxton, and Kabylake GT1-4) underallocated, which can lead to scratch writes trashing random process memory, and rendering corruption or GPU hangs. Fixes GPU hangs and rendering corruption on Skylake GT4 in shaders that spill. Particularly, dEQP-GLES31.functional.ubo.all_per_block_buffers.* now runs successfully with no hangs and renders correctly. This may fix problems on Broxton and Kabylake as well. Cc: "13.0" <[email protected]> Signed-off-by: Kenneth Graunke <[email protected]> Reviewed-by: Ben Widawsky <[email protected]>
author: Kenneth Graunke <[email protected]> 2016-11-07 17:12:54 -0800
committer: Kenneth Graunke <[email protected]> 2016-11-09 15:30:59 -0800
commit: aaee3daa90578fb711cc89186a65bc3d2c68022f (patch)
tree: 698e68fa8c4ecddb185f20525b27dc51fb006749 /src/intel
parent: 1d6fe13c138efb836a28052b16260a258d113827 (diff)
1 files changed, 19 insertions, 14 deletions
diff --git a/src/intel/common/gen_device_info.c b/src/intel/common/gen_device_info.c
index 30df0b27dc1..1dc17690dc1 100644
--- a/src/intel/common/gen_device_info.c
+++ b/src/intel/common/gen_device_info.c
@@ -335,7 +335,6 @@ static const struct gen_device_info gen_device_info_chv = {
    .max_gs_threads = 336,                           \
    .max_tcs_threads = 336,                          \
    .max_tes_threads = 336,                          \
-   .max_wm_threads = 64 * 9,                        \
    .max_cs_threads = 56,                            \
    .urb = {                                         \
       .size = 384,                                  \
@@ -388,7 +387,6 @@ static const struct gen_device_info gen_device_info_bxt = {
    .max_tcs_threads = 112,
    .max_tes_threads = 112,
    .max_gs_threads = 112,
-   .max_wm_threads = 64 * 3,
    .max_cs_threads = 6 * 6,
    .urb = {
       .size = 192,
@@ -411,7 +409,6 @@ static const struct gen_device_info gen_device_info_bxt_2x6 = {
    .max_tcs_threads = 56, /* XXX: guess */
    .max_tes_threads = 56,
    .max_gs_threads = 56,
-   .max_wm_threads = 64 * 2,
    .max_cs_threads = 6 * 6,
    .urb = {
       .size = 128,
@@ -427,18 +424,11 @@ static const struct gen_device_info gen_device_info_bxt_2x6 = {
  * There's no KBL entry. Using the default SKL (GEN9) GS entries value.
  */
 
-/*
- * Both SKL and KBL support a maximum of 64 threads per
- * Pixel Shader Dispatch (PSD) unit.
- */
-#define  KBL_MAX_THREADS_PER_PSD 64
-
 static const struct gen_device_info gen_device_info_kbl_gt1 = {
    GEN9_FEATURES,
    .gt = 1,
 
    .max_cs_threads = 7 * 6,
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 2,
    .urb.size = 192,
    .num_slices = 1,
 };
@@ -448,7 +438,6 @@ static const struct gen_device_info gen_device_info_kbl_gt1_5 = {
    .gt = 1,
 
    .max_cs_threads = 7 * 6,
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
    .num_slices = 1,
 };
 
@@ -456,7 +445,6 @@ static const struct gen_device_info gen_device_info_kbl_gt2 = {
    GEN9_FEATURES,
    .gt = 2,
 
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
    .num_slices = 1,
 };
 
@@ -464,7 +452,6 @@ static const struct gen_device_info gen_device_info_kbl_gt3 = {
    GEN9_FEATURES,
    .gt = 3,
 
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 6,
    .num_slices = 2,
 };
 
@@ -472,7 +459,6 @@ static const struct gen_device_info gen_device_info_kbl_gt4 = {
    GEN9_FEATURES,
    .gt = 4,
 
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 9,
    /*
     * From the "L3 Allocation and Programming" documentation:
     *
@@ -500,6 +486,25 @@ gen_get_device_info(int devid, struct gen_device_info *devinfo)
       return false;
    }
 
+   /* From the Skylake PRM, 3DSTATE_PS::Scratch Space Base Pointer:
+    *
+    * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
+    *  allocate scratch space enough so that each slice has 4 slices allowed."
+    *
+    * The equivalent internal documentation says that this programming note
+    * applies to all Gen9+ platforms.
+    *
+    * The hardware typically calculates the scratch space pointer by taking
+    * the base address, and adding per-thread-scratch-space * thread ID.
+    * Extra padding can be necessary depending how the thread IDs are
+    * calculated for a particular shader stage.
+    */
+   if (devinfo->gen >= 9) {
+      devinfo->max_wm_threads = 64 /* threads-per-PSD */
+                              * devinfo->num_slices
+                              * 4; /* effective subslices per slice */
+   }
+
    return true;
 }
author	Kenneth Graunke <[email protected]>	2016-11-07 17:12:54 -0800
committer	Kenneth Graunke <[email protected]>	2016-11-09 15:30:59 -0800
commit	aaee3daa90578fb711cc89186a65bc3d2c68022f (patch)
tree	698e68fa8c4ecddb185f20525b27dc51fb006749 /src/intel
parent	1d6fe13c138efb836a28052b16260a258d113827 (diff)