mesa: Fix pixel shader scratch space allocation on Gen9+ platforms.

We had missed a bit of errata - PS scratch needs to be computed as if
there were 4 subslices per slice, rather than 3.

                         Skylake                 Broxton     Kabylake
                         GT1   GT2   GT3   GT4   2x6   3x6   GT1 GT1.5   GT2   GT3   GT4
Actual Slices              1     1     2     3     1     1     1     1     1     2     3
Total Subslices            3     3     6     9     2     3     2     3     3     6     9
Subsl. for PS Scratch      4     4     8    12     4     4     4     4     4     8    12

Note that Skylake GT1-3 already worked because we allocated 64 * 9
(trying to use a value that would work on GT4, with 9 subslices), and
the actual required values were 64 * 4 or 64 * 8. However, all others
(Skylake GT4, Broxton, and Kabylake GT1-4) underallocated, which can
lead to scratch writes trashing random process memory, and rendering
corruption or GPU hangs.

Fixes GPU hangs and rendering corruption on Skylake GT4 in shaders that
spill. Particularly, dEQP-GLES31.functional.ubo.all_per_block_buffers.*
now runs successfully with no hangs and renders correctly.

This may fix problems on Broxton and Kabylake as well.

Cc: "13.0" <[email protected]>
Signed-off-by: Kenneth Graunke <[email protected]>
Reviewed-by: Ben Widawsky <[email protected]>
(cherry picked from commit aaee3daa90578fb711cc89186a65bc3d2c68022f)
author: Kenneth Graunke <[email protected]> 2016-11-07 17:12:54 -0800
committer: Emil Velikov <[email protected]> 2016-11-11 22:20:07 +0000
commit: e7de2510e5fc1a4ce602da2d41f1b7d7b9db8873 (patch)
tree: fd731e6dd2d599ce40fbcbb85d4c81e29f98c7a8
parent: 1a47251da48adee162be752b9a7ca2699f98a30b (diff)
1 files changed, 19 insertions, 14 deletions
diff --git a/src/intel/common/gen_device_info.c b/src/intel/common/gen_device_info.c
index 30df0b27dc1..1dc17690dc1 100644
--- a/src/intel/common/gen_device_info.c
+++ b/src/intel/common/gen_device_info.c
@@ -335,7 +335,6 @@ static const struct gen_device_info gen_device_info_chv = {
    .max_gs_threads = 336,                           \
    .max_tcs_threads = 336,                          \
    .max_tes_threads = 336,                          \
-   .max_wm_threads = 64 * 9,                        \
    .max_cs_threads = 56,                            \
    .urb = {                                         \
       .size = 384,                                  \
@@ -388,7 +387,6 @@ static const struct gen_device_info gen_device_info_bxt = {
    .max_tcs_threads = 112,
    .max_tes_threads = 112,
    .max_gs_threads = 112,
-   .max_wm_threads = 64 * 3,
    .max_cs_threads = 6 * 6,
    .urb = {
       .size = 192,
@@ -411,7 +409,6 @@ static const struct gen_device_info gen_device_info_bxt_2x6 = {
    .max_tcs_threads = 56, /* XXX: guess */
    .max_tes_threads = 56,
    .max_gs_threads = 56,
-   .max_wm_threads = 64 * 2,
    .max_cs_threads = 6 * 6,
    .urb = {
       .size = 128,
@@ -427,18 +424,11 @@ static const struct gen_device_info gen_device_info_bxt_2x6 = {
  * There's no KBL entry. Using the default SKL (GEN9) GS entries value.
  */
 
-/*
- * Both SKL and KBL support a maximum of 64 threads per
- * Pixel Shader Dispatch (PSD) unit.
- */
-#define  KBL_MAX_THREADS_PER_PSD 64
-
 static const struct gen_device_info gen_device_info_kbl_gt1 = {
    GEN9_FEATURES,
    .gt = 1,
 
    .max_cs_threads = 7 * 6,
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 2,
    .urb.size = 192,
    .num_slices = 1,
 };
@@ -448,7 +438,6 @@ static const struct gen_device_info gen_device_info_kbl_gt1_5 = {
    .gt = 1,
 
    .max_cs_threads = 7 * 6,
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
    .num_slices = 1,
 };
 
@@ -456,7 +445,6 @@ static const struct gen_device_info gen_device_info_kbl_gt2 = {
    GEN9_FEATURES,
    .gt = 2,
 
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
    .num_slices = 1,
 };
 
@@ -464,7 +452,6 @@ static const struct gen_device_info gen_device_info_kbl_gt3 = {
    GEN9_FEATURES,
    .gt = 3,
 
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 6,
    .num_slices = 2,
 };
 
@@ -472,7 +459,6 @@ static const struct gen_device_info gen_device_info_kbl_gt4 = {
    GEN9_FEATURES,
    .gt = 4,
 
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 9,
    /*
     * From the "L3 Allocation and Programming" documentation:
     *
@@ -500,6 +486,25 @@ gen_get_device_info(int devid, struct gen_device_info *devinfo)
       return false;
    }
 
+   /* From the Skylake PRM, 3DSTATE_PS::Scratch Space Base Pointer:
+    *
+    * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
+    *  allocate scratch space enough so that each slice has 4 slices allowed."
+    *
+    * The equivalent internal documentation says that this programming note
+    * applies to all Gen9+ platforms.
+    *
+    * The hardware typically calculates the scratch space pointer by taking
+    * the base address, and adding per-thread-scratch-space * thread ID.
+    * Extra padding can be necessary depending how the thread IDs are
+    * calculated for a particular shader stage.
+    */
+   if (devinfo->gen >= 9) {
+      devinfo->max_wm_threads = 64 /* threads-per-PSD */
+                              * devinfo->num_slices
+                              * 4; /* effective subslices per slice */
+   }
+
    return true;
 }
author	Kenneth Graunke <[email protected]>	2016-11-07 17:12:54 -0800
committer	Emil Velikov <[email protected]>	2016-11-11 22:20:07 +0000
commit	e7de2510e5fc1a4ce602da2d41f1b7d7b9db8873 (patch)
tree	fd731e6dd2d599ce40fbcbb85d4c81e29f98c7a8
parent	1a47251da48adee162be752b9a7ca2699f98a30b (diff)