5 files changed, 67 insertions, 18 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 144d3e327d4..a7b612ad545 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -541,6 +541,17 @@ struct brw_vue_map {
    GLbitfield64 slots_valid;
 
    /**
+    * Is this VUE map for a separate shader pipeline?
+    *
+    * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
+    * without the linker having a chance to dead code eliminate unused varyings.
+    *
+    * This means that we have to use a fixed slot layout, based on the output's
+    * location field, rather than assigning slots in a compact contiguous block.
+    */
+   bool separate;
+
+   /**
     * Map from gl_varying_slot value to VUE slot.  For gl_varying_slots that are
     * not stored in a slot (because they are not written, or because
     * additional processing is applied before storing them in the VUE), the
@@ -585,7 +596,8 @@ static inline GLuint brw_varying_to_offset(struct brw_vue_map *vue_map,
 
 void brw_compute_vue_map(const struct brw_device_info *devinfo,
                          struct brw_vue_map *vue_map,
-                         GLbitfield64 slots_valid);
+                         GLbitfield64 slots_valid,
+                         bool separate_shader);
 
 
 /**
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index a8f5520fb94..49dc7f65b48 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1440,7 +1440,8 @@ fs_visitor::calculate_urb_setup()
           */
          struct brw_vue_map prev_stage_vue_map;
          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
-                             key->input_slots_valid);
+                             key->input_slots_valid,
+                             shader_prog->SeparateShader);
          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 16ea6846285..38b3e3a5cd9 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -120,7 +120,8 @@ brw_codegen_gs_prog(struct brw_context *brw,
    GLbitfield64 outputs_written = gp->program.Base.OutputsWritten;
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &c.prog_data.base.vue_map, outputs_written);
+                       &c.prog_data.base.vue_map, outputs_written,
+                       prog ? prog->SeparateShader : false);
 
    /* Compute the output vertex size.
     *
@@ -243,7 +244,8 @@ brw_codegen_gs_prog(struct brw_context *brw,
       get_hw_prim_for_gl_prim(gp->program.OutputType);
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &c.input_vue_map, c.key.input_varyings);
+                       &c.input_vue_map, c.key.input_varyings,
+                       prog->SeparateShader);
 
    /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
     * need to program a URB read length of ceiling(num_slots / 2).
@@ -357,7 +359,9 @@ brw_upload_gs_prog(struct brw_context *brw)
    brw->gs.base.prog_data = &brw->gs.prog_data->base.base;
 
    if (brw->gs.prog_data->base.vue_map.slots_valid !=
-       brw->vue_map_geom_out.slots_valid) {
+       brw->vue_map_geom_out.slots_valid ||
+       brw->gs.prog_data->base.vue_map.separate !=
+       brw->vue_map_geom_out.separate) {
       brw->vue_map_geom_out = brw->gs.prog_data->base.vue_map;
       brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 465e78f4c74..b1ec9637c32 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -180,7 +180,8 @@ brw_codegen_vs_prog(struct brw_context *brw,
    }
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &prog_data.base.vue_map, outputs_written);
+                       &prog_data.base.vue_map, outputs_written,
+                       prog ? prog->SeparateShader : false);
 
    if (0) {
       _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG,
@@ -388,7 +389,9 @@ brw_upload_vs_prog(struct brw_context *brw)
    brw->vs.base.prog_data = &brw->vs.prog_data->base.base;
 
    if (brw->vs.prog_data->base.vue_map.slots_valid !=
-       brw->vue_map_geom_out.slots_valid) {
+       brw->vue_map_geom_out.slots_valid ||
+       brw->vs.prog_data->base.vue_map.separate !=
+       brw->vue_map_geom_out.separate) {
       brw->vue_map_vs = brw->vs.prog_data->base.vue_map;
       brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_VS;
       if (brw->gen < 6) {
diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c
index 1ef52143cc5..45662bd5afc 100644
--- a/src/mesa/drivers/dri/i965/brw_vue_map.c
+++ b/src/mesa/drivers/dri/i965/brw_vue_map.c
@@ -59,10 +59,18 @@ assign_vue_slot(struct brw_vue_map *vue_map, int varying, int slot)
 void
 brw_compute_vue_map(const struct brw_device_info *devinfo,
                     struct brw_vue_map *vue_map,
-                    GLbitfield64 slots_valid)
+                    GLbitfield64 slots_valid,
+                    bool separate)
 {
+   /* Keep using the packed/contiguous layout on old hardware - we only need
+    * the SSO layout when using geometry/tessellation shaders or 32 FS input
+    * varyings, which only exist on Gen >= 6.  It's also a bit more efficient.
+    */
+   if (devinfo->gen < 6)
+      separate = false;
+
    vue_map->slots_valid = slots_valid;
-   int i;
+   vue_map->separate = separate;
 
    /* gl_Layer and gl_ViewportIndex don't get their own varying slots -- they
     * are stored in the first VUE slot (VARYING_SLOT_PSIZ).
@@ -77,7 +85,7 @@ brw_compute_vue_map(const struct brw_device_info *devinfo,
     */
    STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 127);
 
-   for (i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) {
+   for (int i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) {
       vue_map->varying_to_slot[i] = -1;
       vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_PAD;
    }
@@ -131,21 +139,42 @@ brw_compute_vue_map(const struct brw_device_info *devinfo,
          assign_vue_slot(vue_map, VARYING_SLOT_BFC1, slot++);
    }
 
-   /* The hardware doesn't care about the rest of the vertex outputs, so just
-    * assign them contiguously.  Don't reassign outputs that already have a
-    * slot.
+   /* The hardware doesn't care about the rest of the vertex outputs, so we
+    * can assign them however we like.  For normal programs, we simply assign
+    * them contiguously.
+    *
+    * For separate shader pipelines, we first assign built-in varyings
+    * contiguous slots.  This works because ARB_separate_shader_objects
+    * requires that all shaders have matching built-in varying interface
+    * blocks.  Next, we assign generic varyings based on their location
+    * (either explicit or linker assigned).  This guarantees a fixed layout.
     *
     * We generally don't need to assign a slot for VARYING_SLOT_CLIP_VERTEX,
     * since it's encoded as the clip distances by emit_clip_distances().
     * However, it may be output by transform feedback, and we'd rather not
     * recompute state when TF changes, so we just always include it.
     */
-   for (int i = 0; i < VARYING_SLOT_MAX; ++i) {
-      if ((slots_valid & BITFIELD64_BIT(i)) &&
-          vue_map->varying_to_slot[i] == -1) {
-         assign_vue_slot(vue_map, i, slot++);
+   GLbitfield64 builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0);
+   while (builtins != 0) {
+      const int varying = ffsll(builtins) - 1;
+      if (vue_map->varying_to_slot[varying] == -1) {
+         assign_vue_slot(vue_map, varying, slot++);
+      }
+      builtins &= ~BITFIELD64_BIT(varying);
+   }
+
+   const int first_generic_slot = slot;
+   GLbitfield64 generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0);
+   while (generics != 0) {
+      const int varying = ffsll(generics) - 1;
+      if (separate) {
+         slot = first_generic_slot + varying - VARYING_SLOT_VAR0;
+         assign_vue_slot(vue_map, varying, slot);
+      } else {
+         assign_vue_slot(vue_map, varying, slot++);
       }
+      generics &= ~BITFIELD64_BIT(varying);
    }
 
-   vue_map->num_slots = slot;
+   vue_map->num_slots = separate ? slot + 1 : slot;
 }