 src/mesa/drivers/dri/i965/brw_context.h | 14
 src/mesa/drivers/dri/i965/brw_fs.cpp    |  3
 src/mesa/drivers/dri/i965/brw_gs.c      | 10
 src/mesa/drivers/dri/i965/brw_vs.c      |  7
 src/mesa/drivers/dri/i965/brw_vue_map.c | 51
 5 files changed, 67 insertions(+), 18 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 144d3e327d4..a7b612ad545 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -541,6 +541,17 @@ struct brw_vue_map {
    GLbitfield64 slots_valid;
 
    /**
+    * Is this VUE map for a separate shader pipeline?
+    *
+    * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
+    * without the linker having a chance to dead code eliminate unused varyings.
+    *
+    * This means that we have to use a fixed slot layout, based on the output's
+    * location field, rather than assigning slots in a compact contiguous block.
+    */
+   bool separate;
+
+   /**
     * Map from gl_varying_slot value to VUE slot.  For gl_varying_slots that are
     * not stored in a slot (because they are not written, or because
     * additional processing is applied before storing them in the VUE), the
@@ -585,7 +596,8 @@ static inline GLuint brw_varying_to_offset(struct brw_vue_map *vue_map,
 
 void brw_compute_vue_map(const struct brw_device_info *devinfo,
                          struct brw_vue_map *vue_map,
-                         GLbitfield64 slots_valid);
+                         GLbitfield64 slots_valid,
+                         bool separate_shader);
 
 /**
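
The layout policy behind the new `separate` flag can be illustrated outside the driver. With GL_ARB_separate_shader_objects, stages are compiled and linked independently, so a slot numbering that depends on which varyings a given program happens to write cannot match between a producer and a consumer built separately. The standalone C sketch below (hypothetical slot numbers, built-in slots ignored, helper names invented for illustration) shows how a compact layout gives VAR3 a different slot depending on the producer's other outputs, while a location-based layout does not:

#include <stdint.h>
#include <stdio.h>

#define VAR0 32                          /* first generic varying location */
#define BIT64(b) ((uint64_t)1 << (b))

/* Compact policy: a varying's slot depends on how many lower-numbered
 * varyings the same program writes. */
static int compact_slot(uint64_t slots_valid, int varying)
{
   int slot = 0;
   for (int i = 0; i < varying; i++)
      if (slots_valid & BIT64(i))
         slot++;
   return slot;
}

/* Fixed policy: a generic varying's slot is derived from its location
 * alone, so every program computes the same answer. */
static int fixed_slot(int first_generic_slot, int varying)
{
   return first_generic_slot + varying - VAR0;
}

int main(void)
{
   uint64_t prog_a = BIT64(VAR0) | BIT64(VAR0 + 3);  /* writes VAR0, VAR3 */
   uint64_t prog_b = BIT64(VAR0 + 3);                /* writes only VAR3 */

   /* Compact: VAR3 lands in slot 1 for one program and slot 0 for the
    * other -- a separately compiled consumer can't know where to look. */
   printf("compact: %d vs %d\n",
          compact_slot(prog_a, VAR0 + 3), compact_slot(prog_b, VAR0 + 3));

   /* Fixed: both programs agree on VAR3's slot. */
   printf("fixed: %d vs %d\n",
          fixed_slot(0, VAR0 + 3), fixed_slot(0, VAR0 + 3));
   return 0;
}

The actual fix is the location-based assignment in brw_vue_map.c below, gated on the VUE map's new separate flag.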
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index a8f5520fb94..49dc7f65b48 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1440,7 +1440,8 @@ fs_visitor::calculate_urb_setup()
        */
       struct brw_vue_map prev_stage_vue_map;
       brw_compute_vue_map(devinfo, &prev_stage_vue_map,
-                          key->input_slots_valid);
+                          key->input_slots_valid,
+                          shader_prog->SeparateShader);
       int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
       assert(prev_stage_vue_map.num_slots <= first_slot + 32);
       for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 16ea6846285..38b3e3a5cd9 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -120,7 +120,8 @@ brw_codegen_gs_prog(struct brw_context *brw,
    GLbitfield64 outputs_written = gp->program.Base.OutputsWritten;
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &c.prog_data.base.vue_map, outputs_written);
+                       &c.prog_data.base.vue_map, outputs_written,
+                       prog ? prog->SeparateShader : false);
 
    /* Compute the output vertex size.
     *
@@ -243,7 +244,8 @@ brw_codegen_gs_prog(struct brw_context *brw,
       get_hw_prim_for_gl_prim(gp->program.OutputType);
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &c.input_vue_map, c.key.input_varyings);
+                       &c.input_vue_map, c.key.input_varyings,
+                       prog->SeparateShader);
 
    /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
    * need to program a URB read length of ceiling(num_slots / 2).
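
As a quick check of the ceiling(num_slots / 2) arithmetic in that comment: the hardware reads two vec4 slots per 256-bit access, so an odd slot count still pays for a full read. A minimal sketch (the helper name here is invented; the driver uses its own rounding macros):

#include <stdio.h>

/* ceil(num_slots / 2): two vec4 VUE slots per 256-bit URB read. */
static unsigned gs_urb_read_length(unsigned num_slots)
{
   return (num_slots + 1) / 2;
}

int main(void)
{
   for (unsigned n = 1; n <= 6; n++)
      printf("num_slots=%u -> read length %u\n", n, gs_urb_read_length(n));
   return 0;
}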
@@ -357,7 +359,9 @@ brw_upload_gs_prog(struct brw_context *brw)
       brw->gs.base.prog_data = &brw->gs.prog_data->base.base;
 
       if (brw->gs.prog_data->base.vue_map.slots_valid !=
-          brw->vue_map_geom_out.slots_valid) {
+          brw->vue_map_geom_out.slots_valid ||
+          brw->gs.prog_data->base.vue_map.separate !=
+          brw->vue_map_geom_out.separate) {
          brw->vue_map_geom_out = brw->gs.prog_data->base.vue_map;
          brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 465e78f4c74..b1ec9637c32 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -180,7 +180,8 @@ brw_codegen_vs_prog(struct brw_context *brw,
    }
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &prog_data.base.vue_map, outputs_written);
+                       &prog_data.base.vue_map, outputs_written,
+                       prog ? prog->SeparateShader : false);
 
    if (0) {
       _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG,
@@ -388,7 +389,9 @@ brw_upload_vs_prog(struct brw_context *brw)
       brw->vs.base.prog_data = &brw->vs.prog_data->base.base;
 
       if (brw->vs.prog_data->base.vue_map.slots_valid !=
-          brw->vue_map_geom_out.slots_valid) {
+          brw->vue_map_geom_out.slots_valid ||
+          brw->vs.prog_data->base.vue_map.separate !=
+          brw->vue_map_geom_out.separate) {
          brw->vue_map_vs = brw->vs.prog_data->base.vue_map;
          brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_VS;
          if (brw->gen < 6) {
diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c
index 1ef52143cc5..45662bd5afc 100644
--- a/src/mesa/drivers/dri/i965/brw_vue_map.c
+++ b/src/mesa/drivers/dri/i965/brw_vue_map.c
@@ -59,10 +59,18 @@ assign_vue_slot(struct brw_vue_map *vue_map, int varying, int slot)
 void
 brw_compute_vue_map(const struct brw_device_info *devinfo,
                     struct brw_vue_map *vue_map,
-                    GLbitfield64 slots_valid)
+                    GLbitfield64 slots_valid,
+                    bool separate)
 {
+   /* Keep using the packed/contiguous layout on old hardware - we only need
+    * the SSO layout when using geometry/tessellation shaders or 32 FS input
+    * varyings, which only exist on Gen >= 6.  It's also a bit more efficient.
+    */
+   if (devinfo->gen < 6)
+      separate = false;
+
    vue_map->slots_valid = slots_valid;
-   int i;
+   vue_map->separate = separate;
 
    /* gl_Layer and gl_ViewportIndex don't get their own varying slots -- they
     * are stored in the first VUE slot (VARYING_SLOT_PSIZ).
@@ -77,7 +85,7 @@ brw_compute_vue_map(const struct brw_device_info *devinfo,
     */
    STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 127);
 
-   for (i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) {
+   for (int i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) {
       vue_map->varying_to_slot[i] = -1;
       vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_PAD;
    }
@@ -131,21 +139,42 @@ brw_compute_vue_map(const struct brw_device_info *devinfo,
       assign_vue_slot(vue_map, VARYING_SLOT_BFC1, slot++);
    }
 
-   /* The hardware doesn't care about the rest of the vertex outputs, so just
-    * assign them contiguously.  Don't reassign outputs that already have a
-    * slot.
+   /* The hardware doesn't care about the rest of the vertex outputs, so we
+    * can assign them however we like.  For normal programs, we simply assign
+    * them contiguously.
+    *
+    * For separate shader pipelines, we first assign built-in varyings
+    * contiguous slots.  This works because ARB_separate_shader_objects
+    * requires that all shaders have matching built-in varying interface
+    * blocks.  Next, we assign generic varyings based on their location
+    * (either explicit or linker assigned).  This guarantees a fixed layout.
     *
     * We generally don't need to assign a slot for VARYING_SLOT_CLIP_VERTEX,
    * since it's encoded as the clip distances by emit_clip_distances().
     * However, it may be output by transform feedback, and we'd rather not
     * recompute state when TF changes, so we just always include it.
     */
-   for (int i = 0; i < VARYING_SLOT_MAX; ++i) {
-      if ((slots_valid & BITFIELD64_BIT(i)) &&
-          vue_map->varying_to_slot[i] == -1) {
-         assign_vue_slot(vue_map, i, slot++);
+   GLbitfield64 builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0);
+   while (builtins != 0) {
+      const int varying = ffsll(builtins) - 1;
+      if (vue_map->varying_to_slot[varying] == -1) {
+         assign_vue_slot(vue_map, varying, slot++);
+      }
+      builtins &= ~BITFIELD64_BIT(varying);
+   }
+
+   const int first_generic_slot = slot;
+   GLbitfield64 generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0);
+   while (generics != 0) {
+      const int varying = ffsll(generics) - 1;
+      if (separate) {
+         slot = first_generic_slot + varying - VARYING_SLOT_VAR0;
+         assign_vue_slot(vue_map, varying, slot);
+      } else {
+         assign_vue_slot(vue_map, varying, slot++);
       }
+      generics &= ~BITFIELD64_BIT(varying);
    }
 
-   vue_map->num_slots = slot;
+   vue_map->num_slots = separate ? slot + 1 : slot;
 }
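
Taken together, the new brw_vue_map.c logic can be sketched as a standalone program. In the sketch below, BIT64/MASK64 stand in for Mesa's BITFIELD64_BIT/BITFIELD64_MASK, __builtin_ctzll(x) for ffsll(x) - 1, printf for assign_vue_slot(), and the already-assigned check from the built-in pass is dropped for brevity. With POS, VAR0, and VAR3 written in separate mode, VAR3 is pinned to slot 4, leaving slots 2-3 as padding:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VARYING_SLOT_VAR0 32                /* first generic varying */
#define BIT64(b)  ((uint64_t)1 << (b))
#define MASK64(b) (BIT64(b) - 1)            /* bits [0, b) set */

int main(void)
{
   /* Example inputs: POS (bit 0) plus generics VAR0 and VAR3. */
   uint64_t slots_valid =
      BIT64(0) | BIT64(VARYING_SLOT_VAR0) | BIT64(VARYING_SLOT_VAR0 + 3);
   bool separate = true;
   int slot = 0;

   /* Pass 1: built-ins are packed contiguously in both modes. */
   uint64_t builtins = slots_valid & MASK64(VARYING_SLOT_VAR0);
   while (builtins != 0) {
      const int varying = __builtin_ctzll(builtins);  /* ffsll() - 1 */
      printf("varying %2d -> slot %d\n", varying, slot++);
      builtins &= ~BIT64(varying);
   }

   /* Pass 2: generics are packed when linked, but pinned to a
    * location-derived slot when separate. */
   const int first_generic_slot = slot;
   uint64_t generics = slots_valid & ~MASK64(VARYING_SLOT_VAR0);
   while (generics != 0) {
      const int varying = __builtin_ctzll(generics);
      if (separate) {
         slot = first_generic_slot + varying - VARYING_SLOT_VAR0;
         printf("varying %2d -> slot %d\n", varying, slot);
      } else {
         printf("varying %2d -> slot %d\n", varying, slot++);
      }
      generics &= ~BIT64(varying);
   }

   /* In separate mode, `slot` holds the last *assigned* slot rather than
    * one past it, which is why num_slots needs the slot + 1 form there. */
   printf("num_slots = %d\n", separate ? slot + 1 : slot);
   return 0;
}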