-rw-r--r--   src/mesa/drivers/dri/i965/brw_context.h | 14
-rw-r--r--   src/mesa/drivers/dri/i965/brw_fs.cpp    |  3
-rw-r--r--   src/mesa/drivers/dri/i965/brw_gs.c      | 10
-rw-r--r--   src/mesa/drivers/dri/i965/brw_vs.c      |  7
-rw-r--r--   src/mesa/drivers/dri/i965/brw_vue_map.c | 51
5 files changed, 67 insertions(+), 18 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 144d3e327d4..a7b612ad545 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -541,6 +541,17 @@ struct brw_vue_map {
    GLbitfield64 slots_valid;
 
    /**
+    * Is this VUE map for a separate shader pipeline?
+    *
+    * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
+    * without the linker having a chance to dead code eliminate unused varyings.
+    *
+    * This means that we have to use a fixed slot layout, based on the output's
+    * location field, rather than assigning slots in a compact contiguous block.
+    */
+   bool separate;
+
+   /**
     * Map from gl_varying_slot value to VUE slot. For gl_varying_slots that are
     * not stored in a slot (because they are not written, or because
     * additional processing is applied before storing them in the VUE), the
@@ -585,7 +596,8 @@ static inline GLuint brw_varying_to_offset(struct brw_vue_map *vue_map,
 
 void brw_compute_vue_map(const struct brw_device_info *devinfo,
                          struct brw_vue_map *vue_map,
-                         GLbitfield64 slots_valid);
+                         GLbitfield64 slots_valid,
+                         bool separate_shader);
 
 
 /**
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index a8f5520fb94..49dc7f65b48 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1440,7 +1440,8 @@ fs_visitor::calculate_urb_setup()
        */
       struct brw_vue_map prev_stage_vue_map;
       brw_compute_vue_map(devinfo, &prev_stage_vue_map,
-                          key->input_slots_valid);
+                          key->input_slots_valid,
+                          shader_prog->SeparateShader);
       int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
       assert(prev_stage_vue_map.num_slots <= first_slot + 32);
       for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 16ea6846285..38b3e3a5cd9 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -120,7 +120,8 @@ brw_codegen_gs_prog(struct brw_context *brw,
    GLbitfield64 outputs_written = gp->program.Base.OutputsWritten;
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &c.prog_data.base.vue_map, outputs_written);
+                       &c.prog_data.base.vue_map, outputs_written,
+                       prog ? prog->SeparateShader : false);
 
    /* Compute the output vertex size.
     *
@@ -243,7 +244,8 @@ brw_codegen_gs_prog(struct brw_context *brw,
       get_hw_prim_for_gl_prim(gp->program.OutputType);
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &c.input_vue_map, c.key.input_varyings);
+                       &c.input_vue_map, c.key.input_varyings,
+                       prog->SeparateShader);
 
    /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
     * need to program a URB read length of ceiling(num_slots / 2).
@@ -357,7 +359,9 @@ brw_upload_gs_prog(struct brw_context *brw)
    brw->gs.base.prog_data = &brw->gs.prog_data->base.base;
 
    if (brw->gs.prog_data->base.vue_map.slots_valid !=
-       brw->vue_map_geom_out.slots_valid) {
+       brw->vue_map_geom_out.slots_valid ||
+       brw->gs.prog_data->base.vue_map.separate !=
+       brw->vue_map_geom_out.separate) {
       brw->vue_map_geom_out = brw->gs.prog_data->base.vue_map;
       brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 465e78f4c74..b1ec9637c32 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -180,7 +180,8 @@ brw_codegen_vs_prog(struct brw_context *brw,
    }
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
-                       &prog_data.base.vue_map, outputs_written);
+                       &prog_data.base.vue_map, outputs_written,
+                       prog ? prog->SeparateShader : false);
 
    if (0) {
       _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG,
@@ -388,7 +389,9 @@ brw_upload_vs_prog(struct brw_context *brw)
    brw->vs.base.prog_data = &brw->vs.prog_data->base.base;
 
    if (brw->vs.prog_data->base.vue_map.slots_valid !=
-       brw->vue_map_geom_out.slots_valid) {
+       brw->vue_map_geom_out.slots_valid ||
+       brw->vs.prog_data->base.vue_map.separate !=
+       brw->vue_map_geom_out.separate) {
       brw->vue_map_vs = brw->vs.prog_data->base.vue_map;
       brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_VS;
       if (brw->gen < 6) {
diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c
index 1ef52143cc5..45662bd5afc 100644
--- a/src/mesa/drivers/dri/i965/brw_vue_map.c
+++ b/src/mesa/drivers/dri/i965/brw_vue_map.c
@@ -59,10 +59,18 @@ assign_vue_slot(struct brw_vue_map *vue_map, int varying, int slot)
 void
 brw_compute_vue_map(const struct brw_device_info *devinfo,
                     struct brw_vue_map *vue_map,
-                    GLbitfield64 slots_valid)
+                    GLbitfield64 slots_valid,
+                    bool separate)
 {
+   /* Keep using the packed/contiguous layout on old hardware - we only need
+    * the SSO layout when using geometry/tessellation shaders or 32 FS input
+    * varyings, which only exist on Gen >= 6. It's also a bit more efficient.
+    */
+   if (devinfo->gen < 6)
+      separate = false;
+
    vue_map->slots_valid = slots_valid;
-   int i;
+   vue_map->separate = separate;
 
    /* gl_Layer and gl_ViewportIndex don't get their own varying slots -- they
    * are stored in the first VUE slot (VARYING_SLOT_PSIZ).
@@ -77,7 +85,7 @@ brw_compute_vue_map(const struct brw_device_info *devinfo,
    */
   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 127);
 
-   for (i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) {
+   for (int i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) {
      vue_map->varying_to_slot[i] = -1;
      vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_PAD;
   }
@@ -131,21 +139,42 @@ brw_compute_vue_map(const struct brw_device_info *devinfo,
       assign_vue_slot(vue_map, VARYING_SLOT_BFC1, slot++);
    }
 
-   /* The hardware doesn't care about the rest of the vertex outputs, so just
-    * assign them contiguously.  Don't reassign outputs that already have a
-    * slot.
+   /* The hardware doesn't care about the rest of the vertex outputs, so we
+    * can assign them however we like.  For normal programs, we simply assign
+    * them contiguously.
+    *
+    * For separate shader pipelines, we first assign built-in varyings
+    * contiguous slots.  This works because ARB_separate_shader_objects
+    * requires that all shaders have matching built-in varying interface
+    * blocks.  Next, we assign generic varyings based on their location
+    * (either explicit or linker assigned).  This guarantees a fixed layout.
     *
     * We generally don't need to assign a slot for VARYING_SLOT_CLIP_VERTEX,
     * since it's encoded as the clip distances by emit_clip_distances().
     * However, it may be output by transform feedback, and we'd rather not
     * recompute state when TF changes, so we just always include it.
     */
-   for (int i = 0; i < VARYING_SLOT_MAX; ++i) {
-      if ((slots_valid & BITFIELD64_BIT(i)) &&
-          vue_map->varying_to_slot[i] == -1) {
-         assign_vue_slot(vue_map, i, slot++);
+   GLbitfield64 builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0);
+   while (builtins != 0) {
+      const int varying = ffsll(builtins) - 1;
+      if (vue_map->varying_to_slot[varying] == -1) {
+         assign_vue_slot(vue_map, varying, slot++);
+      }
+      builtins &= ~BITFIELD64_BIT(varying);
+   }
+
+   const int first_generic_slot = slot;
+   GLbitfield64 generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0);
+   while (generics != 0) {
+      const int varying = ffsll(generics) - 1;
+      if (separate) {
+         slot = first_generic_slot + varying - VARYING_SLOT_VAR0;
+         assign_vue_slot(vue_map, varying, slot);
+      } else {
+         assign_vue_slot(vue_map, varying, slot++);
       }
+      generics &= ~BITFIELD64_BIT(varying);
    }
 
-   vue_map->num_slots = slot;
+   vue_map->num_slots = separate ? slot + 1 : slot;
 }
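The effect of the new layout is easiest to see outside the driver. Below is a minimal standalone sketch of the two generic-varying assignment strategies implemented in brw_compute_vue_map() above; it is illustrative only, with VAR0, BIT64(), and first_bit() as simplified stand-ins for Mesa's VARYING_SLOT_VAR0, BITFIELD64_BIT(), and ffsll() usage, and a hypothetical first_generic_slot of 4:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VAR0 31                    /* stand-in for VARYING_SLOT_VAR0 */
#define BIT64(b) (1ULL << (b))

/* Portable stand-in for ffsll(mask) - 1; mask must be nonzero. */
static int first_bit(uint64_t mask)
{
   int i = 0;
   while (!(mask & 1)) {
      mask >>= 1;
      i++;
   }
   return i;
}

static void assign_generics(uint64_t generics, int first_generic_slot,
                            bool separate)
{
   int slot = first_generic_slot;
   while (generics != 0) {
      const int varying = first_bit(generics);
      if (separate) {
         /* Fixed layout: the slot depends only on the varying's location,
          * so every program in a mix-and-match pipeline agrees on it.
          * Unwritten locations simply leave padding slots behind. */
         slot = first_generic_slot + varying - VAR0;
         printf("   VAR%d -> slot %d\n", varying - VAR0, slot);
      } else {
         /* Packed layout: each written varying takes the next free slot. */
         printf("   VAR%d -> slot %d\n", varying - VAR0, slot++);
      }
      generics &= ~BIT64(varying);
   }
}

int main(void)
{
   /* A shader that writes only VAR0 and VAR3. */
   const uint64_t written = BIT64(VAR0) | BIT64(VAR0 + 3);

   printf("packed (linked program):\n");
   assign_generics(written, 4, false);   /* VAR0 -> 4, VAR3 -> 5 */

   printf("separate (SSO pipeline):\n");
   assign_generics(written, 4, true);    /* VAR0 -> 4, VAR3 -> 7 */
   return 0;
}

This also shows why the patch ends with vue_map->num_slots = separate ? slot + 1 : slot: on the packed path, slot is already one past the last assigned slot, while on the separate path it is left pointing at the last slot actually assigned.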