From 1c9c42d16b4c8ab896537c32e3b2df237be69323 Mon Sep 17 00:00:00 2001
From: Timothy Arceri
Date: Wed, 18 Oct 2017 19:40:06 +1100
Subject: nir: add varying component packing helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: update shader info input/output masks when packing components
v3: make sure interpolation loc matches, this is required for the
    radeonsi NIR backend.
v4: 33dca36f4f28 fixed nir_gather_info to update outputs_read
    correctly, make sure we also adjust this correctly when packing
    components.

Reviewed-by: Bas Nieuwenhuizen (v1)
Reviewed-by: Nicolai Hähnle (v3)
---
 src/compiler/nir/nir.h                 |   2 +
 src/compiler/nir/nir_linking_helpers.c | 372 +++++++++++++++++++++++++++++++++
 2 files changed, 374 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 4c5d976a60d..83858afe148 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2459,6 +2459,8 @@ void nir_assign_var_locations(struct exec_list *var_list, unsigned *size,
 
 /* Some helpers to do very simple linking */
 bool nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer);
+void nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
+                          bool default_to_smooth_interp);
 
 typedef enum {
    /* If set, this forces all non-flat fragment shader inputs to be
diff --git a/src/compiler/nir/nir_linking_helpers.c b/src/compiler/nir/nir_linking_helpers.c
index 4d709c1b3c5..9f0122d4519 100644
--- a/src/compiler/nir/nir_linking_helpers.c
+++ b/src/compiler/nir/nir_linking_helpers.c
@@ -173,3 +173,375 @@ nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer)
 
    return progress;
 }
+
+/* Return the interpolation mode to use when deciding whether two varyings
+ * may share a slot. An explicit qualifier on the variable always wins;
+ * without one the result is INTERP_MODE_SMOOTH if default_to_smooth_interp
+ * is set and INTERP_MODE_NONE otherwise.
+ */
+static uint8_t
+get_interp_type(nir_variable *var, bool default_to_smooth_interp)
+{
+   if (var->data.interpolation != INTERP_MODE_NONE)
+      return var->data.interpolation;
+   else if (default_to_smooth_interp)
+      return INTERP_MODE_SMOOTH;
+   else
+      return INTERP_MODE_NONE;
+}
+
+#define INTERPOLATE_LOC_SAMPLE 0
+#define INTERPOLATE_LOC_CENTROID 1
+#define INTERPOLATE_LOC_CENTER 2
+
+/* Return the interpolation location of a varying as one of the
+ * INTERPOLATE_LOC_* values. The sample flag takes precedence over centroid.
+ */
+static uint8_t
+get_interp_loc(nir_variable *var)
+{
+   if (var->data.sample)
+      return INTERPOLATE_LOC_SAMPLE;
+   else if (var->data.centroid)
+      return INTERPOLATE_LOC_CENTROID;
+   else
+      return INTERPOLATE_LOC_CENTER;
+}
+
+/* Walk var_list and record, for every generic varying slot (VAR0..VAR31),
+ * which components are occupied and the interpolation type and location of
+ * the variable occupying it. The three output arrays are indexed by
+ * (slot - VARYING_SLOT_VAR0) and must each hold 32 entries. comps is OR'ed
+ * into, so the function can be called once per stage to build a combined
+ * mask; interp_type and interp_loc are overwritten by the last variable
+ * seen for a slot.
+ */
+static void
+get_slot_component_masks_and_interp_types(struct exec_list *var_list,
+                                          uint8_t *comps,
+                                          uint8_t *interp_type,
+                                          uint8_t *interp_loc,
+                                          gl_shader_stage stage,
+                                          bool default_to_smooth_interp)
+{
+   nir_foreach_variable(var, var_list) {
+      assert(var->data.location >= 0);
+
+      /* Only remap things that aren't built-ins.
+       * TODO: add TES patch support.
+       */
+      if (var->data.location >= VARYING_SLOT_VAR0 &&
+          var->data.location - VARYING_SLOT_VAR0 < 32) {
+
+         const struct glsl_type *type = var->type;
+         if (nir_is_per_vertex_io(var, stage)) {
+            assert(glsl_type_is_array(type));
+            type = glsl_get_array_element(type);
+         }
+
+         unsigned location = var->data.location - VARYING_SLOT_VAR0;
+         unsigned elements =
+            glsl_get_vector_elements(glsl_without_array(type));
+
+         bool dual_slot = glsl_type_is_dual_slot(glsl_without_array(type));
+         unsigned slots = glsl_count_attribute_slots(type, false);
+         unsigned comps_slot2 = 0;
+         for (unsigned i = 0; i < slots; i++) {
+            /* A multi-slot variable must not run past the 32 tracked slots */
+            assert(location + i < 32);
+
+            interp_type[location + i] =
+               get_interp_type(var, default_to_smooth_interp);
+            interp_loc[location + i] = get_interp_loc(var);
+
+            if (dual_slot) {
+               if (i & 1) {
+                  comps[location + i] |= ((1 << comps_slot2) - 1);
+               } else {
+                  unsigned num_comps = 4 - var->data.location_frac;
+                  comps_slot2 = (elements * 2) - num_comps;
+
+                  /* Assume ARB_enhanced_layouts packing rules for doubles */
+                  assert(var->data.location_frac == 0 ||
+                         var->data.location_frac == 2);
+                  assert(comps_slot2 <= 4);
+
+                  comps[location + i] |=
+                     ((1 << num_comps) - 1) << var->data.location_frac;
+               }
+            } else {
+               comps[location + i] |=
+                  ((1 << elements) - 1) << var->data.location_frac;
+            }
+         }
+      }
+   }
+}
+
+/* New position of a remapped scalar varying. A location of 0 marks an entry
+ * with no remap; real entries hold a slot index of VARYING_SLOT_VAR0 or above.
+ */
+struct varying_loc
+{
+   uint8_t component;
+   uint32_t location;
+};
+
+/* Apply the remap table to every generic varying (VAR0..VAR31) in var_list
+ * and rebuild the slot bitmasks to match the new layout.
+ *
+ * remap is indexed by [slot - VARYING_SLOT_VAR0][component]; an entry whose
+ * location is 0 leaves the variable where it is. *slots_used and
+ * *out_slots_read are read as the masks for the old layout and overwritten
+ * with the masks for the new one. Built-in bits of *slots_used are carried
+ * over unchanged.
+ * NOTE(review): built-in bits of *out_slots_read are not carried over; confirm
+ * that callers do not depend on them.
+ */
+static void
+remap_slots_and_components(struct exec_list *var_list, gl_shader_stage stage,
+                           struct varying_loc (*remap)[4],
+                           uint64_t *slots_used, uint64_t *out_slots_read)
+{
+   uint64_t out_slots_read_tmp = 0;
+
+   /* We don't touch builtins so just copy the bitmask. Builtins occupy bits
+    * 0 .. VARYING_SLOT_VAR0 - 1, so the mask is (1 << VARYING_SLOT_VAR0) - 1.
+    */
+   uint64_t slots_used_tmp =
+      *slots_used & (((uint64_t)1 << VARYING_SLOT_VAR0) - 1);
+
+   nir_foreach_variable(var, var_list) {
+      assert(var->data.location >= 0);
+
+      /* Only remap things that aren't built-ins */
+      if (var->data.location >= VARYING_SLOT_VAR0 &&
+          var->data.location - VARYING_SLOT_VAR0 < 32) {
+         const struct glsl_type *type = var->type;
+         if (nir_is_per_vertex_io(var, stage)) {
+            assert(glsl_type_is_array(type));
+            type = glsl_get_array_element(type);
+         }
+
+         unsigned num_slots = glsl_count_attribute_slots(type, false);
+         unsigned location = var->data.location - VARYING_SLOT_VAR0;
+         struct varying_loc *new_loc = &remap[location][var->data.location_frac];
+
+         /* Sample the incoming masks at the variable's current location,
+          * before it is moved. This is done for variables that stay in place
+          * as well, otherwise their bits would be dropped from the rebuilt
+          * masks.
+          */
+         uint64_t slots = (((uint64_t)1 << num_slots) - 1) << var->data.location;
+         bool used_across_stages = (slots & *slots_used) != 0;
+         bool outputs_read = (slots & *out_slots_read) != 0;
+
+         if (new_loc->location) {
+            var->data.location = new_loc->location;
+            var->data.location_frac = new_loc->component;
+         }
+
+         if (var->data.always_active_io) {
+            /* We can't apply link time optimisations (specifically array
+             * splitting) to these so we need to copy the existing mask
+             * otherwise we will mess up the mask for things like partially
+             * marked arrays.
+             */
+            uint64_t mask =
+               (((uint64_t)1 << num_slots) - 1) << var->data.location;
+
+            if (used_across_stages)
+               slots_used_tmp |= *slots_used & mask;
+
+            if (outputs_read)
+               out_slots_read_tmp |= *out_slots_read & mask;
+         } else {
+            for (unsigned i = 0; i < num_slots; i++) {
+               if (used_across_stages)
+                  slots_used_tmp |= (uint64_t)1 << (var->data.location + i);
+
+               if (outputs_read)
+                  out_slots_read_tmp |= (uint64_t)1 << (var->data.location + i);
+            }
+         }
+      }
+   }
+
+   *slots_used = slots_used_tmp;
+   *out_slots_read = out_slots_read_tmp;
+}
+
+/* Pack scalar varyings into the free components of partially used slots
+ * that have the same interpolation type and location, always taking the
+ * lowest free component of the first suitable slot. Moves are chosen by
+ * walking the consumer's inputs, recorded in a remap table, and then applied
+ * to both the consumer's inputs and the producer's outputs. comps,
+ * interp_type and interp_loc are the per-slot tables built by
+ * get_slot_component_masks_and_interp_types(); comps is updated as
+ * components are moved.
+ */
+static void
+compact_components(nir_shader *producer, nir_shader *consumer, uint8_t *comps,
+                   uint8_t *interp_type, uint8_t *interp_loc,
+                   bool default_to_smooth_interp)
+{
+   struct exec_list *input_list = &consumer->inputs;
+   struct exec_list *output_list = &producer->outputs;
+   struct varying_loc remap[32][4] = {{{0}, {0}}};
+
+   /* Create a cursor for each interpolation type */
+   unsigned cursor[4] = {0};
+
+   /* We only need to pass over one stage and we choose the consumer as it seems
+    * to cause a larger reduction in instruction counts (tested on i965).
+    */
+   nir_foreach_variable(var, input_list) {
+
+      /* Only remap things that aren't builtins.
+       * TODO: add TES patch support.
+       */
+      if (var->data.location >= VARYING_SLOT_VAR0 &&
+          var->data.location - VARYING_SLOT_VAR0 < 32) {
+
+         /* We can't repack xfb varyings. */
+         if (var->data.always_active_io)
+            continue;
+
+         const struct glsl_type *type = var->type;
+         if (nir_is_per_vertex_io(var, consumer->info.stage)) {
+            assert(glsl_type_is_array(type));
+            type = glsl_get_array_element(type);
+         }
+
+         /* Skip types that require more complex packing handling.
+          * TODO: add support for these types.
+          */
+         if (glsl_type_is_array(type) ||
+             glsl_type_is_dual_slot(type) ||
+             glsl_type_is_matrix(type) ||
+             glsl_type_is_struct(type) ||
+             glsl_type_is_64bit(type))
+            continue;
+
+         /* We ignore complex types above and all other vector types should
+          * have been split into scalar variables by the lower_io_to_scalar
+          * pass. The only exception should be OpenGL xfb varyings.
+          */
+         if (glsl_get_vector_elements(type) != 1)
+            continue;
+
+         unsigned location = var->data.location - VARYING_SLOT_VAR0;
+         uint8_t used_comps = comps[location];
+
+         /* If there are no empty components there is nothing more for us to do.
+          */
+         if (used_comps == 0xf)
+            continue;
+
+         bool found_new_offset = false;
+         uint8_t interp = get_interp_type(var, default_to_smooth_interp);
+         assert(interp < sizeof(cursor) / sizeof(cursor[0]));
+         for (; cursor[interp] < 32; cursor[interp]++) {
+            uint8_t cursor_used_comps = comps[cursor[interp]];
+
+            /* We couldn't find anywhere to pack this varying, so move on. */
+            if (cursor[interp] == location &&
+                (var->data.location_frac == 0 ||
+                 cursor_used_comps & ((1 << (var->data.location_frac)) - 1)))
+               break;
+
+            /* We can only pack varyings with matching interpolation types */
+            if (interp_type[cursor[interp]] != interp)
+               continue;
+
+            /* Interpolation loc must match also.
+             * TODO: i965 can handle these if they don't match, but the
+             * radeonsi nir backend handles everything as vec4s and so expects
+             * this to be the same for all components. We could make this
+             * check driver specific or drop it if NIR ever becomes the only
+             * radeonsi backend.
+             */
+            if (interp_loc[cursor[interp]] != get_interp_loc(var))
+               continue;
+
+            /* If the slot is empty just skip it for now, compact_var_list()
+             * can be called after this function to remove empty slots for us.
+             * TODO: finishing compact_var_list() requires array and
+             * matrix splitting.
+             */
+            if (!cursor_used_comps)
+               continue;
+
+            uint8_t unused_comps = ~cursor_used_comps;
+
+            for (unsigned i = 0; i < 4; i++) {
+               uint8_t new_var_comps = 1 << i;
+               if (unused_comps & new_var_comps) {
+                  remap[location][var->data.location_frac].component = i;
+                  remap[location][var->data.location_frac].location =
+                     cursor[interp] + VARYING_SLOT_VAR0;
+
+                  found_new_offset = true;
+
+                  /* Turn off the mask for the component we are remapping */
+                  if (comps[location] & (1 << var->data.location_frac)) {
+                     comps[location] ^= 1 << var->data.location_frac;
+                     comps[cursor[interp]] |= new_var_comps;
+                  }
+                  break;
+               }
+            }
+
+            if (found_new_offset)
+               break;
+         }
+      }
+   }
+
+   uint64_t zero = 0;
+   remap_slots_and_components(input_list, consumer->info.stage, remap,
+                              &consumer->info.inputs_read, &zero);
+   remap_slots_and_components(output_list, producer->info.stage, remap,
+                              &producer->info.outputs_written,
+                              &producer->info.outputs_read);
+}
+
+/* Compact the generic varyings shared by a producer/consumer stage pair.
+ *
+ * We assume that this has been called more-or-less directly after
+ * remove_unused_varyings.  At this point, all of the varyings that we
+ * aren't going to be using have been completely removed and the
+ * inputs_read and outputs_written fields in nir_shader_info reflect
+ * this.  Therefore, the total set of valid slots is the OR of the two
+ * sets of varyings;  this accounts for varyings which one side may need
+ * to read/write even if the other doesn't.  This can happen if, for
+ * instance, an array is used indirectly from one side causing it to be
+ * unsplittable but directly from the other.
+ *
+ * default_to_smooth_interp selects whether varyings without an explicit
+ * interpolation qualifier are treated as INTERP_MODE_SMOOTH when matching
+ * interpolation types.
+ */
+void
+nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
+                     bool default_to_smooth_interp)
+{
+   assert(producer->info.stage != MESA_SHADER_FRAGMENT);
+   assert(consumer->info.stage != MESA_SHADER_VERTEX);
+
+   uint8_t comps[32] = {0};
+   uint8_t interp_type[32] = {0};
+   uint8_t interp_loc[32] = {0};
+
+   get_slot_component_masks_and_interp_types(&producer->outputs, comps,
+                                             interp_type, interp_loc,
+                                             producer->info.stage,
+                                             default_to_smooth_interp);
+   get_slot_component_masks_and_interp_types(&consumer->inputs, comps,
+                                             interp_type, interp_loc,
+                                             consumer->info.stage,
+                                             default_to_smooth_interp);
+
+   compact_components(producer, consumer, comps, interp_type, interp_loc,
+                      default_to_smooth_interp);
+}
-- 
cgit v1.2.3