/* * Copyright © 2019 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include "nir.h" #include "nir_builder.h" #include "nir_deref.h" #include "util/bitscan.h" #include "util/list.h" #include "util/u_math.h" /* Combine stores of vectors to the same deref into a single store. * * This per-block pass keeps track of stores of vectors to the same * destination and combines them into the last store of the sequence. Dead * stores (or parts of the store) found during the process are removed. * * A pending combination becomes an actual combination in various situations: * at the end of the block, when another instruction uses the memory or due to * barriers. * * Besides vectors, the pass also look at array derefs of vectors. For direct * array derefs, it works like a write mask access to the given component. * For indirect access there's no way to know before hand what component it * will overlap with, so the combination is finished -- the indirect remains * unmodified. */ /* Keep track of a group of stores that can be combined. All stores share the * same destination. */ struct combined_store { struct list_head link; nir_component_mask_t write_mask; nir_deref_instr *dst; /* Latest store added. It is reused when combining. */ nir_intrinsic_instr *latest; /* Original store for each component. The number of times a store appear * in this array is kept in the store's pass_flags. */ nir_intrinsic_instr *stores[NIR_MAX_VEC_COMPONENTS]; }; struct combine_stores_state { nir_variable_mode modes; /* Pending store combinations. */ struct list_head pending; /* Per function impl state. */ nir_builder b; bool progress; /* Allocator and freelist to reuse structs between functions. */ void *lin_ctx; struct list_head freelist; }; static struct combined_store * alloc_combined_store(struct combine_stores_state *state) { struct combined_store *result; if (list_is_empty(&state->freelist)) { result = linear_zalloc_child(state->lin_ctx, sizeof(*result)); } else { result = list_first_entry(&state->freelist, struct combined_store, link); list_del(&result->link); memset(result, 0, sizeof(*result)); } return result; } static void free_combined_store(struct combine_stores_state *state, struct combined_store *combo) { list_del(&combo->link); combo->write_mask = 0; list_add(&combo->link, &state->freelist); } static void combine_stores(struct combine_stores_state *state, struct combined_store *combo) { assert(combo->latest); assert(combo->latest->intrinsic == nir_intrinsic_store_deref); /* If the combined writemask is the same as the latest store, we know there * is only one store in the combination, so nothing to combine. */ if ((combo->write_mask & nir_intrinsic_write_mask(combo->latest)) == combo->write_mask) return; state->b.cursor = nir_before_instr(&combo->latest->instr); /* Build a new vec, to be used as source for the combined store. As it * gets build, remove previous stores that are not needed anymore. */ nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS] = {0}; unsigned num_components = glsl_get_vector_elements(combo->dst->type); unsigned bit_size = combo->latest->src[1].ssa->bit_size; for (unsigned i = 0; i < num_components; i++) { nir_intrinsic_instr *store = combo->stores[i]; if (combo->write_mask & (1 << i)) { assert(store); assert(store->src[1].is_ssa); /* If store->num_components == 1 then we are in the deref-of-vec case * and store->src[1] is a scalar. Otherwise, we're a regular vector * load and we have to pick off a component. */ comps[i] = store->num_components == 1 ? store->src[1].ssa : nir_channel(&state->b, store->src[1].ssa, i); assert(store->instr.pass_flags > 0); if (--store->instr.pass_flags == 0 && store != combo->latest) nir_instr_remove(&store->instr); } else { comps[i] = nir_ssa_undef(&state->b, 1, bit_size); } } assert(combo->latest->instr.pass_flags == 0); nir_ssa_def *vec = nir_vec(&state->b, comps, num_components); /* Fix the latest store with the combined information. */ nir_intrinsic_instr *store = combo->latest; /* In this case, our store is as an array deref of a vector so we need to * rewrite it to use a deref to the whole vector. */ if (store->num_components == 1) { store->num_components = num_components; nir_instr_rewrite_src(&store->instr, &store->src[0], nir_src_for_ssa(&combo->dst->dest.ssa)); } assert(store->num_components == num_components); nir_intrinsic_set_write_mask(store, combo->write_mask); nir_instr_rewrite_src(&store->instr, &store->src[1], nir_src_for_ssa(vec)); state->progress = true; } static void combine_stores_with_deref(struct combine_stores_state *state, nir_deref_instr *deref) { if ((state->modes & deref->mode) == 0) return; list_for_each_entry_safe(struct combined_store, combo, &state->pending, link) { if (nir_compare_derefs(combo->dst, deref) & nir_derefs_may_alias_bit) { combine_stores(state, combo); free_combined_store(state, combo); } } } static void combine_stores_with_modes(struct combine_stores_state *state, nir_variable_mode modes) { if ((state->modes & modes) == 0) return; list_for_each_entry_safe(struct combined_store, combo, &state->pending, link) { if (combo->dst->mode & modes) { combine_stores(state, combo); free_combined_store(state, combo); } } } static struct combined_store * find_matching_combined_store(struct combine_stores_state *state, nir_deref_instr *deref) { list_for_each_entry(struct combined_store, combo, &state->pending, link) { if (nir_compare_derefs(combo->dst, deref) & nir_derefs_equal_bit) return combo; } return NULL; } static void update_combined_store(struct combine_stores_state *state, nir_intrinsic_instr *intrin) { nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]); if ((dst->mode & state->modes) == 0) return; unsigned vec_mask; nir_deref_instr *vec_dst; if (glsl_type_is_vector(dst->type)) { vec_mask = nir_intrinsic_write_mask(intrin); vec_dst = dst; } else { /* Besides vectors, only direct array derefs of vectors are handled. */ if (dst->deref_type != nir_deref_type_array || !nir_src_is_const(dst->arr.index) || !glsl_type_is_vector(nir_deref_instr_parent(dst)->type)) { combine_stores_with_deref(state, dst); return; } uint64_t index = nir_src_as_uint(dst->arr.index); vec_dst = nir_deref_instr_parent(dst); if (index >= glsl_get_vector_elements(vec_dst->type)) { /* Storing to an invalid index is a no-op. */ nir_instr_remove(&intrin->instr); state->progress = true; return; } vec_mask = 1 << index; } struct combined_store *combo = find_matching_combined_store(state, vec_dst); if (!combo) { combo = alloc_combined_store(state); combo->dst = vec_dst; list_add(&combo->link, &state->pending); } /* Use pass_flags to reference count the store based on how many * components are still used by the combination. */ intrin->instr.pass_flags = util_bitcount(vec_mask); combo->latest = intrin; /* Update the combined_store, clearing up older overlapping references. */ combo->write_mask |= vec_mask; while (vec_mask) { unsigned i = u_bit_scan(&vec_mask); nir_intrinsic_instr *prev_store = combo->stores[i]; if (prev_store) { if (--prev_store->instr.pass_flags == 0) { nir_instr_remove(&prev_store->instr); } else { assert(glsl_type_is_vector( nir_src_as_deref(prev_store->src[0])->type)); nir_component_mask_t prev_mask = nir_intrinsic_write_mask(prev_store); nir_intrinsic_set_write_mask(prev_store, prev_mask & ~(1 << i)); } state->progress = true; } combo->stores[i] = combo->latest; } } static void combine_stores_block(struct combine_stores_state *state, nir_block *block) { nir_foreach_instr_safe(instr, block) { if (instr->type == nir_instr_type_call) { combine_stores_with_modes(state, nir_var_shader_out | nir_var_shader_temp | nir_var_function_temp | nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_global); continue; } if (instr->type != nir_instr_type_intrinsic) continue; nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { case nir_intrinsic_store_deref: update_combined_store(state, intrin); break; case nir_intrinsic_control_barrier: case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier: combine_stores_with_modes(state, nir_var_shader_out | nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_global); break; case nir_intrinsic_memory_barrier_buffer: combine_stores_with_modes(state, nir_var_mem_ssbo | nir_var_mem_global); break; case nir_intrinsic_memory_barrier_shared: combine_stores_with_modes(state, nir_var_mem_shared); break; case nir_intrinsic_memory_barrier_tcs_patch: combine_stores_with_modes(state, nir_var_shader_out); break; case nir_intrinsic_scoped_memory_barrier: if (nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_RELEASE) { combine_stores_with_modes(state, nir_intrinsic_memory_modes(intrin)); } break; case nir_intrinsic_emit_vertex: case nir_intrinsic_emit_vertex_with_counter: combine_stores_with_modes(state, nir_var_shader_out); break; case nir_intrinsic_load_deref: { nir_deref_instr *src = nir_src_as_deref(intrin->src[0]); combine_stores_with_deref(state, src); break; } case nir_intrinsic_copy_deref: { nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]); nir_deref_instr *src = nir_src_as_deref(intrin->src[1]); combine_stores_with_deref(state, dst); combine_stores_with_deref(state, src); break; } case nir_intrinsic_deref_atomic_add: case nir_intrinsic_deref_atomic_imin: case nir_intrinsic_deref_atomic_umin: case nir_intrinsic_deref_atomic_imax: case nir_intrinsic_deref_atomic_umax: case nir_intrinsic_deref_atomic_and: case nir_intrinsic_deref_atomic_or: case nir_intrinsic_deref_atomic_xor: case nir_intrinsic_deref_atomic_exchange: case nir_intrinsic_deref_atomic_comp_swap: { nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]); combine_stores_with_deref(state, dst); break; } default: break; } } /* At the end of the block, try all the remaining combinations. */ combine_stores_with_modes(state, state->modes); } static bool combine_stores_impl(struct combine_stores_state *state, nir_function_impl *impl) { state->progress = false; nir_builder_init(&state->b, impl); nir_foreach_block(block, impl) combine_stores_block(state, block); if (state->progress) { nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); } return state->progress; } bool nir_opt_combine_stores(nir_shader *shader, nir_variable_mode modes) { void *mem_ctx = ralloc_context(NULL); struct combine_stores_state state = { .modes = modes, .lin_ctx = linear_zalloc_parent(mem_ctx, 0), }; list_inithead(&state.pending); list_inithead(&state.freelist); bool progress = false; nir_foreach_function(function, shader) { if (!function->impl) continue; progress |= combine_stores_impl(&state, function->impl); } ralloc_free(mem_ctx); return progress; }