/* * Copyright © 2015 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include "anv_nir.h" #include "program/prog_parameter.h" #include "nir/nir_builder.h" #include "compiler/brw_nir.h" #include "util/set.h" /* Sampler tables don't actually have a maximum size but we pick one just so * that we don't end up emitting too much state on-the-fly. */ #define MAX_SAMPLER_TABLE_SIZE 128 #define BINDLESS_OFFSET 255 struct apply_pipeline_layout_state { const struct anv_physical_device *pdevice; nir_shader *shader; nir_builder builder; struct anv_pipeline_layout *layout; bool add_bounds_checks; nir_address_format ssbo_addr_format; /* Place to flag lowered instructions so we don't lower them twice */ struct set *lowered_instrs; int dynamic_offset_uniform_start; bool uses_constants; uint8_t constants_offset; struct { bool desc_buffer_used; uint8_t desc_offset; uint8_t *use_count; uint8_t *surface_offsets; uint8_t *sampler_offsets; } set[MAX_SETS]; }; static void add_binding(struct apply_pipeline_layout_state *state, uint32_t set, uint32_t binding) { const struct anv_descriptor_set_binding_layout *bind_layout = &state->layout->set[set].layout->binding[binding]; if (state->set[set].use_count[binding] < UINT8_MAX) state->set[set].use_count[binding]++; /* Only flag the descriptor buffer as used if there's actually data for * this binding. This lets us be lazy and call this function constantly * without worrying about unnecessarily enabling the buffer. 
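    * (In other words, a binding whose anv_descriptor_size() is 0 keeps no
    * data in the descriptor buffer and therefore never flags it.)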
*/ if (anv_descriptor_size(bind_layout)) state->set[set].desc_buffer_used = true; } static void add_deref_src_binding(struct apply_pipeline_layout_state *state, nir_src src) { nir_deref_instr *deref = nir_src_as_deref(src); nir_variable *var = nir_deref_instr_get_variable(deref); add_binding(state, var->data.descriptor_set, var->data.binding); } static void add_tex_src_binding(struct apply_pipeline_layout_state *state, nir_tex_instr *tex, nir_tex_src_type deref_src_type) { int deref_src_idx = nir_tex_instr_src_index(tex, deref_src_type); if (deref_src_idx < 0) return; add_deref_src_binding(state, tex->src[deref_src_idx].src); } static void get_used_bindings_block(nir_block *block, struct apply_pipeline_layout_state *state) { nir_foreach_instr_safe(instr, block) { switch (instr->type) { case nir_instr_type_intrinsic: { nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { case nir_intrinsic_vulkan_resource_index: add_binding(state, nir_intrinsic_desc_set(intrin), nir_intrinsic_binding(intrin)); break; case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: case nir_intrinsic_image_deref_atomic_add: case nir_intrinsic_image_deref_atomic_imin: case nir_intrinsic_image_deref_atomic_umin: case nir_intrinsic_image_deref_atomic_imax: case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: case nir_intrinsic_image_deref_atomic_exchange: case nir_intrinsic_image_deref_atomic_comp_swap: case nir_intrinsic_image_deref_size: case nir_intrinsic_image_deref_samples: case nir_intrinsic_image_deref_load_param_intel: case nir_intrinsic_image_deref_load_raw_intel: case nir_intrinsic_image_deref_store_raw_intel: add_deref_src_binding(state, intrin->src[0]); break; case nir_intrinsic_load_constant: state->uses_constants = true; break; default: break; } break; } case nir_instr_type_tex: { nir_tex_instr *tex = nir_instr_as_tex(instr); add_tex_src_binding(state, tex, nir_tex_src_texture_deref); add_tex_src_binding(state, tex, nir_tex_src_sampler_deref); break; } default: continue; } } } static bool find_descriptor_for_index_src(nir_src src, struct apply_pipeline_layout_state *state) { nir_intrinsic_instr *intrin = nir_src_as_intrinsic(src); while (intrin && intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex) intrin = nir_src_as_intrinsic(intrin->src[0]); if (!intrin || intrin->intrinsic != nir_intrinsic_vulkan_resource_index) return false; uint32_t set = nir_intrinsic_desc_set(intrin); uint32_t binding = nir_intrinsic_binding(intrin); uint32_t surface_index = state->set[set].surface_offsets[binding]; /* Only lower to a BTI message if we have a valid binding table index. 
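    * Bindings that get pushed out to bindless are assigned BINDLESS_OFFSET
    * as their surface offset below, which is not a valid binding table
    * index, so they fail this check and stay on the A64 path.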
*/ return surface_index < MAX_BINDING_TABLE_SIZE; } static bool nir_deref_find_descriptor(nir_deref_instr *deref, struct apply_pipeline_layout_state *state) { while (1) { /* Nothing we will use this on has a variable */ assert(deref->deref_type != nir_deref_type_var); nir_deref_instr *parent = nir_src_as_deref(deref->parent); if (!parent) break; deref = parent; } assert(deref->deref_type == nir_deref_type_cast); nir_intrinsic_instr *intrin = nir_src_as_intrinsic(deref->parent); if (!intrin || intrin->intrinsic != nir_intrinsic_load_vulkan_descriptor) return false; return find_descriptor_for_index_src(intrin->src[0], state); } static nir_ssa_def * build_index_for_res_reindex(nir_intrinsic_instr *intrin, struct apply_pipeline_layout_state *state) { nir_builder *b = &state->builder; if (intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex) { nir_ssa_def *bti = build_index_for_res_reindex(nir_src_as_intrinsic(intrin->src[0]), state); b->cursor = nir_before_instr(&intrin->instr); return nir_iadd(b, bti, nir_ssa_for_src(b, intrin->src[1], 1)); } assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_index); uint32_t set = nir_intrinsic_desc_set(intrin); uint32_t binding = nir_intrinsic_binding(intrin); const struct anv_descriptor_set_binding_layout *bind_layout = &state->layout->set[set].layout->binding[binding]; uint32_t surface_index = state->set[set].surface_offsets[binding]; uint32_t array_size = bind_layout->array_size; b->cursor = nir_before_instr(&intrin->instr); nir_ssa_def *array_index = nir_ssa_for_src(b, intrin->src[0], 1); if (nir_src_is_const(intrin->src[0]) || state->add_bounds_checks) array_index = nir_umin(b, array_index, nir_imm_int(b, array_size - 1)); return nir_iadd_imm(b, array_index, surface_index); } static nir_ssa_def * build_index_offset_for_deref(nir_deref_instr *deref, struct apply_pipeline_layout_state *state) { nir_builder *b = &state->builder; nir_deref_instr *parent = nir_deref_instr_parent(deref); if (parent) { nir_ssa_def *addr = build_index_offset_for_deref(parent, state); b->cursor = nir_before_instr(&deref->instr); return nir_explicit_io_address_from_deref(b, deref, addr, nir_address_format_32bit_index_offset); } nir_intrinsic_instr *load_desc = nir_src_as_intrinsic(deref->parent); assert(load_desc->intrinsic == nir_intrinsic_load_vulkan_descriptor); nir_ssa_def *index = build_index_for_res_reindex(nir_src_as_intrinsic(load_desc->src[0]), state); /* Return a 0 offset which will get picked up by the recursion */ b->cursor = nir_before_instr(&deref->instr); return nir_vec2(b, index, nir_imm_int(b, 0)); } static bool try_lower_direct_buffer_intrinsic(nir_intrinsic_instr *intrin, bool is_atomic, struct apply_pipeline_layout_state *state) { nir_builder *b = &state->builder; nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); if (deref->mode != nir_var_mem_ssbo) return false; /* 64-bit atomics only support A64 messages so we can't lower them to the * index+offset model. */ if (is_atomic && nir_dest_bit_size(intrin->dest) == 64) return false; /* Normal binding table-based messages can't handle non-uniform access so * we have to fall back to A64. 
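    * (The surface index in a binding-table message is part of the uniform
    * message descriptor, so a divergent resource index needs the
    * per-invocation 64-bit addresses that the A64 path computes.)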
*/ if (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM) return false; if (!nir_deref_find_descriptor(deref, state)) return false; nir_ssa_def *addr = build_index_offset_for_deref(deref, state); b->cursor = nir_before_instr(&intrin->instr); nir_lower_explicit_io_instr(b, intrin, addr, nir_address_format_32bit_index_offset); return true; } static void lower_direct_buffer_access(nir_function_impl *impl, struct apply_pipeline_layout_state *state) { nir_foreach_block(block, impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_intrinsic) continue; nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { case nir_intrinsic_load_deref: case nir_intrinsic_store_deref: try_lower_direct_buffer_intrinsic(intrin, false, state); break; case nir_intrinsic_deref_atomic_add: case nir_intrinsic_deref_atomic_imin: case nir_intrinsic_deref_atomic_umin: case nir_intrinsic_deref_atomic_imax: case nir_intrinsic_deref_atomic_umax: case nir_intrinsic_deref_atomic_and: case nir_intrinsic_deref_atomic_or: case nir_intrinsic_deref_atomic_xor: case nir_intrinsic_deref_atomic_exchange: case nir_intrinsic_deref_atomic_comp_swap: case nir_intrinsic_deref_atomic_fmin: case nir_intrinsic_deref_atomic_fmax: case nir_intrinsic_deref_atomic_fcomp_swap: try_lower_direct_buffer_intrinsic(intrin, true, state); break; case nir_intrinsic_get_buffer_size: { /* The get_buffer_size intrinsic always just takes a * index/reindex intrinsic. */ if (!find_descriptor_for_index_src(intrin->src[0], state)) break; nir_ssa_def *index = build_index_for_res_reindex(nir_src_as_intrinsic(intrin->src[0]), state); nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], nir_src_for_ssa(index)); _mesa_set_add(state->lowered_instrs, intrin); } default: break; } } } } static nir_address_format desc_addr_format(VkDescriptorType desc_type, struct apply_pipeline_layout_state *state) { return (desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER || desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) ? state->ssbo_addr_format : nir_address_format_32bit_index_offset; } static void lower_res_index_intrinsic(nir_intrinsic_instr *intrin, struct apply_pipeline_layout_state *state) { nir_builder *b = &state->builder; b->cursor = nir_before_instr(&intrin->instr); uint32_t set = nir_intrinsic_desc_set(intrin); uint32_t binding = nir_intrinsic_binding(intrin); const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); const struct anv_descriptor_set_binding_layout *bind_layout = &state->layout->set[set].layout->binding[binding]; uint32_t surface_index = state->set[set].surface_offsets[binding]; uint32_t array_size = bind_layout->array_size; nir_ssa_def *array_index = nir_ssa_for_src(b, intrin->src[0], 1); if (nir_src_is_const(intrin->src[0]) || state->add_bounds_checks) array_index = nir_umin(b, array_index, nir_imm_int(b, array_size - 1)); nir_ssa_def *index; if (state->pdevice->has_a64_buffer_access && (desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER || desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)) { /* We store the descriptor offset as 16.8.8 where the top 16 bits are * the offset into the descriptor set, the next 8 are the binding table * index of the descriptor buffer, and the bottom 8 bits are the offset * (in bytes) into the dynamic offset table. 
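       * For example (illustrative values only): a binding whose descriptor
       * data starts at byte 32 of its set's descriptor buffer, whose
       * descriptor buffer sits at binding table index 3, and whose dynamic
       * offset slot is 5 packs to (32 << 16) | (3 << 8) | 5 = 0x00200305.
       * A binding with no dynamic offset stores 0xff in the low byte.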
*/ assert(bind_layout->dynamic_offset_index < MAX_DYNAMIC_BUFFERS); uint32_t dynamic_offset_index = 0xff; /* No dynamic offset */ if (bind_layout->dynamic_offset_index >= 0) { dynamic_offset_index = state->layout->set[set].dynamic_offset_start + bind_layout->dynamic_offset_index; } const uint32_t desc_offset = bind_layout->descriptor_offset << 16 | (uint32_t)state->set[set].desc_offset << 8 | dynamic_offset_index; if (state->add_bounds_checks) { assert(desc_addr_format(desc_type, state) == nir_address_format_64bit_bounded_global); assert(intrin->dest.ssa.num_components == 4); assert(intrin->dest.ssa.bit_size == 32); index = nir_vec4(b, nir_imm_int(b, desc_offset), nir_ssa_for_src(b, intrin->src[0], 1), nir_imm_int(b, array_size - 1), nir_ssa_undef(b, 1, 32)); } else { assert(desc_addr_format(desc_type, state) == nir_address_format_64bit_global); assert(intrin->dest.ssa.num_components == 1); assert(intrin->dest.ssa.bit_size == 64); index = nir_pack_64_2x32_split(b, nir_imm_int(b, desc_offset), nir_ssa_for_src(b, intrin->src[0], 1)); } } else if (bind_layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM) { /* This is an inline uniform block. Just reference the descriptor set * and use the descriptor offset as the base. */ assert(desc_addr_format(desc_type, state) == nir_address_format_32bit_index_offset); assert(intrin->dest.ssa.num_components == 2); assert(intrin->dest.ssa.bit_size == 32); index = nir_imm_ivec2(b, state->set[set].desc_offset, bind_layout->descriptor_offset); } else { assert(desc_addr_format(desc_type, state) == nir_address_format_32bit_index_offset); assert(intrin->dest.ssa.num_components == 2); assert(intrin->dest.ssa.bit_size == 32); index = nir_vec2(b, nir_iadd_imm(b, array_index, surface_index), nir_imm_int(b, 0)); } assert(intrin->dest.is_ssa); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(index)); nir_instr_remove(&intrin->instr); } static void lower_res_reindex_intrinsic(nir_intrinsic_instr *intrin, struct apply_pipeline_layout_state *state) { nir_builder *b = &state->builder; b->cursor = nir_before_instr(&intrin->instr); const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); /* For us, the resource indices are just indices into the binding table and * array elements are sequential. A resource_reindex just turns into an * add of the two indices. 
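    * For the A64 formats handled below, the packed 16.8.8 descriptor-offset
    * word is carried through unchanged and only the array-index component
    * picks up the addend.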
    */
   assert(intrin->src[0].is_ssa && intrin->src[1].is_ssa);
   nir_ssa_def *old_index = intrin->src[0].ssa;
   nir_ssa_def *offset = intrin->src[1].ssa;

   nir_ssa_def *new_index;
   switch (desc_addr_format(desc_type, state)) {
   case nir_address_format_64bit_bounded_global:
      /* See also lower_res_index_intrinsic() */
      assert(intrin->dest.ssa.num_components == 4);
      assert(intrin->dest.ssa.bit_size == 32);
      new_index = nir_vec4(b, nir_channel(b, old_index, 0),
                              nir_iadd(b, nir_channel(b, old_index, 1),
                                          offset),
                              nir_channel(b, old_index, 2),
                              nir_ssa_undef(b, 1, 32));
      break;

   case nir_address_format_64bit_global: {
      /* See also lower_res_index_intrinsic() */
      assert(intrin->dest.ssa.num_components == 1);
      assert(intrin->dest.ssa.bit_size == 64);
      nir_ssa_def *base = nir_unpack_64_2x32_split_x(b, old_index);
      nir_ssa_def *arr_idx = nir_unpack_64_2x32_split_y(b, old_index);
      new_index = nir_pack_64_2x32_split(b, base,
                                            nir_iadd(b, arr_idx, offset));
      break;
   }

   case nir_address_format_32bit_index_offset:
      assert(intrin->dest.ssa.num_components == 2);
      assert(intrin->dest.ssa.bit_size == 32);
      new_index = nir_vec2(b, nir_iadd(b, nir_channel(b, old_index, 0),
                                          offset),
                              nir_channel(b, old_index, 1));
      break;

   default:
      unreachable("Unhandled address format");
   }

   assert(intrin->dest.is_ssa);
   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(new_index));
   nir_instr_remove(&intrin->instr);
}

static nir_ssa_def *
build_ssbo_descriptor_load(const VkDescriptorType desc_type,
                           nir_ssa_def *index,
                           struct apply_pipeline_layout_state *state)
{
   nir_builder *b = &state->builder;

   nir_ssa_def *desc_offset, *array_index;
   switch (state->ssbo_addr_format) {
   case nir_address_format_64bit_bounded_global:
      /* See also lower_res_index_intrinsic() */
      desc_offset = nir_channel(b, index, 0);
      array_index = nir_umin(b, nir_channel(b, index, 1),
                                nir_channel(b, index, 2));
      break;

   case nir_address_format_64bit_global:
      /* See also lower_res_index_intrinsic() */
      desc_offset = nir_unpack_64_2x32_split_x(b, index);
      array_index = nir_unpack_64_2x32_split_y(b, index);
      break;

   default:
      unreachable("Unhandled address format for SSBO");
   }

   /* The desc_offset is actually 16.8.8 */
   nir_ssa_def *desc_buffer_index =
      nir_extract_u8(b, desc_offset, nir_imm_int(b, 1));
   nir_ssa_def *desc_offset_base =
      nir_extract_u16(b, desc_offset, nir_imm_int(b, 1));

   /* Compute the actual descriptor offset */
   const unsigned descriptor_size =
      anv_descriptor_type_size(state->pdevice, desc_type);
   desc_offset = nir_iadd(b, desc_offset_base,
                             nir_imul_imm(b, array_index, descriptor_size));

   nir_intrinsic_instr *desc_load =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo);
   desc_load->src[0] = nir_src_for_ssa(desc_buffer_index);
   desc_load->src[1] = nir_src_for_ssa(desc_offset);
   desc_load->num_components = 4;
   nir_ssa_dest_init(&desc_load->instr, &desc_load->dest, 4, 32, NULL);
   nir_builder_instr_insert(b, &desc_load->instr);

   return &desc_load->dest.ssa;
}

static void
lower_load_vulkan_descriptor(nir_intrinsic_instr *intrin,
                             struct apply_pipeline_layout_state *state)
{
   nir_builder *b = &state->builder;

   b->cursor = nir_before_instr(&intrin->instr);

   const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin);

   assert(intrin->src[0].is_ssa);
   nir_ssa_def *index = intrin->src[0].ssa;

   nir_ssa_def *desc;
   if (state->pdevice->has_a64_buffer_access &&
       (desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER ||
        desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)) {
      desc = build_ssbo_descriptor_load(desc_type, index, state);

      /* We want nir_address_format_64bit_global */
      if (!state->add_bounds_checks)
desc = nir_pack_64_2x32(b, nir_channels(b, desc, 0x3)); if (state->dynamic_offset_uniform_start >= 0) { /* This shader has dynamic offsets and we have no way of knowing * (save from the dynamic offset base index) if this buffer has a * dynamic offset. */ nir_ssa_def *desc_offset, *array_index; switch (state->ssbo_addr_format) { case nir_address_format_64bit_bounded_global: /* See also lower_res_index_intrinsic() */ desc_offset = nir_channel(b, index, 0); array_index = nir_umin(b, nir_channel(b, index, 1), nir_channel(b, index, 2)); break; case nir_address_format_64bit_global: /* See also lower_res_index_intrinsic() */ desc_offset = nir_unpack_64_2x32_split_x(b, index); array_index = nir_unpack_64_2x32_split_y(b, index); break; default: unreachable("Unhandled address format for SSBO"); } nir_ssa_def *dyn_offset_base = nir_extract_u8(b, desc_offset, nir_imm_int(b, 0)); nir_ssa_def *dyn_offset_idx = nir_iadd(b, dyn_offset_base, array_index); if (state->add_bounds_checks) { dyn_offset_idx = nir_umin(b, dyn_offset_idx, nir_imm_int(b, MAX_DYNAMIC_BUFFERS)); } nir_intrinsic_instr *dyn_load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform); nir_intrinsic_set_base(dyn_load, state->dynamic_offset_uniform_start); nir_intrinsic_set_range(dyn_load, MAX_DYNAMIC_BUFFERS * 4); dyn_load->src[0] = nir_src_for_ssa(nir_imul_imm(b, dyn_offset_idx, 4)); dyn_load->num_components = 1; nir_ssa_dest_init(&dyn_load->instr, &dyn_load->dest, 1, 32, NULL); nir_builder_instr_insert(b, &dyn_load->instr); nir_ssa_def *dynamic_offset = nir_bcsel(b, nir_ieq(b, dyn_offset_base, nir_imm_int(b, 0xff)), nir_imm_int(b, 0), &dyn_load->dest.ssa); switch (state->ssbo_addr_format) { case nir_address_format_64bit_bounded_global: { /* The dynamic offset gets added to the base pointer so that we * have a sliding window range. */ nir_ssa_def *base_ptr = nir_pack_64_2x32(b, nir_channels(b, desc, 0x3)); base_ptr = nir_iadd(b, base_ptr, nir_u2u64(b, dynamic_offset)); desc = nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_ptr), nir_unpack_64_2x32_split_y(b, base_ptr), nir_channel(b, desc, 2), nir_channel(b, desc, 3)); break; } case nir_address_format_64bit_global: desc = nir_iadd(b, desc, nir_u2u64(b, dynamic_offset)); break; default: unreachable("Unhandled address format for SSBO"); } } } else { /* We follow the nir_address_format_32bit_index_offset model */ desc = index; } assert(intrin->dest.is_ssa); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(desc)); nir_instr_remove(&intrin->instr); } static void lower_get_buffer_size(nir_intrinsic_instr *intrin, struct apply_pipeline_layout_state *state) { if (_mesa_set_search(state->lowered_instrs, intrin)) return; nir_builder *b = &state->builder; b->cursor = nir_before_instr(&intrin->instr); const VkDescriptorType desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; assert(intrin->src[0].is_ssa); nir_ssa_def *index = intrin->src[0].ssa; if (state->pdevice->has_a64_buffer_access) { nir_ssa_def *desc = build_ssbo_descriptor_load(desc_type, index, state); nir_ssa_def *size = nir_channel(b, desc, 2); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(size)); nir_instr_remove(&intrin->instr); } else { /* We're following the nir_address_format_32bit_index_offset model so * the binding table index is the first component of the address. The * back-end wants a scalar binding table index source. 
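       * (E.g. an index/offset pair of (5, 0) becomes just the scalar 5 here.)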
*/ nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], nir_src_for_ssa(nir_channel(b, index, 0))); } } static nir_ssa_def * build_descriptor_load(nir_deref_instr *deref, unsigned offset, unsigned num_components, unsigned bit_size, struct apply_pipeline_layout_state *state) { nir_variable *var = nir_deref_instr_get_variable(deref); unsigned set = var->data.descriptor_set; unsigned binding = var->data.binding; unsigned array_size = state->layout->set[set].layout->binding[binding].array_size; const struct anv_descriptor_set_binding_layout *bind_layout = &state->layout->set[set].layout->binding[binding]; nir_builder *b = &state->builder; nir_ssa_def *desc_buffer_index = nir_imm_int(b, state->set[set].desc_offset); nir_ssa_def *desc_offset = nir_imm_int(b, bind_layout->descriptor_offset + offset); if (deref->deref_type != nir_deref_type_var) { assert(deref->deref_type == nir_deref_type_array); const unsigned descriptor_size = anv_descriptor_size(bind_layout); nir_ssa_def *arr_index = nir_ssa_for_src(b, deref->arr.index, 1); if (state->add_bounds_checks) arr_index = nir_umin(b, arr_index, nir_imm_int(b, array_size - 1)); desc_offset = nir_iadd(b, desc_offset, nir_imul_imm(b, arr_index, descriptor_size)); } nir_intrinsic_instr *desc_load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo); desc_load->src[0] = nir_src_for_ssa(desc_buffer_index); desc_load->src[1] = nir_src_for_ssa(desc_offset); desc_load->num_components = num_components; nir_ssa_dest_init(&desc_load->instr, &desc_load->dest, num_components, bit_size, NULL); nir_builder_instr_insert(b, &desc_load->instr); return &desc_load->dest.ssa; } static void lower_image_intrinsic(nir_intrinsic_instr *intrin, struct apply_pipeline_layout_state *state) { nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); nir_variable *var = nir_deref_instr_get_variable(deref); unsigned set = var->data.descriptor_set; unsigned binding = var->data.binding; unsigned binding_offset = state->set[set].surface_offsets[binding]; nir_builder *b = &state->builder; b->cursor = nir_before_instr(&intrin->instr); ASSERTED const bool use_bindless = state->pdevice->has_bindless_images; if (intrin->intrinsic == nir_intrinsic_image_deref_load_param_intel) { b->cursor = nir_instr_remove(&intrin->instr); assert(!use_bindless); /* Otherwise our offsets would be wrong */ const unsigned param = nir_intrinsic_base(intrin); nir_ssa_def *desc = build_descriptor_load(deref, param * 16, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, state); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(desc)); } else if (binding_offset > MAX_BINDING_TABLE_SIZE) { const bool write_only = (var->data.image.access & ACCESS_NON_READABLE) != 0; nir_ssa_def *desc = build_descriptor_load(deref, 0, 2, 32, state); nir_ssa_def *handle = nir_channel(b, desc, write_only ? 
1 : 0); nir_rewrite_image_intrinsic(intrin, handle, true); } else { unsigned array_size = state->layout->set[set].layout->binding[binding].array_size; nir_ssa_def *index = NULL; if (deref->deref_type != nir_deref_type_var) { assert(deref->deref_type == nir_deref_type_array); index = nir_ssa_for_src(b, deref->arr.index, 1); if (state->add_bounds_checks) index = nir_umin(b, index, nir_imm_int(b, array_size - 1)); } else { index = nir_imm_int(b, 0); } index = nir_iadd_imm(b, index, binding_offset); nir_rewrite_image_intrinsic(intrin, index, false); } } static void lower_load_constant(nir_intrinsic_instr *intrin, struct apply_pipeline_layout_state *state) { nir_builder *b = &state->builder; b->cursor = nir_before_instr(&intrin->instr); nir_ssa_def *index = nir_imm_int(b, state->constants_offset); nir_ssa_def *offset = nir_iadd(b, nir_ssa_for_src(b, intrin->src[0], 1), nir_imm_int(b, nir_intrinsic_base(intrin))); nir_intrinsic_instr *load_ubo = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo); load_ubo->num_components = intrin->num_components; load_ubo->src[0] = nir_src_for_ssa(index); load_ubo->src[1] = nir_src_for_ssa(offset); nir_ssa_dest_init(&load_ubo->instr, &load_ubo->dest, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, NULL); nir_builder_instr_insert(b, &load_ubo->instr); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(&load_ubo->dest.ssa)); nir_instr_remove(&intrin->instr); } static void lower_tex_deref(nir_tex_instr *tex, nir_tex_src_type deref_src_type, unsigned *base_index, unsigned plane, struct apply_pipeline_layout_state *state) { int deref_src_idx = nir_tex_instr_src_index(tex, deref_src_type); if (deref_src_idx < 0) return; nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src); nir_variable *var = nir_deref_instr_get_variable(deref); unsigned set = var->data.descriptor_set; unsigned binding = var->data.binding; unsigned array_size = state->layout->set[set].layout->binding[binding].array_size; unsigned binding_offset; if (deref_src_type == nir_tex_src_texture_deref) { binding_offset = state->set[set].surface_offsets[binding]; } else { assert(deref_src_type == nir_tex_src_sampler_deref); binding_offset = state->set[set].sampler_offsets[binding]; } nir_builder *b = &state->builder; nir_tex_src_type offset_src_type; nir_ssa_def *index = NULL; if (binding_offset > MAX_BINDING_TABLE_SIZE) { const unsigned plane_offset = plane * sizeof(struct anv_sampled_image_descriptor); nir_ssa_def *desc = build_descriptor_load(deref, plane_offset, 2, 32, state); if (deref_src_type == nir_tex_src_texture_deref) { offset_src_type = nir_tex_src_texture_handle; index = nir_channel(b, desc, 0); } else { assert(deref_src_type == nir_tex_src_sampler_deref); offset_src_type = nir_tex_src_sampler_handle; index = nir_channel(b, desc, 1); } } else { if (deref_src_type == nir_tex_src_texture_deref) { offset_src_type = nir_tex_src_texture_offset; } else { assert(deref_src_type == nir_tex_src_sampler_deref); offset_src_type = nir_tex_src_sampler_offset; } *base_index = binding_offset + plane; if (deref->deref_type != nir_deref_type_var) { assert(deref->deref_type == nir_deref_type_array); if (nir_src_is_const(deref->arr.index)) { unsigned arr_index = MIN2(nir_src_as_uint(deref->arr.index), array_size - 1); struct anv_sampler **immutable_samplers = state->layout->set[set].layout->binding[binding].immutable_samplers; if (immutable_samplers) { /* Array of YCbCr samplers are tightly packed in the binding * tables, compute the offset of an element in the array 
by * adding the number of planes of all preceding elements. */ unsigned desc_arr_index = 0; for (int i = 0; i < arr_index; i++) desc_arr_index += immutable_samplers[i]->n_planes; *base_index += desc_arr_index; } else { *base_index += arr_index; } } else { /* From VK_KHR_sampler_ycbcr_conversion: * * If sampler Y’CBCR conversion is enabled, the combined image * sampler must be indexed only by constant integral expressions * when aggregated into arrays in shader code, irrespective of * the shaderSampledImageArrayDynamicIndexing feature. */ assert(nir_tex_instr_src_index(tex, nir_tex_src_plane) == -1); index = nir_ssa_for_src(b, deref->arr.index, 1); if (state->add_bounds_checks) index = nir_umin(b, index, nir_imm_int(b, array_size - 1)); } } } if (index) { nir_instr_rewrite_src(&tex->instr, &tex->src[deref_src_idx].src, nir_src_for_ssa(index)); tex->src[deref_src_idx].src_type = offset_src_type; } else { nir_tex_instr_remove_src(tex, deref_src_idx); } } static uint32_t tex_instr_get_and_remove_plane_src(nir_tex_instr *tex) { int plane_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_plane); if (plane_src_idx < 0) return 0; unsigned plane = nir_src_as_uint(tex->src[plane_src_idx].src); nir_tex_instr_remove_src(tex, plane_src_idx); return plane; } static nir_ssa_def * build_def_array_select(nir_builder *b, nir_ssa_def **srcs, nir_ssa_def *idx, unsigned start, unsigned end) { if (start == end - 1) { return srcs[start]; } else { unsigned mid = start + (end - start) / 2; return nir_bcsel(b, nir_ilt(b, idx, nir_imm_int(b, mid)), build_def_array_select(b, srcs, idx, start, mid), build_def_array_select(b, srcs, idx, mid, end)); } } static void lower_gen7_tex_swizzle(nir_tex_instr *tex, unsigned plane, struct apply_pipeline_layout_state *state) { assert(state->pdevice->info.gen == 7 && !state->pdevice->info.is_haswell); if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF || nir_tex_instr_is_query(tex) || tex->op == nir_texop_tg4 || /* We can't swizzle TG4 */ (tex->is_shadow && tex->is_new_style_shadow)) return; int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref); assert(deref_src_idx >= 0); nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src); nir_variable *var = nir_deref_instr_get_variable(deref); unsigned set = var->data.descriptor_set; unsigned binding = var->data.binding; const struct anv_descriptor_set_binding_layout *bind_layout = &state->layout->set[set].layout->binding[binding]; if ((bind_layout->data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE) == 0) return; nir_builder *b = &state->builder; b->cursor = nir_before_instr(&tex->instr); const unsigned plane_offset = plane * sizeof(struct anv_texture_swizzle_descriptor); nir_ssa_def *swiz = build_descriptor_load(deref, plane_offset, 1, 32, state); b->cursor = nir_after_instr(&tex->instr); assert(tex->dest.ssa.bit_size == 32); assert(tex->dest.ssa.num_components == 4); /* Initializing to undef is ok; nir_opt_undef will clean it up. 
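    * The comps[] table below is indexed by the ISL_CHANNEL_SELECT_* values
    * (ZERO, ONE, RED, GREEN, BLUE, ALPHA); each output channel picks one
    * entry via build_def_array_select() using the per-channel selector byte
    * pulled out of the swizzle descriptor.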
*/ nir_ssa_def *undef = nir_ssa_undef(b, 1, 32); nir_ssa_def *comps[8]; for (unsigned i = 0; i < ARRAY_SIZE(comps); i++) comps[i] = undef; comps[ISL_CHANNEL_SELECT_ZERO] = nir_imm_int(b, 0); if (nir_alu_type_get_base_type(tex->dest_type) == nir_type_float) comps[ISL_CHANNEL_SELECT_ONE] = nir_imm_float(b, 1); else comps[ISL_CHANNEL_SELECT_ONE] = nir_imm_int(b, 1); comps[ISL_CHANNEL_SELECT_RED] = nir_channel(b, &tex->dest.ssa, 0); comps[ISL_CHANNEL_SELECT_GREEN] = nir_channel(b, &tex->dest.ssa, 1); comps[ISL_CHANNEL_SELECT_BLUE] = nir_channel(b, &tex->dest.ssa, 2); comps[ISL_CHANNEL_SELECT_ALPHA] = nir_channel(b, &tex->dest.ssa, 3); nir_ssa_def *swiz_comps[4]; for (unsigned i = 0; i < 4; i++) { nir_ssa_def *comp_swiz = nir_extract_u8(b, swiz, nir_imm_int(b, i)); swiz_comps[i] = build_def_array_select(b, comps, comp_swiz, 0, 8); } nir_ssa_def *swiz_tex_res = nir_vec(b, swiz_comps, 4); /* Rewrite uses before we insert so we don't rewrite this use */ nir_ssa_def_rewrite_uses_after(&tex->dest.ssa, nir_src_for_ssa(swiz_tex_res), swiz_tex_res->parent_instr); } static void lower_tex(nir_tex_instr *tex, struct apply_pipeline_layout_state *state) { unsigned plane = tex_instr_get_and_remove_plane_src(tex); /* On Ivy Bridge and Bay Trail, we have to swizzle in the shader. Do this * before we lower the derefs away so we can still find the descriptor. */ if (state->pdevice->info.gen == 7 && !state->pdevice->info.is_haswell) lower_gen7_tex_swizzle(tex, plane, state); state->builder.cursor = nir_before_instr(&tex->instr); lower_tex_deref(tex, nir_tex_src_texture_deref, &tex->texture_index, plane, state); lower_tex_deref(tex, nir_tex_src_sampler_deref, &tex->sampler_index, plane, state); /* The backend only ever uses this to mark used surfaces. We don't care * about that little optimization so it just needs to be non-zero. 
*/ tex->texture_array_size = 1; } static void apply_pipeline_layout_block(nir_block *block, struct apply_pipeline_layout_state *state) { nir_foreach_instr_safe(instr, block) { switch (instr->type) { case nir_instr_type_intrinsic: { nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { case nir_intrinsic_vulkan_resource_index: lower_res_index_intrinsic(intrin, state); break; case nir_intrinsic_vulkan_resource_reindex: lower_res_reindex_intrinsic(intrin, state); break; case nir_intrinsic_load_vulkan_descriptor: lower_load_vulkan_descriptor(intrin, state); break; case nir_intrinsic_get_buffer_size: lower_get_buffer_size(intrin, state); break; case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: case nir_intrinsic_image_deref_atomic_add: case nir_intrinsic_image_deref_atomic_imin: case nir_intrinsic_image_deref_atomic_umin: case nir_intrinsic_image_deref_atomic_imax: case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: case nir_intrinsic_image_deref_atomic_exchange: case nir_intrinsic_image_deref_atomic_comp_swap: case nir_intrinsic_image_deref_size: case nir_intrinsic_image_deref_samples: case nir_intrinsic_image_deref_load_param_intel: case nir_intrinsic_image_deref_load_raw_intel: case nir_intrinsic_image_deref_store_raw_intel: lower_image_intrinsic(intrin, state); break; case nir_intrinsic_load_constant: lower_load_constant(intrin, state); break; default: break; } break; } case nir_instr_type_tex: lower_tex(nir_instr_as_tex(instr), state); break; default: continue; } } } struct binding_info { uint32_t binding; uint8_t set; uint16_t score; }; static int compare_binding_infos(const void *_a, const void *_b) { const struct binding_info *a = _a, *b = _b; if (a->score != b->score) return b->score - a->score; if (a->set != b->set) return a->set - b->set; return a->binding - b->binding; } void anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, bool robust_buffer_access, struct anv_pipeline_layout *layout, nir_shader *shader, struct brw_stage_prog_data *prog_data, struct anv_pipeline_bind_map *map) { void *mem_ctx = ralloc_context(NULL); struct apply_pipeline_layout_state state = { .pdevice = pdevice, .shader = shader, .layout = layout, .add_bounds_checks = robust_buffer_access, .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_buffer_access), .lowered_instrs = _mesa_pointer_set_create(mem_ctx), .dynamic_offset_uniform_start = -1, }; for (unsigned s = 0; s < layout->num_sets; s++) { const unsigned count = layout->set[s].layout->binding_count; state.set[s].use_count = rzalloc_array(mem_ctx, uint8_t, count); state.set[s].surface_offsets = rzalloc_array(mem_ctx, uint8_t, count); state.set[s].sampler_offsets = rzalloc_array(mem_ctx, uint8_t, count); } nir_foreach_function(function, shader) { if (!function->impl) continue; nir_foreach_block(block, function->impl) get_used_bindings_block(block, &state); } for (unsigned s = 0; s < layout->num_sets; s++) { if (state.set[s].desc_buffer_used) { map->surface_to_descriptor[map->surface_count] = (struct anv_pipeline_binding) { .set = ANV_DESCRIPTOR_SET_DESCRIPTORS, .binding = s, }; state.set[s].desc_offset = map->surface_count; map->surface_count++; } } if (state.uses_constants) { state.constants_offset = map->surface_count; map->surface_to_descriptor[map->surface_count].set = ANV_DESCRIPTOR_SET_SHADER_CONSTANTS; map->surface_count++; } unsigned 
used_binding_count = 0; for (uint32_t set = 0; set < layout->num_sets; set++) { struct anv_descriptor_set_layout *set_layout = layout->set[set].layout; for (unsigned b = 0; b < set_layout->binding_count; b++) { if (state.set[set].use_count[b] == 0) continue; used_binding_count++; } } struct binding_info *infos = rzalloc_array(mem_ctx, struct binding_info, used_binding_count); used_binding_count = 0; for (uint32_t set = 0; set < layout->num_sets; set++) { struct anv_descriptor_set_layout *set_layout = layout->set[set].layout; for (unsigned b = 0; b < set_layout->binding_count; b++) { if (state.set[set].use_count[b] == 0) continue; struct anv_descriptor_set_binding_layout *binding = &layout->set[set].layout->binding[b]; /* Do a fixed-point calculation to generate a score based on the * number of uses and the binding array size. We shift by 7 instead * of 8 because we're going to use the top bit below to make * everything which does not support bindless super higher priority * than things which do. */ uint16_t score = ((uint16_t)state.set[set].use_count[b] << 7) / binding->array_size; /* If the descriptor type doesn't support bindless then put it at the * beginning so we guarantee it gets a slot. */ if (!anv_descriptor_supports_bindless(pdevice, binding, true) || !anv_descriptor_supports_bindless(pdevice, binding, false)) score |= 1 << 15; infos[used_binding_count++] = (struct binding_info) { .set = set, .binding = b, .score = score, }; } } /* Order the binding infos based on score with highest scores first. If * scores are equal we then order by set and binding. */ qsort(infos, used_binding_count, sizeof(struct binding_info), compare_binding_infos); bool have_dynamic_buffers = false; for (unsigned i = 0; i < used_binding_count; i++) { unsigned set = infos[i].set, b = infos[i].binding; struct anv_descriptor_set_binding_layout *binding = &layout->set[set].layout->binding[b]; if (binding->dynamic_offset_index >= 0) have_dynamic_buffers = true; const uint32_t array_size = binding->array_size; if (binding->data & ANV_DESCRIPTOR_SURFACE_STATE) { if (map->surface_count + array_size > MAX_BINDING_TABLE_SIZE || anv_descriptor_requires_bindless(pdevice, binding, false)) { /* If this descriptor doesn't fit in the binding table or if it * requires bindless for some reason, flag it as bindless. */ assert(anv_descriptor_supports_bindless(pdevice, binding, false)); state.set[set].surface_offsets[b] = BINDLESS_OFFSET; } else { state.set[set].surface_offsets[b] = map->surface_count; struct anv_sampler **samplers = binding->immutable_samplers; for (unsigned i = 0; i < binding->array_size; i++) { uint8_t planes = samplers ? samplers[i]->n_planes : 1; for (uint8_t p = 0; p < planes; p++) { map->surface_to_descriptor[map->surface_count++] = (struct anv_pipeline_binding) { .set = set, .binding = b, .index = i, .plane = p, }; } } } assert(map->surface_count <= MAX_BINDING_TABLE_SIZE); } if (binding->data & ANV_DESCRIPTOR_SAMPLER_STATE) { if (map->sampler_count + array_size > MAX_SAMPLER_TABLE_SIZE || anv_descriptor_requires_bindless(pdevice, binding, true)) { /* If this descriptor doesn't fit in the binding table or if it * requires bindless for some reason, flag it as bindless. * * We also make large sampler arrays bindless because we can avoid * using indirect sends thanks to bindless samplers being packed * less tightly than the sampler table. 
*/ assert(anv_descriptor_supports_bindless(pdevice, binding, true)); state.set[set].sampler_offsets[b] = BINDLESS_OFFSET; } else { state.set[set].sampler_offsets[b] = map->sampler_count; struct anv_sampler **samplers = binding->immutable_samplers; for (unsigned i = 0; i < binding->array_size; i++) { uint8_t planes = samplers ? samplers[i]->n_planes : 1; for (uint8_t p = 0; p < planes; p++) { map->sampler_to_descriptor[map->sampler_count++] = (struct anv_pipeline_binding) { .set = set, .binding = b, .index = i, .plane = p, }; } } } } } if (have_dynamic_buffers) { state.dynamic_offset_uniform_start = shader->num_uniforms; uint32_t *param = brw_stage_prog_data_add_params(prog_data, MAX_DYNAMIC_BUFFERS); for (unsigned i = 0; i < MAX_DYNAMIC_BUFFERS; i++) param[i] = ANV_PARAM_DYN_OFFSET(i); shader->num_uniforms += MAX_DYNAMIC_BUFFERS * 4; assert(shader->num_uniforms == prog_data->nr_params * 4); } nir_foreach_variable(var, &shader->uniforms) { const struct glsl_type *glsl_type = glsl_without_array(var->type); if (!glsl_type_is_image(glsl_type)) continue; enum glsl_sampler_dim dim = glsl_get_sampler_dim(glsl_type); const uint32_t set = var->data.descriptor_set; const uint32_t binding = var->data.binding; const uint32_t array_size = layout->set[set].layout->binding[binding].array_size; if (state.set[set].use_count[binding] == 0) continue; if (state.set[set].surface_offsets[binding] >= MAX_BINDING_TABLE_SIZE) continue; struct anv_pipeline_binding *pipe_binding = &map->surface_to_descriptor[state.set[set].surface_offsets[binding]]; for (unsigned i = 0; i < array_size; i++) { assert(pipe_binding[i].set == set); assert(pipe_binding[i].binding == binding); assert(pipe_binding[i].index == i); if (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS) pipe_binding[i].input_attachment_index = var->data.index + i; pipe_binding[i].write_only = (var->data.image.access & ACCESS_NON_READABLE) != 0; } } nir_foreach_function(function, shader) { if (!function->impl) continue; /* Before we do the normal lowering, we look for any SSBO operations * that we can lower to the BTI model and lower them up-front. The BTI * model can perform better than the A64 model for a couple reasons: * * 1. 48-bit address calculations are potentially expensive and using * the BTI model lets us simply compute 32-bit offsets and the * hardware adds the 64-bit surface base address. * * 2. The BTI messages, because they use surface states, do bounds * checking for us. With the A64 model, we have to do our own * bounds checking and this means wider pointers and extra * calculations and branching in the shader. * * The solution to both of these is to convert things to the BTI model * opportunistically. The reason why we need to do this as a pre-pass * is for two reasons: * * 1. The BTI model requires nir_address_format_32bit_index_offset * pointers which are not the same type as the pointers needed for * the A64 model. Because all our derefs are set up for the A64 * model (in case we have variable pointers), we have to crawl all * the way back to the vulkan_resource_index intrinsic and build a * completely fresh index+offset calculation. * * 2. Because the variable-pointers-capable lowering that we do as part * of apply_pipeline_layout_block is destructive (It really has to * be to handle variable pointers properly), we've lost the deref * information by the time we get to the load/store/atomic * intrinsics in that pass. 
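       * get_buffer_size intrinsics rewritten by this pre-pass are recorded
       * in state.lowered_instrs so that lower_get_buffer_size() skips them
       * on the second walk.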
*/ lower_direct_buffer_access(function->impl, &state); nir_builder_init(&state.builder, function->impl); nir_foreach_block(block, function->impl) apply_pipeline_layout_block(block, &state); nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance); } ralloc_free(mem_ctx); }