-rw-r--r-- | src/compiler/nir/nir_intrinsics.py                  |    9
-rw-r--r-- | src/intel/Makefile.sources                          |    1
-rw-r--r-- | src/intel/compiler/brw_fs_nir.cpp                   |  126
-rw-r--r-- | src/intel/compiler/brw_fs_surface_builder.cpp       | 1030
-rw-r--r-- | src/intel/compiler/brw_fs_surface_builder.h         |   20
-rw-r--r-- | src/intel/compiler/brw_nir.h                        |    3
-rw-r--r-- | src/intel/compiler/brw_nir_lower_image_load_store.c |  822
-rw-r--r-- | src/intel/compiler/meson.build                      |    1
-rw-r--r-- | src/intel/vulkan/anv_pipeline.c                     |    2
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_program.c             |    2
10 files changed, 896 insertions, 1120 deletions
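
At a glance, the patch deletes the fs_visitor-level image_access helpers (emit_image_load/store/atomic in brw_fs_surface_builder.cpp) and replaces them with a NIR pass, brw_nir_lower_image_load_store(), plus three Intel-specific intrinsics that brw_fs_nir.cpp handles directly. The following is a minimal sketch of how a driver would run the new pass, based only on the prototype added to brw_nir.h; the helper name, call site, and surrounding structure are illustrative assumptions and not the actual two-line hunks in anv_pipeline.c or brw_program.c.

    /* Illustrative sketch only -- not the real call sites in anv_pipeline.c
     * or brw_program.c, which are not reproduced in this excerpt. */
    #include "brw_nir.h"

    static void
    run_image_lowering(nir_shader *nir, const struct brw_compiler *compiler)
    {
       /* Rewrites image_deref_load/store/atomic into typed surface accesses
        * or, for formats with no matching typed format, into the new
        * image_deref_{load,store}_raw_intel and image_deref_load_param_intel
        * intrinsics that brw_fs_nir.cpp now emits code for. */
       NIR_PASS_V(nir, brw_nir_lower_image_load_store, compiler->devinfo);
    }
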
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 17212c4862f..170f954e375 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -312,6 +312,15 @@ intrinsic("image_deref_atomic_fadd", src_comp=[1, 4, 1, 1], dest_comp=1) intrinsic("image_deref_size", src_comp=[1], dest_comp=0, flags=[CAN_ELIMINATE, CAN_REORDER]) intrinsic("image_deref_samples", src_comp=[1], dest_comp=1, flags=[CAN_ELIMINATE, CAN_REORDER]) +# Intel-specific query for loading from the brw_image_param struct passed +# into the shader as a uniform. The variable is a deref to the image +# variable. The const index specifies which of the six parameters to load. +intrinsic("image_deref_load_param_intel", src_comp=[1], dest_comp=0, + indices=[BASE], flags=[CAN_ELIMINATE, CAN_REORDER]) +intrinsic("image_deref_load_raw_intel", src_comp=[1, 1], dest_comp=0, + flags=[CAN_ELIMINATE]) +intrinsic("image_deref_store_raw_intel", src_comp=[1, 1, 0]) + # Vulkan descriptor set intrinsics # # The Vulkan API uses a different binding model from GL. In the Vulkan diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources index 5f6cd96825b..d10c4511734 100644 --- a/src/intel/Makefile.sources +++ b/src/intel/Makefile.sources @@ -84,6 +84,7 @@ COMPILER_FILES = \ compiler/brw_nir_analyze_ubo_ranges.c \ compiler/brw_nir_attribute_workarounds.c \ compiler/brw_nir_lower_cs_intrinsics.c \ + compiler/brw_nir_lower_image_load_store.c \ compiler/brw_nir_opt_peephole_ffma.c \ compiler/brw_nir_tcs_workarounds.c \ compiler/brw_packed_float.c \ diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 67c0bee7acd..b2be91f9117 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -3865,38 +3865,33 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_image_deref_atomic_xor: case nir_intrinsic_image_deref_atomic_exchange: case nir_intrinsic_image_deref_atomic_comp_swap: { - using namespace image_access; - if (stage == MESA_SHADER_FRAGMENT && instr->intrinsic != nir_intrinsic_image_deref_load) brw_wm_prog_data(prog_data)->has_side_effects = true; /* Get the referenced image variable and type. */ nir_deref_instr *deref = nir_src_as_deref(instr->src[0]); - const nir_variable *var = nir_deref_instr_get_variable(deref); - const glsl_type *type = var->type->without_array(); - const brw_reg_type base_type = get_image_base_type(type); + const glsl_type *type = deref->type; /* Get some metadata from the image intrinsic. */ const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; - const unsigned arr_dims = type->sampler_array ? 1 : 0; - const unsigned surf_dims = type->coordinate_components() - arr_dims; - const unsigned format = var->data.image.format; + const unsigned dims = type->coordinate_components(); const unsigned dest_components = nir_intrinsic_dest_components(instr); /* Get the arguments of the image intrinsic. */ const fs_reg image = get_nir_image_deref(deref); - const fs_reg addr = retype(get_nir_src(instr->src[1]), - BRW_REGISTER_TYPE_UD); + const fs_reg coords = retype(get_nir_src(instr->src[1]), + BRW_REGISTER_TYPE_UD); fs_reg tmp; /* Emit an image load, store or atomic op. 
*/ - if (instr->intrinsic == nir_intrinsic_image_deref_load) - tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format); - else if (instr->intrinsic == nir_intrinsic_image_deref_store) { - const fs_reg src0 = retype(get_nir_src(instr->src[3]), base_type); - emit_image_store(bld, image, addr, src0, surf_dims, arr_dims, - var->data.image.write_only ? GL_NONE : format); + if (instr->intrinsic == nir_intrinsic_image_deref_load) { + tmp = emit_typed_read(bld, image, coords, dims, + instr->num_components); + } else if (instr->intrinsic == nir_intrinsic_image_deref_store) { + const fs_reg src0 = get_nir_src(instr->src[3]); + emit_typed_write(bld, image, coords, src0, dims, + instr->num_components); } else { int op; unsigned num_srcs = info->num_srcs; @@ -3938,25 +3933,61 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr } const fs_reg src0 = (num_srcs >= 4 ? - retype(get_nir_src(instr->src[3]), base_type) : - fs_reg()); + get_nir_src(instr->src[3]) : fs_reg()); const fs_reg src1 = (num_srcs >= 5 ? - retype(get_nir_src(instr->src[4]), base_type) : - fs_reg()); + get_nir_src(instr->src[4]) : fs_reg()); - tmp = emit_image_atomic(bld, image, addr, src0, src1, - surf_dims, arr_dims, dest_components, - op); + tmp = emit_typed_atomic(bld, image, coords, src0, src1, dims, 1, op); } /* Assign the result. */ for (unsigned c = 0; c < dest_components; ++c) { - bld.MOV(offset(retype(dest, base_type), bld, c), - offset(tmp, bld, c)); + bld.MOV(offset(retype(dest, tmp.type), bld, c), + offset(tmp, bld, c)); + } + break; + } + + case nir_intrinsic_image_deref_load_param_intel: { + nir_deref_instr *deref = nir_src_as_deref(instr->src[0]); + const fs_reg image = get_nir_image_deref(deref); + const fs_reg param = offset(image, bld, nir_intrinsic_base(instr) * 4); + for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) { + bld.MOV(offset(retype(dest, param.type), bld, c), + offset(param, bld, c)); + } + break; + } + + case nir_intrinsic_image_deref_load_raw_intel: { + const fs_reg image = get_nir_image_deref(nir_src_as_deref(instr->src[0])); + const fs_reg addr = retype(get_nir_src(instr->src[1]), + BRW_REGISTER_TYPE_UD); + + fs_reg tmp = emit_untyped_read(bld, image, addr, 1, + instr->num_components); + + for (unsigned c = 0; c < instr->num_components; ++c) { + bld.MOV(offset(retype(dest, tmp.type), bld, c), + offset(tmp, bld, c)); } break; } + case nir_intrinsic_image_deref_store_raw_intel: { + const fs_reg image = get_nir_image_deref(nir_src_as_deref(instr->src[0])); + const fs_reg addr = retype(get_nir_src(instr->src[1]), + BRW_REGISTER_TYPE_UD); + const fs_reg data = retype(get_nir_src(instr->src[2]), + BRW_REGISTER_TYPE_UD); + + brw_wm_prog_data(prog_data)->has_side_effects = true; + + emit_untyped_write(bld, image, addr, data, 1, + instr->num_components); + break; + } + case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier_shared: case nir_intrinsic_memory_barrier_atomic_counter: @@ -3979,51 +4010,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } - case nir_intrinsic_image_deref_size: { - /* Get the referenced image variable and type. */ - nir_deref_instr *deref = nir_src_as_deref(instr->src[0]); - const nir_variable *var = nir_deref_instr_get_variable(deref); - const glsl_type *type = var->type->without_array(); - - /* Get the size of the image. 
*/ - const fs_reg image = get_nir_image_deref(deref); - const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET); - - /* For 1DArray image types, the array index is stored in the Z component. - * Fix this by swizzling the Z component to the Y component. - */ - const bool is_1d_array_image = - type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D && - type->sampler_array; - - /* For CubeArray images, we should count the number of cubes instead - * of the number of faces. Fix it by dividing the (Z component) by 6. - */ - const bool is_cube_array_image = - type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE && - type->sampler_array; - - /* Copy all the components. */ - for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) { - if ((int)c >= type->coordinate_components()) { - bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c), - brw_imm_d(1)); - } else if (c == 1 && is_1d_array_image) { - bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c), - offset(size, bld, 2)); - } else if (c == 2 && is_cube_array_image) { - bld.emit(SHADER_OPCODE_INT_QUOTIENT, - offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c), - offset(size, bld, c), brw_imm_d(6)); - } else { - bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c), - offset(size, bld, c)); - } - } - - break; - } - case nir_intrinsic_image_deref_samples: /* The driver does not support multi-sampled images. */ bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1)); diff --git a/src/intel/compiler/brw_fs_surface_builder.cpp b/src/intel/compiler/brw_fs_surface_builder.cpp index 0b8418ca736..fed04da5e7a 100644 --- a/src/intel/compiler/brw_fs_surface_builder.cpp +++ b/src/intel/compiler/brw_fs_surface_builder.cpp @@ -206,1033 +206,3 @@ namespace brw { } } } - -namespace { - namespace image_format_info { - /* The higher compiler layers use the GL enums for image formats even if - * they come in from SPIR-V or Vulkan. We need to turn them into an ISL - * enum before we can use them. 
- */ - static enum isl_format - isl_format_for_gl_format(uint32_t gl_format) - { - switch (gl_format) { - case GL_R8: return ISL_FORMAT_R8_UNORM; - case GL_R8_SNORM: return ISL_FORMAT_R8_SNORM; - case GL_R8UI: return ISL_FORMAT_R8_UINT; - case GL_R8I: return ISL_FORMAT_R8_SINT; - case GL_RG8: return ISL_FORMAT_R8G8_UNORM; - case GL_RG8_SNORM: return ISL_FORMAT_R8G8_SNORM; - case GL_RG8UI: return ISL_FORMAT_R8G8_UINT; - case GL_RG8I: return ISL_FORMAT_R8G8_SINT; - case GL_RGBA8: return ISL_FORMAT_R8G8B8A8_UNORM; - case GL_RGBA8_SNORM: return ISL_FORMAT_R8G8B8A8_SNORM; - case GL_RGBA8UI: return ISL_FORMAT_R8G8B8A8_UINT; - case GL_RGBA8I: return ISL_FORMAT_R8G8B8A8_SINT; - case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT; - case GL_RGB10_A2: return ISL_FORMAT_R10G10B10A2_UNORM; - case GL_RGB10_A2UI: return ISL_FORMAT_R10G10B10A2_UINT; - case GL_R16: return ISL_FORMAT_R16_UNORM; - case GL_R16_SNORM: return ISL_FORMAT_R16_SNORM; - case GL_R16F: return ISL_FORMAT_R16_FLOAT; - case GL_R16UI: return ISL_FORMAT_R16_UINT; - case GL_R16I: return ISL_FORMAT_R16_SINT; - case GL_RG16: return ISL_FORMAT_R16G16_UNORM; - case GL_RG16_SNORM: return ISL_FORMAT_R16G16_SNORM; - case GL_RG16F: return ISL_FORMAT_R16G16_FLOAT; - case GL_RG16UI: return ISL_FORMAT_R16G16_UINT; - case GL_RG16I: return ISL_FORMAT_R16G16_SINT; - case GL_RGBA16: return ISL_FORMAT_R16G16B16A16_UNORM; - case GL_RGBA16_SNORM: return ISL_FORMAT_R16G16B16A16_SNORM; - case GL_RGBA16F: return ISL_FORMAT_R16G16B16A16_FLOAT; - case GL_RGBA16UI: return ISL_FORMAT_R16G16B16A16_UINT; - case GL_RGBA16I: return ISL_FORMAT_R16G16B16A16_SINT; - case GL_R32F: return ISL_FORMAT_R32_FLOAT; - case GL_R32UI: return ISL_FORMAT_R32_UINT; - case GL_R32I: return ISL_FORMAT_R32_SINT; - case GL_RG32F: return ISL_FORMAT_R32G32_FLOAT; - case GL_RG32UI: return ISL_FORMAT_R32G32_UINT; - case GL_RG32I: return ISL_FORMAT_R32G32_SINT; - case GL_RGBA32F: return ISL_FORMAT_R32G32B32A32_FLOAT; - case GL_RGBA32UI: return ISL_FORMAT_R32G32B32A32_UINT; - case GL_RGBA32I: return ISL_FORMAT_R32G32B32A32_SINT; - case GL_NONE: return ISL_FORMAT_UNSUPPORTED; - default: - assert(!"Invalid image format"); - return ISL_FORMAT_UNSUPPORTED; - } - } - - /** - * Simple 4-tuple of scalars used to pass around per-color component - * values. - */ - struct color_u { - color_u(unsigned x = 0) : r(x), g(x), b(x), a(x) - { - } - - color_u(unsigned r, unsigned g, unsigned b, unsigned a) : - r(r), g(g), b(b), a(a) - { - } - - unsigned - operator[](unsigned i) const - { - const unsigned xs[] = { r, g, b, a }; - return xs[i]; - } - - unsigned r, g, b, a; - }; - - /** - * Return the per-channel bitfield widths for a given image format. - */ - inline color_u - get_bit_widths(isl_format format) - { - const isl_format_layout *fmtl = isl_format_get_layout(format); - - return color_u(fmtl->channels.r.bits, - fmtl->channels.g.bits, - fmtl->channels.b.bits, - fmtl->channels.a.bits); - } - - /** - * Return the per-channel bitfield shifts for a given image format. - */ - inline color_u - get_bit_shifts(isl_format format) - { - const color_u widths = get_bit_widths(format); - return color_u(0, widths.r, widths.r + widths.g, - widths.r + widths.g + widths.b); - } - - /** - * Return true if all present components have the same bit width. 
- */ - inline bool - is_homogeneous(isl_format format) - { - const color_u widths = get_bit_widths(format); - return ((widths.g == 0 || widths.g == widths.r) && - (widths.b == 0 || widths.b == widths.r) && - (widths.a == 0 || widths.a == widths.r)); - } - - /** - * Return true if the format conversion boils down to a trivial copy. - */ - inline bool - is_conversion_trivial(const gen_device_info *devinfo, isl_format format) - { - return (get_bit_widths(format).r == 32 && is_homogeneous(format)) || - format == isl_lower_storage_image_format(devinfo, format); - } - - /** - * Return true if the hardware natively supports some format with - * compatible bitfield layout, but possibly different data types. - */ - inline bool - has_supported_bit_layout(const gen_device_info *devinfo, - isl_format format) - { - const color_u widths = get_bit_widths(format); - const color_u lower_widths = get_bit_widths( - isl_lower_storage_image_format(devinfo, format)); - - return (widths.r == lower_widths.r && - widths.g == lower_widths.g && - widths.b == lower_widths.b && - widths.a == lower_widths.a); - } - - /** - * Return true if we are required to spread individual components over - * several components of the format used by the hardware (RG32 and - * friends implemented as RGBA16UI). - */ - inline bool - has_split_bit_layout(const gen_device_info *devinfo, isl_format format) - { - const isl_format lower_format = - isl_lower_storage_image_format(devinfo, format); - - return (isl_format_get_num_channels(format) < - isl_format_get_num_channels(lower_format)); - } - - /** - * Return true if the hardware returns garbage in the unused high bits - * of each component. This may happen on IVB because we rely on the - * undocumented behavior that typed reads from surfaces of the - * unsupported R8 and R16 formats return useful data in their least - * significant bits. - */ - inline bool - has_undefined_high_bits(const gen_device_info *devinfo, - isl_format format) - { - const isl_format lower_format = - isl_lower_storage_image_format(devinfo, format); - - return (devinfo->gen == 7 && !devinfo->is_haswell && - (lower_format == ISL_FORMAT_R16_UINT || - lower_format == ISL_FORMAT_R8_UINT)); - } - - /** - * Return true if the format represents values as signed integers - * requiring sign extension when unpacking. - */ - inline bool - needs_sign_extension(isl_format format) - { - return isl_format_has_snorm_channel(format) || - isl_format_has_sint_channel(format); - } - } - - namespace image_validity { - /** - * Check whether the bound image is suitable for untyped access. - */ - static brw_predicate - emit_untyped_image_check(const fs_builder &bld, const fs_reg &image, - brw_predicate pred) - { - const gen_device_info *devinfo = bld.shader->devinfo; - const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET); - - if (devinfo->gen == 7 && !devinfo->is_haswell) { - /* Check whether the first stride component (i.e. the Bpp value) - * is greater than four, what on Gen7 indicates that a surface of - * type RAW has been bound for untyped access. Reading or writing - * to a surface of type other than RAW using untyped surface - * messages causes a hang on IVB and VLV. - */ - set_predicate(pred, - bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4), - BRW_CONDITIONAL_G)); - - return BRW_PREDICATE_NORMAL; - } else { - /* More recent generations handle the format mismatch - * gracefully. 
- */ - return pred; - } - } - - /** - * Check whether there is an image bound at the given index and write - * the comparison result to f0.0. Returns an appropriate predication - * mode to use on subsequent image operations. - */ - static brw_predicate - emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image) - { - const gen_device_info *devinfo = bld.shader->devinfo; - const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET); - - if (devinfo->gen == 7 && !devinfo->is_haswell) { - /* Check the first component of the size field to find out if the - * image is bound. Necessary on IVB for typed atomics because - * they don't seem to respect null surfaces and will happily - * corrupt or read random memory when no image is bound. - */ - bld.CMP(bld.null_reg_ud(), - retype(size, BRW_REGISTER_TYPE_UD), - brw_imm_d(0), BRW_CONDITIONAL_NZ); - - return BRW_PREDICATE_NORMAL; - } else { - /* More recent platforms implement compliant behavior when a null - * surface is bound. - */ - return BRW_PREDICATE_NONE; - } - } - - /** - * Check whether the provided coordinates are within the image bounds - * and write the comparison result to f0.0. Returns an appropriate - * predication mode to use on subsequent image operations. - */ - static brw_predicate - emit_bounds_check(const fs_builder &bld, const fs_reg &image, - const fs_reg &addr, unsigned dims) - { - const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET); - - for (unsigned c = 0; c < dims; ++c) - set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL, - bld.CMP(bld.null_reg_ud(), - offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c), - offset(size, bld, c), - BRW_CONDITIONAL_L)); - - return BRW_PREDICATE_NORMAL; - } - } - - namespace image_coordinates { - /** - * Return the total number of coordinates needed to address a texel of - * the surface, which may be more than the sum of \p surf_dims and \p - * arr_dims if padding is required. - */ - static unsigned - num_image_coordinates(const fs_builder &bld, - unsigned surf_dims, unsigned arr_dims, - isl_format format) - { - /* HSW in vec4 mode and our software coordinate handling for untyped - * reads want the array index to be at the Z component. - */ - const bool array_index_at_z = - format != ISL_FORMAT_UNSUPPORTED && - !isl_has_matching_typed_storage_image_format( - bld.shader->devinfo, format); - const unsigned zero_dims = - ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0); - - return surf_dims + zero_dims + arr_dims; - } - - /** - * Transform image coordinates into the form expected by the - * implementation. - */ - static fs_reg - emit_image_coordinates(const fs_builder &bld, const fs_reg &addr, - unsigned surf_dims, unsigned arr_dims, - isl_format format) - { - const unsigned dims = - num_image_coordinates(bld, surf_dims, arr_dims, format); - - if (dims > surf_dims + arr_dims) { - assert(surf_dims == 1 && arr_dims == 1 && dims == 3); - /* The array index is required to be passed in as the Z component, - * insert a zero at the Y component to shift it to the right - * position. - * - * FINISHME: Factor out this frequently recurring pattern into a - * helper function. - */ - const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) }; - const fs_reg dst = bld.vgrf(addr.type, dims); - bld.LOAD_PAYLOAD(dst, srcs, dims, 0); - return dst; - } else { - return addr; - } - } - - /** - * Calculate the offset in memory of the texel given by \p coord. 
- * - * This is meant to be used with untyped surface messages to access a - * tiled surface, what involves taking into account the tiling and - * swizzling modes of the surface manually so it will hopefully not - * happen very often. - * - * The tiling algorithm implemented here matches either the X or Y - * tiling layouts supported by the hardware depending on the tiling - * coefficients passed to the program as uniforms. See Volume 1 Part 2 - * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth - * explanation of the hardware tiling format. - */ - static fs_reg - emit_address_calculation(const fs_builder &bld, const fs_reg &image, - const fs_reg &coord, unsigned dims) - { - const gen_device_info *devinfo = bld.shader->devinfo; - const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET); - const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET); - const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET); - const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET); - const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); - const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); - const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); - const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); - const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD); - - /* Shift the coordinates by the fixed surface offset. It may be - * non-zero if the image is a single slice of a higher-dimensional - * surface, or if a non-zero mipmap level of the surface is bound to - * the pipeline. The offset needs to be applied here rather than at - * surface state set-up time because the desired slice-level may - * start mid-tile, so simply shifting the surface base address - * wouldn't give a well-formed tiled surface in the general case. - */ - for (unsigned c = 0; c < 2; ++c) - bld.ADD(offset(addr, bld, c), offset(off, bld, c), - (c < dims ? - offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) : - fs_reg(brw_imm_d(0)))); - - /* The layout of 3-D textures in memory is sort-of like a tiling - * format. At each miplevel, the slices are arranged in rows of - * 2^level slices per row. The slice row is stored in tmp.y and - * the slice within the row is stored in tmp.x. - * - * The layout of 2-D array textures and cubemaps is much simpler: - * Depending on whether the ARYSPC_LOD0 layout is in use it will be - * stored in memory as an array of slices, each one being a 2-D - * arrangement of miplevels, or as a 2D arrangement of miplevels, - * each one being an array of slices. In either case the separation - * between slices of the same LOD is equal to the qpitch value - * provided as stride.w. - * - * This code can be made to handle either 2D arrays and 3D textures - * by passing in the miplevel as tile.z for 3-D textures and 0 in - * tile.z for 2-D array textures. - * - * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface - * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion - * of the hardware 3D texture and 2D array layouts. - */ - if (dims > 2) { - /* Decompose z into a major (tmp.y) and a minor (tmp.x) - * index. - */ - bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0), - offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2)); - bld.SHR(offset(tmp, bld, 1), - offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2), - offset(tile, bld, 2)); - - /* Take into account the horizontal (tmp.x) and vertical (tmp.y) - * slice offset. 
- */ - for (unsigned c = 0; c < 2; ++c) { - bld.MUL(offset(tmp, bld, c), - offset(stride, bld, 2 + c), offset(tmp, bld, c)); - bld.ADD(offset(addr, bld, c), - offset(addr, bld, c), offset(tmp, bld, c)); - } - } - - if (dims > 1) { - /* Calculate the major/minor x and y indices. In order to - * accommodate both X and Y tiling, the Y-major tiling format is - * treated as being a bunch of narrow X-tiles placed next to each - * other. This means that the tile width for Y-tiling is actually - * the width of one sub-column of the Y-major tile where each 4K - * tile has 8 512B sub-columns. - * - * The major Y value is the row of tiles in which the pixel lives. - * The major X value is the tile sub-column in which the pixel - * lives; for X tiling, this is the same as the tile column, for Y - * tiling, each tile has 8 sub-columns. The minor X and Y indices - * are the position within the sub-column. - */ - for (unsigned c = 0; c < 2; ++c) { - /* Calculate the minor x and y indices. */ - bld.BFE(offset(minor, bld, c), offset(tile, bld, c), - brw_imm_d(0), offset(addr, bld, c)); - - /* Calculate the major x and y indices. */ - bld.SHR(offset(major, bld, c), - offset(addr, bld, c), offset(tile, bld, c)); - } - - /* Calculate the texel index from the start of the tile row and - * the vertical coordinate of the row. - * Equivalent to: - * tmp.x = (major.x << tile.y << tile.x) + - * (minor.y << tile.x) + minor.x - * tmp.y = major.y << tile.y - */ - bld.SHL(tmp, major, offset(tile, bld, 1)); - bld.ADD(tmp, tmp, offset(minor, bld, 1)); - bld.SHL(tmp, tmp, offset(tile, bld, 0)); - bld.ADD(tmp, tmp, minor); - bld.SHL(offset(tmp, bld, 1), - offset(major, bld, 1), offset(tile, bld, 1)); - - /* Add it to the start of the tile row. */ - bld.MUL(offset(tmp, bld, 1), - offset(tmp, bld, 1), offset(stride, bld, 1)); - bld.ADD(tmp, tmp, offset(tmp, bld, 1)); - - /* Multiply by the Bpp value. */ - bld.MUL(dst, tmp, stride); - - if (devinfo->gen < 8 && !devinfo->is_baytrail) { - /* Take into account the two dynamically specified shifts. - * Both need are used to implement swizzling of X-tiled - * surfaces. For Y-tiled surfaces only one bit needs to be - * XOR-ed with bit 6 of the memory address, so a swz value of - * 0xff (actually interpreted as 31 by the hardware) will be - * provided to cause the relevant bit of tmp.y to be zero and - * turn the first XOR into the identity. For linear surfaces - * or platforms lacking address swizzling both shifts will be - * 0xff causing the relevant bits of both tmp.x and .y to be - * zero, what effectively disables swizzling. - */ - for (unsigned c = 0; c < 2; ++c) - bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c)); - - /* XOR tmp.x and tmp.y with bit 6 of the memory address. */ - bld.XOR(tmp, tmp, offset(tmp, bld, 1)); - bld.AND(tmp, tmp, brw_imm_d(1 << 6)); - bld.XOR(dst, dst, tmp); - } - - } else { - /* Multiply by the Bpp/stride value. Note that the addr.y may be - * non-zero even if the image is one-dimensional because a - * vertical offset may have been applied above to select a - * non-zero slice or level of a higher-dimensional texture. - */ - bld.MUL(offset(addr, bld, 1), - offset(addr, bld, 1), offset(stride, bld, 1)); - bld.ADD(addr, addr, offset(addr, bld, 1)); - bld.MUL(dst, addr, stride); - } - - return dst; - } - } - - namespace image_format_conversion { - using image_format_info::color_u; - - namespace { - /** - * Maximum representable value in an unsigned integer with the given - * number of bits. 
- */ - inline unsigned - scale(unsigned n) - { - return (1 << n) - 1; - } - } - - /** - * Pack the vector \p src in a bitfield given the per-component bit - * shifts and widths. Note that bitfield components are not allowed to - * cross 32-bit boundaries. - */ - static fs_reg - emit_pack(const fs_builder &bld, const fs_reg &src, - const color_u &shifts, const color_u &widths) - { - const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); - bool seen[4] = {}; - - for (unsigned c = 0; c < 4; ++c) { - if (widths[c]) { - const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); - - /* Shift each component left to the correct bitfield position. */ - bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32)); - - /* Add everything up. */ - if (seen[shifts[c] / 32]) { - bld.OR(offset(dst, bld, shifts[c] / 32), - offset(dst, bld, shifts[c] / 32), tmp); - } else { - bld.MOV(offset(dst, bld, shifts[c] / 32), tmp); - seen[shifts[c] / 32] = true; - } - } - } - - return dst; - } - - /** - * Unpack a vector from the bitfield \p src given the per-component bit - * shifts and widths. Note that bitfield components are not allowed to - * cross 32-bit boundaries. - */ - static fs_reg - emit_unpack(const fs_builder &bld, const fs_reg &src, - const color_u &shifts, const color_u &widths) - { - const fs_reg dst = bld.vgrf(src.type, 4); - - for (unsigned c = 0; c < 4; ++c) { - if (widths[c]) { - /* Shift left to discard the most significant bits. */ - bld.SHL(offset(dst, bld, c), - offset(src, bld, shifts[c] / 32), - brw_imm_ud(32 - shifts[c] % 32 - widths[c])); - - /* Shift back to the least significant bits using an arithmetic - * shift to get sign extension on signed types. - */ - bld.ASR(offset(dst, bld, c), - offset(dst, bld, c), brw_imm_ud(32 - widths[c])); - } - } - - return dst; - } - - /** - * Convert an integer vector into another integer vector of the - * specified bit widths, properly handling overflow. - */ - static fs_reg - emit_convert_to_integer(const fs_builder &bld, const fs_reg &src, - const color_u &widths, bool is_signed) - { - const unsigned s = (is_signed ? 1 : 0); - const fs_reg dst = bld.vgrf( - is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4); - assert(src.type == dst.type); - - for (unsigned c = 0; c < 4; ++c) { - if (widths[c]) { - /* Clamp to the maximum value. */ - bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c), - brw_imm_d((int)scale(widths[c] - s)), - BRW_CONDITIONAL_L); - - /* Clamp to the minimum value. */ - if (is_signed) - bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c), - brw_imm_d(-(int)scale(widths[c] - s) - 1), - BRW_CONDITIONAL_GE); - - /* Mask off all but the bits we actually want. Otherwise, if - * we pass a negative number into the hardware when it's - * expecting something like UINT8, it will happily clamp it to - * +255 for us. - */ - if (is_signed && widths[c] < 32) - bld.AND(offset(dst, bld, c), offset(dst, bld, c), - brw_imm_d(scale(widths[c]))); - } - } - - return dst; - } - - /** - * Convert a normalized fixed-point vector of the specified signedness - * and bit widths into a floating point vector. - */ - static fs_reg - emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src, - const color_u &widths, bool is_signed) - { - const unsigned s = (is_signed ? 1 : 0); - const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4); - - for (unsigned c = 0; c < 4; ++c) { - if (widths[c]) { - /* Convert to float. */ - bld.MOV(offset(dst, bld, c), offset(src, bld, c)); - - /* Divide by the normalization constants. 
*/ - bld.MUL(offset(dst, bld, c), offset(dst, bld, c), - brw_imm_f(1.0f / scale(widths[c] - s))); - - /* Clamp to the minimum value. */ - if (is_signed) - bld.emit_minmax(offset(dst, bld, c), - offset(dst, bld, c), brw_imm_f(-1.0f), - BRW_CONDITIONAL_GE); - } - } - return dst; - } - - /** - * Convert a floating-point vector into a normalized fixed-point vector - * of the specified signedness and bit widths. - */ - static fs_reg - emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src, - const color_u &widths, bool is_signed) - { - const unsigned s = (is_signed ? 1 : 0); - const fs_reg dst = bld.vgrf( - is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4); - const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F); - - for (unsigned c = 0; c < 4; ++c) { - if (widths[c]) { - /* Clamp the normalized floating-point argument. */ - if (is_signed) { - bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c), - brw_imm_f(-1.0f), BRW_CONDITIONAL_GE); - - bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c), - brw_imm_f(1.0f), BRW_CONDITIONAL_L); - } else { - set_saturate(true, bld.MOV(offset(fdst, bld, c), - offset(src, bld, c))); - } - - /* Multiply by the normalization constants. */ - bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c), - brw_imm_f((float)scale(widths[c] - s))); - - /* Convert to integer. */ - bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c)); - bld.MOV(offset(dst, bld, c), offset(fdst, bld, c)); - - /* Mask off all but the bits we actually want. Otherwise, if - * we pass a negative number into the hardware when it's - * expecting something like UINT8, it will happily clamp it to - * +255 for us. - */ - if (is_signed && widths[c] < 32) - bld.AND(offset(dst, bld, c), offset(dst, bld, c), - brw_imm_d(scale(widths[c]))); - } - } - - return dst; - } - - /** - * Convert a floating point vector of the specified bit widths into a - * 32-bit floating point vector. - */ - static fs_reg - emit_convert_from_float(const fs_builder &bld, const fs_reg &src, - const color_u &widths) - { - const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); - const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F); - - for (unsigned c = 0; c < 4; ++c) { - if (widths[c]) { - bld.MOV(offset(dst, bld, c), offset(src, bld, c)); - - /* Extend 10-bit and 11-bit floating point numbers to 15 bits. - * This works because they have a 5-bit exponent just like the - * 16-bit floating point format, and they have no sign bit. - */ - if (widths[c] < 16) - bld.SHL(offset(dst, bld, c), - offset(dst, bld, c), brw_imm_ud(15 - widths[c])); - - /* Convert to 32-bit floating point. */ - bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c)); - } - } - - return fdst; - } - - /** - * Convert a vector into a floating point vector of the specified bit - * widths. - */ - static fs_reg - emit_convert_to_float(const fs_builder &bld, const fs_reg &src, - const color_u &widths) - { - const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); - const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F); - - for (unsigned c = 0; c < 4; ++c) { - if (widths[c]) { - bld.MOV(offset(fdst, bld, c), offset(src, bld, c)); - - /* Clamp to the minimum value. */ - if (widths[c] < 16) - bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c), - brw_imm_f(0.0f), BRW_CONDITIONAL_GE); - - /* Convert to 16-bit floating-point. */ - bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c)); - - /* Discard the least significant bits to get floating point - * numbers of the requested width. 
This works because the - * 10-bit and 11-bit floating point formats have a 5-bit - * exponent just like the 16-bit format, and they have no sign - * bit. - */ - if (widths[c] < 16) - bld.SHR(offset(dst, bld, c), offset(dst, bld, c), - brw_imm_ud(15 - widths[c])); - } - } - - return dst; - } - - /** - * Fill missing components of a vector with 0, 0, 0, 1. - */ - static fs_reg - emit_pad(const fs_builder &bld, const fs_reg &src, - const color_u &widths) - { - const fs_reg dst = bld.vgrf(src.type, 4); - const unsigned pad[] = { 0, 0, 0, 1 }; - - for (unsigned c = 0; c < 4; ++c) - bld.MOV(offset(dst, bld, c), - widths[c] ? offset(src, bld, c) - : fs_reg(brw_imm_ud(pad[c]))); - - return dst; - } - } -} - -namespace brw { - namespace image_access { - /** - * Load a vector from a surface of the given format and dimensionality - * at the given coordinates. \p surf_dims and \p arr_dims give the - * number of non-array and array coordinates of the image respectively. - */ - fs_reg - emit_image_load(const fs_builder &bld, - const fs_reg &image, const fs_reg &addr, - unsigned surf_dims, unsigned arr_dims, - unsigned gl_format) - { - using namespace image_format_info; - using namespace image_format_conversion; - using namespace image_validity; - using namespace image_coordinates; - using namespace surface_access; - const gen_device_info *devinfo = bld.shader->devinfo; - const isl_format format = isl_format_for_gl_format(gl_format); - const isl_format lower_format = - isl_lower_storage_image_format(devinfo, format); - fs_reg tmp; - - /* Transform the image coordinates into actual surface coordinates. */ - const fs_reg saddr = - emit_image_coordinates(bld, addr, surf_dims, arr_dims, format); - const unsigned dims = - num_image_coordinates(bld, surf_dims, arr_dims, format); - - if (isl_has_matching_typed_storage_image_format(devinfo, format)) { - /* Hopefully we get here most of the time... */ - tmp = emit_typed_read(bld, image, saddr, dims, - isl_format_get_num_channels(lower_format)); - } else { - /* Untyped surface reads return 32 bits of the surface per - * component, without any sort of unpacking or type conversion, - */ - const unsigned size = isl_format_get_layout(format)->bpb / 32; - /* they don't properly handle out of bounds access, so we have to - * check manually if the coordinates are valid and predicate the - * surface read on the result, - */ - const brw_predicate pred = - emit_untyped_image_check(bld, image, - emit_bounds_check(bld, image, - saddr, dims)); - - /* and they don't know about surface coordinates, we need to - * convert them to a raw memory offset. - */ - const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims); - - tmp = emit_untyped_read(bld, image, laddr, 1, size, pred); - - /* An out of bounds surface access should give zero as result. */ - for (unsigned c = 0; c < size; ++c) - set_predicate(pred, bld.SEL(offset(tmp, bld, c), - offset(tmp, bld, c), brw_imm_d(0))); - } - - /* Set the register type to D instead of UD if the data type is - * represented as a signed integer in memory so that sign extension - * is handled correctly by unpack. - */ - if (needs_sign_extension(format)) - tmp = retype(tmp, BRW_REGISTER_TYPE_D); - - if (!has_supported_bit_layout(devinfo, format)) { - /* Unpack individual vector components from the bitfield if the - * hardware is unable to do it for us. 
- */ - if (has_split_bit_layout(devinfo, format)) - tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format), - get_bit_widths(lower_format)); - else - tmp = emit_unpack(bld, tmp, get_bit_shifts(format), - get_bit_widths(format)); - - } else if ((needs_sign_extension(format) && - !is_conversion_trivial(devinfo, format)) || - has_undefined_high_bits(devinfo, format)) { - /* Perform a trivial unpack even though the bit layout matches in - * order to get the most significant bits of each component - * initialized properly. - */ - tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96), - get_bit_widths(format)); - } - - if (!isl_format_has_int_channel(format)) { - if (is_conversion_trivial(devinfo, format)) { - /* Just need to cast the vector to the target type. */ - tmp = retype(tmp, BRW_REGISTER_TYPE_F); - } else { - /* Do the right sort of type conversion to float. */ - if (isl_format_has_float_channel(format)) - tmp = emit_convert_from_float( - bld, tmp, get_bit_widths(format)); - else - tmp = emit_convert_from_scaled( - bld, tmp, get_bit_widths(format), - isl_format_has_snorm_channel(format)); - } - } - - /* Initialize missing components of the result. */ - return emit_pad(bld, tmp, get_bit_widths(format)); - } - - /** - * Store a vector in a surface of the given format and dimensionality at - * the given coordinates. \p surf_dims and \p arr_dims give the number - * of non-array and array coordinates of the image respectively. - */ - void - emit_image_store(const fs_builder &bld, const fs_reg &image, - const fs_reg &addr, const fs_reg &src, - unsigned surf_dims, unsigned arr_dims, - unsigned gl_format) - { - using namespace image_format_info; - using namespace image_format_conversion; - using namespace image_validity; - using namespace image_coordinates; - using namespace surface_access; - const isl_format format = isl_format_for_gl_format(gl_format); - const gen_device_info *devinfo = bld.shader->devinfo; - - /* Transform the image coordinates into actual surface coordinates. */ - const fs_reg saddr = - emit_image_coordinates(bld, addr, surf_dims, arr_dims, format); - const unsigned dims = - num_image_coordinates(bld, surf_dims, arr_dims, format); - - if (gl_format == GL_NONE) { - /* We don't know what the format is, but that's fine because it - * implies write-only access, and typed surface writes are always - * able to take care of type conversion and packing for us. - */ - emit_typed_write(bld, image, saddr, src, dims, 4); - - } else { - const isl_format lower_format = - isl_lower_storage_image_format(devinfo, format); - fs_reg tmp = src; - - if (!is_conversion_trivial(devinfo, format)) { - /* Do the right sort of type conversion. */ - if (isl_format_has_float_channel(format)) - tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format)); - - else if (isl_format_has_int_channel(format)) - tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format), - isl_format_has_sint_channel(format)); - - else - tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format), - isl_format_has_snorm_channel(format)); - } - - /* We're down to bit manipulation at this point. */ - tmp = retype(tmp, BRW_REGISTER_TYPE_UD); - - if (!has_supported_bit_layout(devinfo, format)) { - /* Pack the vector components into a bitfield if the hardware - * is unable to do it for us. 
- */ - if (has_split_bit_layout(devinfo, format)) - tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format), - get_bit_widths(lower_format)); - - else - tmp = emit_pack(bld, tmp, get_bit_shifts(format), - get_bit_widths(format)); - } - - if (isl_has_matching_typed_storage_image_format(devinfo, format)) { - /* Hopefully we get here most of the time... */ - emit_typed_write(bld, image, saddr, tmp, dims, - isl_format_get_num_channels(lower_format)); - - } else { - /* Untyped surface writes store 32 bits of the surface per - * component, without any sort of packing or type conversion, - */ - const unsigned size = isl_format_get_layout(format)->bpb / 32; - - /* they don't properly handle out of bounds access, so we have - * to check manually if the coordinates are valid and predicate - * the surface write on the result, - */ - const brw_predicate pred = - emit_untyped_image_check(bld, image, - emit_bounds_check(bld, image, - saddr, dims)); - - /* and, phew, they don't know about surface coordinates, we - * need to convert them to a raw memory offset. - */ - const fs_reg laddr = emit_address_calculation( - bld, image, saddr, dims); - - emit_untyped_write(bld, image, laddr, tmp, 1, size, pred); - } - } - } - - /** - * Perform an atomic read-modify-write operation in a surface of the - * given dimensionality at the given coordinates. \p surf_dims and \p - * arr_dims give the number of non-array and array coordinates of the - * image respectively. Main building block of the imageAtomic GLSL - * built-ins. - */ - fs_reg - emit_image_atomic(const fs_builder &bld, - const fs_reg &image, const fs_reg &addr, - const fs_reg &src0, const fs_reg &src1, - unsigned surf_dims, unsigned arr_dims, - unsigned rsize, unsigned op) - { - using namespace image_validity; - using namespace image_coordinates; - using namespace surface_access; - /* Avoid performing an atomic operation on an unbound surface. */ - const brw_predicate pred = emit_typed_atomic_check(bld, image); - - /* Transform the image coordinates into actual surface coordinates. */ - const fs_reg saddr = - emit_image_coordinates(bld, addr, surf_dims, arr_dims, - ISL_FORMAT_R32_UINT); - const unsigned dims = - num_image_coordinates(bld, surf_dims, arr_dims, - ISL_FORMAT_R32_UINT); - - /* Thankfully we can do without untyped atomics here. */ - const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1, - dims, rsize, op, pred); - - /* An unbound surface access should give zero as result. 
*/ - if (rsize && pred) - set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0))); - - return retype(tmp, src0.type); - } - } -} diff --git a/src/intel/compiler/brw_fs_surface_builder.h b/src/intel/compiler/brw_fs_surface_builder.h index 6952df64286..8df5d25f4fa 100644 --- a/src/intel/compiler/brw_fs_surface_builder.h +++ b/src/intel/compiler/brw_fs_surface_builder.h @@ -85,25 +85,5 @@ namespace brw { unsigned bit_size, brw_predicate pred = BRW_PREDICATE_NONE); } - - namespace image_access { - fs_reg - emit_image_load(const fs_builder &bld, - const fs_reg &image, const fs_reg &addr, - unsigned surf_dims, unsigned arr_dims, - unsigned gl_format); - - void - emit_image_store(const fs_builder &bld, const fs_reg &image, - const fs_reg &addr, const fs_reg &src, - unsigned surf_dims, unsigned arr_dims, - unsigned gl_format); - fs_reg - emit_image_atomic(const fs_builder &bld, - const fs_reg &image, const fs_reg &addr, - const fs_reg &src0, const fs_reg &src1, - unsigned surf_dims, unsigned arr_dims, - unsigned rsize, unsigned op); - } } #endif diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index 5c75ef2324a..72a6ee8884a 100644 --- a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -114,6 +114,9 @@ void brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue, GLenum tes_primitive_mode); void brw_nir_lower_fs_outputs(nir_shader *nir); +bool brw_nir_lower_image_load_store(nir_shader *nir, + const struct gen_device_info *devinfo); + nir_shader *brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, bool is_scalar); diff --git a/src/intel/compiler/brw_nir_lower_image_load_store.c b/src/intel/compiler/brw_nir_lower_image_load_store.c new file mode 100644 index 00000000000..b931a6d3512 --- /dev/null +++ b/src/intel/compiler/brw_nir_lower_image_load_store.c @@ -0,0 +1,822 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "isl/isl.h" + +#include "brw_nir.h" +#include "compiler/nir/nir_builder.h" +#include "compiler/nir/nir_format_convert.h" + +/* The higher compiler layers use the GL enums for image formats even if + * they come in from SPIR-V or Vulkan. We need to turn them into an ISL + * enum before we can use them. 
+ */ +static enum isl_format +isl_format_for_gl_format(uint32_t gl_format) +{ + switch (gl_format) { + case GL_R8: return ISL_FORMAT_R8_UNORM; + case GL_R8_SNORM: return ISL_FORMAT_R8_SNORM; + case GL_R8UI: return ISL_FORMAT_R8_UINT; + case GL_R8I: return ISL_FORMAT_R8_SINT; + case GL_RG8: return ISL_FORMAT_R8G8_UNORM; + case GL_RG8_SNORM: return ISL_FORMAT_R8G8_SNORM; + case GL_RG8UI: return ISL_FORMAT_R8G8_UINT; + case GL_RG8I: return ISL_FORMAT_R8G8_SINT; + case GL_RGBA8: return ISL_FORMAT_R8G8B8A8_UNORM; + case GL_RGBA8_SNORM: return ISL_FORMAT_R8G8B8A8_SNORM; + case GL_RGBA8UI: return ISL_FORMAT_R8G8B8A8_UINT; + case GL_RGBA8I: return ISL_FORMAT_R8G8B8A8_SINT; + case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT; + case GL_RGB10_A2: return ISL_FORMAT_R10G10B10A2_UNORM; + case GL_RGB10_A2UI: return ISL_FORMAT_R10G10B10A2_UINT; + case GL_R16: return ISL_FORMAT_R16_UNORM; + case GL_R16_SNORM: return ISL_FORMAT_R16_SNORM; + case GL_R16F: return ISL_FORMAT_R16_FLOAT; + case GL_R16UI: return ISL_FORMAT_R16_UINT; + case GL_R16I: return ISL_FORMAT_R16_SINT; + case GL_RG16: return ISL_FORMAT_R16G16_UNORM; + case GL_RG16_SNORM: return ISL_FORMAT_R16G16_SNORM; + case GL_RG16F: return ISL_FORMAT_R16G16_FLOAT; + case GL_RG16UI: return ISL_FORMAT_R16G16_UINT; + case GL_RG16I: return ISL_FORMAT_R16G16_SINT; + case GL_RGBA16: return ISL_FORMAT_R16G16B16A16_UNORM; + case GL_RGBA16_SNORM: return ISL_FORMAT_R16G16B16A16_SNORM; + case GL_RGBA16F: return ISL_FORMAT_R16G16B16A16_FLOAT; + case GL_RGBA16UI: return ISL_FORMAT_R16G16B16A16_UINT; + case GL_RGBA16I: return ISL_FORMAT_R16G16B16A16_SINT; + case GL_R32F: return ISL_FORMAT_R32_FLOAT; + case GL_R32UI: return ISL_FORMAT_R32_UINT; + case GL_R32I: return ISL_FORMAT_R32_SINT; + case GL_RG32F: return ISL_FORMAT_R32G32_FLOAT; + case GL_RG32UI: return ISL_FORMAT_R32G32_UINT; + case GL_RG32I: return ISL_FORMAT_R32G32_SINT; + case GL_RGBA32F: return ISL_FORMAT_R32G32B32A32_FLOAT; + case GL_RGBA32UI: return ISL_FORMAT_R32G32B32A32_UINT; + case GL_RGBA32I: return ISL_FORMAT_R32G32B32A32_SINT; + case GL_NONE: return ISL_FORMAT_UNSUPPORTED; + default: + assert(!"Invalid image format"); + return ISL_FORMAT_UNSUPPORTED; + } +} + +static nir_ssa_def * +_load_image_param(nir_builder *b, nir_deref_instr *deref, unsigned offset) +{ + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_image_deref_load_param_intel); + load->src[0] = nir_src_for_ssa(&deref->dest.ssa); + nir_intrinsic_set_base(load, offset / 4); + + switch (offset) { + case BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET: + load->num_components = 1; + break; + case BRW_IMAGE_PARAM_OFFSET_OFFSET: + case BRW_IMAGE_PARAM_SWIZZLING_OFFSET: + load->num_components = 2; + break; + case BRW_IMAGE_PARAM_TILING_OFFSET: + case BRW_IMAGE_PARAM_SIZE_OFFSET: + load->num_components = 3; + break; + case BRW_IMAGE_PARAM_STRIDE_OFFSET: + load->num_components = 4; + break; + default: + unreachable("Invalid param offset"); + } + nir_ssa_dest_init(&load->instr, &load->dest, + load->num_components, 32, NULL); + + nir_builder_instr_insert(b, &load->instr); + return &load->dest.ssa; +} + +#define load_image_param(b, d, o) \ + _load_image_param(b, d, BRW_IMAGE_PARAM_##o##_OFFSET) + +static nir_ssa_def * +sanitize_image_coord(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *coord) +{ + if (glsl_get_sampler_dim(deref->type) == GLSL_SAMPLER_DIM_1D && + glsl_sampler_type_is_array(deref->type)) { + /* It's easier if 1D arrays are treated like 2D arrays */ + return nir_vec3(b, nir_channel(b, coord, 0), + 
nir_imm_int(b, 0), + nir_channel(b, coord, 1)); + } else { + unsigned dims = glsl_get_sampler_coordinate_components(deref->type); + return nir_channels(b, coord, (1 << dims) - 1); + } +} + +static nir_ssa_def * +image_coord_is_in_bounds(nir_builder *b, nir_deref_instr *deref, + nir_ssa_def *coord) +{ + coord = sanitize_image_coord(b, deref, coord); + nir_ssa_def *size = load_image_param(b, deref, SIZE); + + nir_ssa_def *cmp = nir_ilt(b, coord, size); + nir_ssa_def *in_bounds = nir_imm_int(b, NIR_TRUE); + for (unsigned i = 0; i < coord->num_components; i++) + in_bounds = nir_iand(b, in_bounds, nir_channel(b, cmp, i)); + + return in_bounds; +} + +/** Calculate the offset in memory of the texel given by \p coord. + * + * This is meant to be used with untyped surface messages to access a tiled + * surface, what involves taking into account the tiling and swizzling modes + * of the surface manually so it will hopefully not happen very often. + * + * The tiling algorithm implemented here matches either the X or Y tiling + * layouts supported by the hardware depending on the tiling coefficients + * passed to the program as uniforms. See Volume 1 Part 2 Section 4.5 + * "Address Tiling Function" of the IVB PRM for an in-depth explanation of + * the hardware tiling format. + */ +static nir_ssa_def * +image_address(nir_builder *b, const struct gen_device_info *devinfo, + nir_deref_instr *deref, nir_ssa_def *coord) +{ + coord = sanitize_image_coord(b, deref, coord); + + nir_ssa_def *offset = load_image_param(b, deref, OFFSET); + nir_ssa_def *tiling = load_image_param(b, deref, TILING); + nir_ssa_def *stride = load_image_param(b, deref, STRIDE); + + /* Shift the coordinates by the fixed surface offset. It may be non-zero + * if the image is a single slice of a higher-dimensional surface, or if a + * non-zero mipmap level of the surface is bound to the pipeline. The + * offset needs to be applied here rather than at surface state set-up time + * because the desired slice-level may start mid-tile, so simply shifting + * the surface base address wouldn't give a well-formed tiled surface in + * the general case. + */ + nir_ssa_def *xypos = (coord->num_components == 1) ? + nir_vec2(b, coord, nir_imm_int(b, 0)) : + nir_channels(b, coord, 0x3); + xypos = nir_iadd(b, xypos, offset); + + /* The layout of 3-D textures in memory is sort-of like a tiling + * format. At each miplevel, the slices are arranged in rows of + * 2^level slices per row. The slice row is stored in tmp.y and + * the slice within the row is stored in tmp.x. + * + * The layout of 2-D array textures and cubemaps is much simpler: + * Depending on whether the ARYSPC_LOD0 layout is in use it will be + * stored in memory as an array of slices, each one being a 2-D + * arrangement of miplevels, or as a 2D arrangement of miplevels, + * each one being an array of slices. In either case the separation + * between slices of the same LOD is equal to the qpitch value + * provided as stride.w. + * + * This code can be made to handle either 2D arrays and 3D textures + * by passing in the miplevel as tile.z for 3-D textures and 0 in + * tile.z for 2-D array textures. + * + * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface + * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion + * of the hardware 3D texture and 2D array layouts. + */ + if (coord->num_components > 2) { + /* Decompose z into a major (tmp.y) and a minor (tmp.x) + * index. 
+ */ + nir_ssa_def *z = nir_channel(b, coord, 2); + nir_ssa_def *z_x = nir_ubfe(b, z, nir_imm_int(b, 0), + nir_channel(b, tiling, 2)); + nir_ssa_def *z_y = nir_ushr(b, z, nir_channel(b, tiling, 2)); + + /* Take into account the horizontal (tmp.x) and vertical (tmp.y) + * slice offset. + */ + xypos = nir_iadd(b, xypos, nir_imul(b, nir_vec2(b, z_x, z_y), + nir_channels(b, stride, 0xc))); + } + + nir_ssa_def *addr; + if (coord->num_components > 1) { + /* Calculate the major/minor x and y indices. In order to + * accommodate both X and Y tiling, the Y-major tiling format is + * treated as being a bunch of narrow X-tiles placed next to each + * other. This means that the tile width for Y-tiling is actually + * the width of one sub-column of the Y-major tile where each 4K + * tile has 8 512B sub-columns. + * + * The major Y value is the row of tiles in which the pixel lives. + * The major X value is the tile sub-column in which the pixel + * lives; for X tiling, this is the same as the tile column, for Y + * tiling, each tile has 8 sub-columns. The minor X and Y indices + * are the position within the sub-column. + */ + + /* Calculate the minor x and y indices. */ + nir_ssa_def *minor = nir_ubfe(b, xypos, nir_imm_int(b, 0), + nir_channels(b, tiling, 0x3)); + nir_ssa_def *major = nir_ushr(b, xypos, nir_channels(b, tiling, 0x3)); + + /* Calculate the texel index from the start of the tile row and the + * vertical coordinate of the row. + * Equivalent to: + * tmp.x = (major.x << tile.y << tile.x) + + * (minor.y << tile.x) + minor.x + * tmp.y = major.y << tile.y + */ + nir_ssa_def *idx_x, *idx_y; + idx_x = nir_ishl(b, nir_channel(b, major, 0), nir_channel(b, tiling, 1)); + idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 1)); + idx_x = nir_ishl(b, idx_x, nir_channel(b, tiling, 0)); + idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 0)); + idx_y = nir_ishl(b, nir_channel(b, major, 1), nir_channel(b, tiling, 1)); + + /* Add it to the start of the tile row. */ + nir_ssa_def *idx; + idx = nir_imul(b, idx_y, nir_channel(b, stride, 1)); + idx = nir_iadd(b, idx, idx_x); + + /* Multiply by the Bpp value. */ + addr = nir_imul(b, idx, nir_channel(b, stride, 0)); + + if (devinfo->gen < 8 && !devinfo->is_baytrail) { + /* Take into account the two dynamically specified shifts. Both are + * used to implement swizzling of X-tiled surfaces. For Y-tiled + * surfaces only one bit needs to be XOR-ed with bit 6 of the memory + * address, so a swz value of 0xff (actually interpreted as 31 by the + * hardware) will be provided to cause the relevant bit of tmp.y to + * be zero and turn the first XOR into the identity. For linear + * surfaces or platforms lacking address swizzling both shifts will + * be 0xff causing the relevant bits of both tmp.x and .y to be zero, + * what effectively disables swizzling. + */ + nir_ssa_def *swizzle = load_image_param(b, deref, SWIZZLING); + nir_ssa_def *shift0 = nir_ushr(b, addr, nir_channel(b, swizzle, 0)); + nir_ssa_def *shift1 = nir_ushr(b, addr, nir_channel(b, swizzle, 1)); + + /* XOR tmp.x and tmp.y with bit 6 of the memory address. */ + nir_ssa_def *bit = nir_iand(b, nir_ixor(b, shift0, shift1), + nir_imm_int(b, 1 << 6)); + addr = nir_ixor(b, addr, bit); + } + } else { + /* Multiply by the Bpp/stride value. Note that the addr.y may be + * non-zero even if the image is one-dimensional because a vertical + * offset may have been applied above to select a non-zero slice or + * level of a higher-dimensional texture. 
+ */ + nir_ssa_def *idx; + idx = nir_imul(b, nir_channel(b, xypos, 1), nir_channel(b, stride, 1)); + idx = nir_iadd(b, nir_channel(b, xypos, 0), idx); + addr = nir_imul(b, idx, nir_channel(b, stride, 0)); + } + + return addr; +} + +struct format_info { + const struct isl_format_layout *fmtl; + unsigned chans; + unsigned bits[4]; +}; + +static struct format_info +get_format_info(enum isl_format fmt) +{ + const struct isl_format_layout *fmtl = isl_format_get_layout(fmt); + + return (struct format_info) { + .fmtl = fmtl, + .chans = isl_format_get_num_channels(fmt), + .bits = { + fmtl->channels.r.bits, + fmtl->channels.g.bits, + fmtl->channels.b.bits, + fmtl->channels.a.bits + }, + }; +} + +static nir_ssa_def * +nir_zero_vec(nir_builder *b, unsigned num_components) +{ + nir_const_value v; + memset(&v, 0, sizeof(v)); + + return nir_build_imm(b, num_components, 32, v); +} + +static nir_ssa_def * +convert_color_for_load(nir_builder *b, const struct gen_device_info *devinfo, + nir_ssa_def *color, + enum isl_format image_fmt, enum isl_format lower_fmt, + unsigned dest_components) +{ + if (image_fmt == lower_fmt) + goto expand_vec; + + if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) { + assert(lower_fmt == ISL_FORMAT_R32_UINT); + color = nir_format_unpack_11f11f10f(b, color); + goto expand_vec; + } + + struct format_info image = get_format_info(image_fmt); + struct format_info lower = get_format_info(lower_fmt); + + const bool needs_sign_extension = + isl_format_has_snorm_channel(image_fmt) || + isl_format_has_sint_channel(image_fmt); + + /* We only check the red channel to detect if we need to pack/unpack */ + assert(image.bits[0] != lower.bits[0] || + memcmp(image.bits, lower.bits, sizeof(image.bits)) == 0); + + if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) { + if (needs_sign_extension) + color = nir_format_unpack_sint(b, color, image.bits, image.chans); + else + color = nir_format_unpack_uint(b, color, image.bits, image.chans); + } else { + /* All these formats are homogeneous */ + for (unsigned i = 1; i < image.chans; i++) + assert(image.bits[i] == image.bits[0]); + + /* On IVB, we rely on the undocumented behavior that typed reads from + * surfaces of the unsupported R8 and R16 formats return useful data in + * their least significant bits. However, the data in the high bits is + * garbage so we have to discard it. 
+ */ + if (devinfo->gen == 7 && !devinfo->is_haswell && + (lower_fmt == ISL_FORMAT_R16_UINT || + lower_fmt == ISL_FORMAT_R8_UINT)) + color = nir_format_mask_uvec(b, color, lower.bits); + + if (image.bits[0] != lower.bits[0]) { + color = nir_format_bitcast_uvec_unmasked(b, color, lower.bits[0], + image.bits[0]); + } + + if (needs_sign_extension) + color = nir_format_sign_extend_ivec(b, color, image.bits); + } + + switch (image.fmtl->channels.r.type) { + case ISL_UNORM: + assert(isl_format_has_uint_channel(lower_fmt)); + color = nir_format_unorm_to_float(b, color, image.bits); + break; + + case ISL_SNORM: + assert(isl_format_has_uint_channel(lower_fmt)); + color = nir_format_snorm_to_float(b, color, image.bits); + break; + + case ISL_SFLOAT: + if (image.bits[0] == 16) + color = nir_unpack_half_2x16_split_x(b, color); + break; + + case ISL_UINT: + case ISL_SINT: + break; + + default: + unreachable("Invalid image channel type"); + } + +expand_vec: + assert(dest_components == 1 || dest_components == 4); + assert(color->num_components <= dest_components); + if (color->num_components == dest_components) + return color; + + nir_ssa_def *comps[4]; + for (unsigned i = 0; i < color->num_components; i++) + comps[i] = nir_channel(b, color, i); + + for (unsigned i = color->num_components; i < 3; i++) + comps[i] = nir_imm_int(b, 0); + + if (color->num_components < 4) { + if (isl_format_has_int_channel(image_fmt)) + comps[3] = nir_imm_int(b, 1); + else + comps[3] = nir_imm_float(b, 1); + } + + return nir_vec(b, comps, dest_components); +} + +static bool +lower_image_load_instr(nir_builder *b, + const struct gen_device_info *devinfo, + nir_intrinsic_instr *intrin) +{ + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + nir_variable *var = nir_deref_instr_get_variable(deref); + const enum isl_format image_fmt = + isl_format_for_gl_format(var->data.image.format); + + if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) { + const enum isl_format lower_fmt = + isl_lower_storage_image_format(devinfo, image_fmt); + const unsigned dest_components = intrin->num_components; + + /* Use an undef to hold the uses of the load while we do the color + * conversion. + */ + nir_ssa_def *placeholder = nir_ssa_undef(b, 4, 32); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(placeholder)); + + intrin->num_components = isl_format_get_num_channels(lower_fmt); + intrin->dest.ssa.num_components = intrin->num_components; + + b->cursor = nir_after_instr(&intrin->instr); + + nir_ssa_def *color = convert_color_for_load(b, devinfo, + &intrin->dest.ssa, + image_fmt, lower_fmt, + dest_components); + + nir_ssa_def_rewrite_uses(placeholder, nir_src_for_ssa(color)); + nir_instr_remove(placeholder->parent_instr); + } else { + const struct isl_format_layout *image_fmtl = + isl_format_get_layout(image_fmt); + /* We have a matching typed format for everything 32b and below */ + assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128); + enum isl_format raw_fmt = (image_fmtl->bpb == 64) ? + ISL_FORMAT_R32G32_UINT : + ISL_FORMAT_R32G32B32A32_UINT; + const unsigned dest_components = intrin->num_components; + + b->cursor = nir_instr_remove(&intrin->instr); + + nir_ssa_def *coord = intrin->src[1].ssa; + + nir_ssa_def *do_load = image_coord_is_in_bounds(b, deref, coord); + if (devinfo->gen == 7 && !devinfo->is_haswell) { + /* Check whether the first stride component (i.e. the Bpp value) + * is greater than four, what on Gen7 indicates that a surface of + * type RAW has been bound for untyped access. 
Reading or writing + * to a surface of type other than RAW using untyped surface + * messages causes a hang on IVB and VLV. + */ + nir_ssa_def *stride = load_image_param(b, deref, STRIDE); + nir_ssa_def *is_raw = + nir_ilt(b, nir_imm_int(b, 4), nir_channel(b, stride, 0)); + do_load = nir_iand(b, do_load, is_raw); + } + nir_push_if(b, do_load); + + nir_ssa_def *addr = image_address(b, devinfo, deref, coord); + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_image_deref_load_raw_intel); + load->src[0] = nir_src_for_ssa(&deref->dest.ssa); + load->src[1] = nir_src_for_ssa(addr); + load->num_components = image_fmtl->bpb / 32; + nir_ssa_dest_init(&load->instr, &load->dest, + load->num_components, 32, NULL); + nir_builder_instr_insert(b, &load->instr); + + nir_push_else(b, NULL); + + nir_ssa_def *zero = nir_zero_vec(b, load->num_components); + + nir_pop_if(b, NULL); + + nir_ssa_def *value = nir_if_phi(b, &load->dest.ssa, zero); + + nir_ssa_def *color = convert_color_for_load(b, devinfo, value, + image_fmt, raw_fmt, + dest_components); + + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(color)); + } + + return true; +} + +static nir_ssa_def * +convert_color_for_store(nir_builder *b, const struct gen_device_info *devinfo, + nir_ssa_def *color, + enum isl_format image_fmt, enum isl_format lower_fmt) +{ + struct format_info image = get_format_info(image_fmt); + struct format_info lower = get_format_info(lower_fmt); + + color = nir_channels(b, color, (1 << image.chans) - 1); + + if (image_fmt == lower_fmt) + return color; + + if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) { + assert(lower_fmt == ISL_FORMAT_R32_UINT); + return nir_format_pack_11f11f10f(b, color); + } + + switch (image.fmtl->channels.r.type) { + case ISL_UNORM: + assert(isl_format_has_uint_channel(lower_fmt)); + color = nir_format_float_to_unorm(b, color, image.bits); + break; + + case ISL_SNORM: + assert(isl_format_has_uint_channel(lower_fmt)); + color = nir_format_float_to_snorm(b, color, image.bits); + break; + + case ISL_SFLOAT: + if (image.bits[0] == 16) { + nir_ssa_def *f16comps[4]; + for (unsigned i = 0; i < image.chans; i++) { + f16comps[i] = nir_pack_half_2x16_split(b, nir_channel(b, color, i), + nir_imm_float(b, 0)); + } + color = nir_vec(b, f16comps, image.chans); + } + break; + + case ISL_UINT: + if (image.bits[0] < 32) { + nir_const_value max; + for (unsigned i = 0; i < image.chans; i++) { + assert(image.bits[i] < 32); + max.u32[i] = (1u << image.bits[i]) - 1; + } + color = nir_umin(b, color, nir_build_imm(b, image.chans, 32, max)); + } + break; + + case ISL_SINT: + if (image.bits[0] < 32) { + nir_const_value min, max; + for (unsigned i = 0; i < image.chans; i++) { + assert(image.bits[i] < 32); + max.i32[i] = (1 << (image.bits[i] - 1)) - 1; + min.i32[i] = -(1 << (image.bits[i] - 1)); + } + color = nir_imin(b, color, nir_build_imm(b, image.chans, 32, max)); + color = nir_imax(b, color, nir_build_imm(b, image.chans, 32, min)); + } + break; + + default: + unreachable("Invalid image channel type"); + } + + if (image.bits[0] < 32 && + (isl_format_has_snorm_channel(image_fmt) || + isl_format_has_sint_channel(image_fmt))) + color = nir_format_mask_uvec(b, color, image.bits); + + if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) { + color = nir_format_pack_uint(b, color, image.bits, image.chans); + } else { + /* All these formats are homogeneous */ + for (unsigned i = 1; i < image.chans; i++) + assert(image.bits[i] == image.bits[0]); + + if (image.bits[0] != 
lower.bits[0]) { + color = nir_format_bitcast_uvec_unmasked(b, color, image.bits[0], + lower.bits[0]); + } + } + + return color; +} + +static bool +lower_image_store_instr(nir_builder *b, + const struct gen_device_info *devinfo, + nir_intrinsic_instr *intrin) +{ + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + nir_variable *var = nir_deref_instr_get_variable(deref); + + /* For write-only surfaces, we trust that the hardware can just do the + * conversion for us. + */ + if (var->data.image.write_only) + return false; + + const enum isl_format image_fmt = + isl_format_for_gl_format(var->data.image.format); + + if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) { + const enum isl_format lower_fmt = + isl_lower_storage_image_format(devinfo, image_fmt); + + /* Color conversion goes before the store */ + b->cursor = nir_before_instr(&intrin->instr); + + nir_ssa_def *color = convert_color_for_store(b, devinfo, + intrin->src[3].ssa, + image_fmt, lower_fmt); + intrin->num_components = isl_format_get_num_channels(lower_fmt); + nir_instr_rewrite_src(&intrin->instr, &intrin->src[3], + nir_src_for_ssa(color)); + } else { + const struct isl_format_layout *image_fmtl = + isl_format_get_layout(image_fmt); + /* We have a matching typed format for everything 32b and below */ + assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128); + enum isl_format raw_fmt = (image_fmtl->bpb == 64) ? + ISL_FORMAT_R32G32_UINT : + ISL_FORMAT_R32G32B32A32_UINT; + + b->cursor = nir_instr_remove(&intrin->instr); + + nir_ssa_def *coord = intrin->src[1].ssa; + + nir_ssa_def *do_store = image_coord_is_in_bounds(b, deref, coord); + if (devinfo->gen == 7 && !devinfo->is_haswell) { + /* Check whether the first stride component (i.e. the Bpp value) + * is greater than four, what on Gen7 indicates that a surface of + * type RAW has been bound for untyped access. Reading or writing + * to a surface of type other than RAW using untyped surface + * messages causes a hang on IVB and VLV. + */ + nir_ssa_def *stride = load_image_param(b, deref, STRIDE); + nir_ssa_def *is_raw = + nir_ilt(b, nir_imm_int(b, 4), nir_channel(b, stride, 0)); + do_store = nir_iand(b, do_store, is_raw); + } + nir_push_if(b, do_store); + + nir_ssa_def *addr = image_address(b, devinfo, deref, coord); + nir_ssa_def *color = convert_color_for_store(b, devinfo, + intrin->src[3].ssa, + image_fmt, raw_fmt); + + nir_intrinsic_instr *store = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_image_deref_store_raw_intel); + store->src[0] = nir_src_for_ssa(&deref->dest.ssa); + store->src[1] = nir_src_for_ssa(addr); + store->src[2] = nir_src_for_ssa(color); + store->num_components = image_fmtl->bpb / 32; + nir_builder_instr_insert(b, &store->instr); + + nir_pop_if(b, NULL); + } + + return true; +} + +static bool +lower_image_atomic_instr(nir_builder *b, + const struct gen_device_info *devinfo, + nir_intrinsic_instr *intrin) +{ + if (devinfo->is_haswell || devinfo->gen >= 8) + return false; + + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + + b->cursor = nir_instr_remove(&intrin->instr); + + /* Use an undef to hold the uses of the load conversion. */ + nir_ssa_def *placeholder = nir_ssa_undef(b, 4, 32); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(placeholder)); + + /* Check the first component of the size field to find out if the + * image is bound. Necessary on IVB for typed atomics because + * they don't seem to respect null surfaces and will happily + * corrupt or read random memory when no image is bound. 
+ */ + nir_ssa_def *size = load_image_param(b, deref, SIZE); + nir_ssa_def *zero = nir_imm_int(b, 0); + nir_push_if(b, nir_ine(b, nir_channel(b, size, 0), zero)); + + nir_builder_instr_insert(b, &intrin->instr); + + nir_pop_if(b, NULL); + + nir_ssa_def *result = nir_if_phi(b, &intrin->dest.ssa, zero); + nir_ssa_def_rewrite_uses(placeholder, nir_src_for_ssa(result)); + + return true; +} + +static bool +lower_image_size_instr(nir_builder *b, + const struct gen_device_info *devinfo, + nir_intrinsic_instr *intrin) +{ + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + + b->cursor = nir_instr_remove(&intrin->instr); + + nir_ssa_def *size = load_image_param(b, deref, SIZE); + + nir_ssa_def *comps[4] = { NULL, NULL, NULL, NULL }; + + enum glsl_sampler_dim dim = glsl_get_sampler_dim(deref->type); + unsigned coord_comps = glsl_get_sampler_coordinate_components(deref->type); + for (unsigned c = 0; c < coord_comps; c++) { + if (c == 1 && dim == GLSL_SAMPLER_DIM_1D) { + /* The array length for 1D arrays is in .z */ + comps[1] = nir_channel(b, size, 2); + } else if (c == 2 && dim == GLSL_SAMPLER_DIM_CUBE) { + comps[2] = nir_idiv(b, nir_channel(b, size, 2), nir_imm_int(b, 6)); + } else { + comps[c] = nir_channel(b, size, c); + } + } + + for (unsigned c = coord_comps; c < intrin->dest.ssa.num_components; ++c) + comps[c] = nir_imm_int(b, 1); + + nir_ssa_def *vec = nir_vec(b, comps, intrin->dest.ssa.num_components); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(vec)); + + return true; +} + +bool +brw_nir_lower_image_load_store(nir_shader *shader, + const struct gen_device_info *devinfo) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (function->impl == NULL) + continue; + + nir_foreach_block_safe(block, function->impl) { + nir_builder b; + nir_builder_init(&b, function->impl); + + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_image_deref_load: + if (lower_image_load_instr(&b, devinfo, intrin)) + progress = true; + break; + + case nir_intrinsic_image_deref_store: + if (lower_image_store_instr(&b, devinfo, intrin)) + progress = true; + break; + + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_min: + case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + if (lower_image_atomic_instr(&b, devinfo, intrin)) + progress = true; + break; + + case nir_intrinsic_image_deref_size: + if (lower_image_size_instr(&b, devinfo, intrin)) + progress = true; + break; + + default: + /* Nothing to do */ + break; + } + } + } + + nir_metadata_preserve(function->impl, nir_metadata_block_index | + nir_metadata_dominance); + } + + return progress; +} diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index 98860c94374..3cdeb6214a8 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -77,6 +77,7 @@ libintel_compiler_files = files( 'brw_nir_analyze_ubo_ranges.c', 'brw_nir_attribute_workarounds.c', 'brw_nir_lower_cs_intrinsics.c', + 'brw_nir_lower_image_load_store.c', 'brw_nir_opt_peephole_ffma.c', 'brw_nir_tcs_workarounds.c', 'brw_packed_float.c', diff --git a/src/intel/vulkan/anv_pipeline.c 
b/src/intel/vulkan/anv_pipeline.c index 0fe0c7e296e..19d59b7fbac 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -532,6 +532,8 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
   if (nir->info.stage != MESA_SHADER_COMPUTE)
      brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+   NIR_PASS_V(nir, brw_nir_lower_image_load_store, compiler->devinfo);
+
   assert(nir->num_uniforms == prog_data->nr_params * 4);
   stage->nir = nir;
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index a669814d0d2..f5ebd3c3b05 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -102,6 +102,8 @@ brw_create_nir(struct brw_context *brw,
   nir = brw_preprocess_nir(brw->screen->compiler, nir);
+   NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo);
+
   if (stage == MESA_SHADER_TESS_CTRL) {
      /* Lower gl_PatchVerticesIn from a sys. value to a uniform on Gen8+. */
      static const gl_state_index16 tokens[STATE_LENGTH] =
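
For reference, the tiled-address arithmetic that image_address() in the new brw_nir_lower_image_load_store.c emits as NIR builder calls can be summarized by the following standalone C sketch (not part of the diff above). All parameter names are illustrative stand-ins for the TILING, STRIDE and SWIZZLING channels of brw_image_param, and the tile width for Y tiling is that of one sub-column, as described in the comment in image_address().

#include <stdint.h>

/* x, y           : texel coordinates, already adjusted for slice/level offsets
 * tile_w_log2/h  : log2 tile (or Y-tiling sub-column) width and height
 * row_pitch      : tile-row pitch, in the units the STRIDE param provides
 * bpp            : bytes per texel (STRIDE.x)
 * swz0, swz1     : SWIZZLING shifts; a value of 31 disables that term       */
static uint32_t
tiled_image_address(uint32_t x, uint32_t y,
                    unsigned tile_w_log2, unsigned tile_h_log2,
                    uint32_t row_pitch, uint32_t bpp,
                    unsigned swz0, unsigned swz1)
{
   /* Split into the tile index (major) and the position inside the tile
    * (minor), mirroring the ubfe/ushr pair in the pass. */
   uint32_t minor_x = x & ((1u << tile_w_log2) - 1);
   uint32_t minor_y = y & ((1u << tile_h_log2) - 1);
   uint32_t major_x = x >> tile_w_log2;
   uint32_t major_y = y >> tile_h_log2;

   /* Texel index from the start of the tile row, plus the tile row itself:
    *   idx_x = (major_x << tile_h << tile_w) + (minor_y << tile_w) + minor_x
    *   idx_y =  major_y << tile_h                                           */
   uint32_t idx_x = (((major_x << tile_h_log2) + minor_y) << tile_w_log2) + minor_x;
   uint32_t idx_y = major_y << tile_h_log2;

   /* Linearize and scale by the bytes-per-texel value. */
   uint32_t addr = (idx_y * row_pitch + idx_x) * bpp;

   /* IVB/VLV address swizzling: XOR bit 6 of the address with two shifted
    * copies of itself.  A shift of 31 leaves nothing at bit 6, so that term
    * becomes the identity, which is how swizzling is disabled. */
   uint32_t bit6 = ((addr >> swz0) ^ (addr >> swz1)) & (1u << 6);
   return addr ^ bit6;
}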
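Similarly, the narrow-integer clamping that convert_color_for_store() performs before packing (the nir_umin / nir_imin / nir_imax calls on the per-channel limits) corresponds to this scalar sketch; helper names are illustrative and 'bits' is the per-channel width, assumed to be less than 32.

#include <stdint.h>

/* Clamp one channel of an unsigned-integer format to its representable range,
 * e.g. bits == 8 clamps to [0, 255]. */
static uint32_t
clamp_uint_channel(uint32_t v, unsigned bits)
{
   uint32_t max = (1u << bits) - 1;
   return v < max ? v : max;
}

/* Clamp one channel of a signed-integer format to its representable range,
 * e.g. bits == 8 clamps to [-128, 127]. */
static int32_t
clamp_sint_channel(int32_t v, unsigned bits)
{
   int32_t max = (1 << (bits - 1)) - 1;
   int32_t min = -(1 << (bits - 1));
   return v < min ? min : (v > max ? max : v);
}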