Diffstat (limited to 'src/intel')
-rw-r--r--   src/intel/Android.blorp.mk            47
-rw-r--r--   src/intel/Android.mk                   1
-rw-r--r--   src/intel/Makefile.am                  2
-rw-r--r--   src/intel/Makefile.blorp.am           24
-rw-r--r--   src/intel/Makefile.sources             8
-rw-r--r--   src/intel/blorp/blorp.c              292
-rw-r--r--   src/intel/blorp/blorp.h              153
-rw-r--r--   src/intel/blorp/blorp_blit.c        1649
-rw-r--r--   src/intel/blorp/blorp_clear.c        344
-rw-r--r--   src/intel/blorp/blorp_genX_exec.h   1176
-rw-r--r--   src/intel/blorp/blorp_priv.h         291
11 files changed, 3987 insertions, 0 deletions
diff --git a/src/intel/Android.blorp.mk b/src/intel/Android.blorp.mk
new file mode 100644
index 00000000000..268d5ebfa74
--- /dev/null
+++ b/src/intel/Android.blorp.mk
@@ -0,0 +1,47 @@
+# Copyright © 2016 Intel Corporation
+# Copyright © 2016 Mauro Rossi <[email protected]>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# ---------------------------------------
+# Build libmesa_blorp
+# ---------------------------------------
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_blorp
+
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+
+LOCAL_SRC_FILES := $(BLORP_FILES)
+
+LOCAL_C_INCLUDES := \
+	$(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_nir,,)/nir \
+	$(MESA_TOP)/src/gallium/auxiliary \
+	$(MESA_TOP)/src/gallium/include \
+	$(MESA_TOP)/src/mapi \
+	$(MESA_TOP)/src/mesa \
+	$(MESA_TOP)/src/mesa/drivers/dri/i965
+
+LOCAL_STATIC_LIBRARIES := libmesa_nir
+
+LOCAL_SHARED_LIBRARIES := libdrm_intel
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/src/intel/Android.mk b/src/intel/Android.mk
index 114b111eba9..0e9c29dfa2b 100644
--- a/src/intel/Android.mk
+++ b/src/intel/Android.mk
@@ -25,5 +25,6 @@ LOCAL_PATH := $(call my-dir)
 # Import variables
 include $(LOCAL_PATH)/Makefile.sources
 
+include $(LOCAL_PATH)/Android.blorp.mk
 include $(LOCAL_PATH)/Android.genxml.mk
 include $(LOCAL_PATH)/Android.isl.mk
diff --git a/src/intel/Makefile.am b/src/intel/Makefile.am
index 3b7d2f3a85b..fa4570d18aa 100644
--- a/src/intel/Makefile.am
+++ b/src/intel/Makefile.am
@@ -26,6 +26,7 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_builddir)/src \
 	-I$(top_srcdir)/src \
+	-I$(top_builddir)/src/compiler/nir \
 	-I$(top_builddir)/src/intel \
 	-I$(top_srcdir)/src/intel \
 	-I$(top_srcdir)/src/mapi \
@@ -51,6 +52,7 @@ BUILT_SOURCES =
 CLEANFILES =
 EXTRA_DIST =
 
+include Makefile.blorp.am
 include Makefile.genxml.am
 include Makefile.isl.am
diff --git a/src/intel/Makefile.blorp.am b/src/intel/Makefile.blorp.am
new file mode 100644
index 00000000000..36d99dfa1f4
--- /dev/null
+++ b/src/intel/Makefile.blorp.am
@@ -0,0 +1,24 @@
+# Copyright 2015-2016 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+noinst_LTLIBRARIES += blorp/libblorp.la
+
+blorp_libblorp_la_SOURCES = $(BLORP_FILES)
diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
index a8ba0b9a170..fcf85eacfc4 100644
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -1,3 +1,11 @@
+BLORP_FILES = \
+	blorp/blorp.c \
+	blorp/blorp.h \
+	blorp/blorp_blit.c \
+	blorp/blorp_clear.c \
+	blorp/blorp_genX_exec.h \
+	blorp/blorp_priv.h
+
 GENXML_GENERATED_FILES = \
 	genxml/gen4_pack.h \
 	genxml/gen45_pack.h \
diff --git a/src/intel/blorp/blorp.c b/src/intel/blorp/blorp.c
new file mode 100644
index 00000000000..4dbba017489
--- /dev/null
+++ b/src/intel/blorp/blorp.c
@@ -0,0 +1,292 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */ + +#include <errno.h> + +#include "program/prog_instruction.h" + +#include "blorp_priv.h" +#include "brw_compiler.h" +#include "brw_nir.h" + +void +blorp_init(struct blorp_context *blorp, void *driver_ctx, + struct isl_device *isl_dev) +{ + blorp->driver_ctx = driver_ctx; + blorp->isl_dev = isl_dev; +} + +void +blorp_finish(struct blorp_context *blorp) +{ + blorp->driver_ctx = NULL; +} + +void +blorp_batch_init(struct blorp_context *blorp, + struct blorp_batch *batch, void *driver_batch) +{ + batch->blorp = blorp; + batch->driver_batch = driver_batch; +} + +void +blorp_batch_finish(struct blorp_batch *batch) +{ + batch->blorp = NULL; +} + +void +brw_blorp_surface_info_init(struct blorp_context *blorp, + struct brw_blorp_surface_info *info, + const struct blorp_surf *surf, + unsigned int level, unsigned int layer, + enum isl_format format, bool is_render_target) +{ + /* Layer is a physical layer, so if this is a 2D multisample array texture + * using INTEL_MSAA_LAYOUT_UMS or INTEL_MSAA_LAYOUT_CMS, then it had better + * be a multiple of num_samples. + */ + unsigned layer_multiplier = 1; + if (surf->surf->msaa_layout == ISL_MSAA_LAYOUT_ARRAY) { + assert(layer % surf->surf->samples == 0); + layer_multiplier = surf->surf->samples; + } + + if (format == ISL_FORMAT_UNSUPPORTED) + format = surf->surf->format; + + if (format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) { + /* Unfortunately, ISL_FORMAT_R24_UNORM_X8_TYPELESS it isn't supported as + * a render target, which would prevent us from blitting to 24-bit + * depth. The miptree consists of 32 bits per pixel, arranged as 24-bit + * depth values interleaved with 8 "don't care" bits. Since depth + * values don't require any blending, it doesn't matter how we interpret + * the bit pattern as long as we copy the right amount of data, so just + * map it as 8-bit BGRA. + */ + format = ISL_FORMAT_B8G8R8A8_UNORM; + } else if (surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT) { + assert(surf->surf->format == ISL_FORMAT_R8_UINT); + /* Prior to Broadwell, we can't render to R8_UINT */ + if (blorp->isl_dev->info->gen < 8) + format = ISL_FORMAT_R8_UNORM; + } + + info->surf = *surf->surf; + info->addr = surf->addr; + + info->aux_usage = surf->aux_usage; + if (info->aux_usage != ISL_AUX_USAGE_NONE) { + info->aux_surf = *surf->aux_surf; + info->aux_addr = surf->aux_addr; + } + + info->clear_color = surf->clear_color; + + info->view = (struct isl_view) { + .usage = is_render_target ? ISL_SURF_USAGE_RENDER_TARGET_BIT : + ISL_SURF_USAGE_TEXTURE_BIT, + .format = format, + .base_level = level, + .levels = 1, + .channel_select = { + ISL_CHANNEL_SELECT_RED, + ISL_CHANNEL_SELECT_GREEN, + ISL_CHANNEL_SELECT_BLUE, + ISL_CHANNEL_SELECT_ALPHA, + }, + }; + + if (!is_render_target && + (info->surf.dim == ISL_SURF_DIM_3D || + info->surf.msaa_layout == ISL_MSAA_LAYOUT_ARRAY)) { + /* 3-D textures don't support base_array layer and neither do 2-D + * multisampled textures on IVB so we need to pass it through the + * sampler in those cases. These are also two cases where we are + * guaranteed that we won't be doing any funny surface hacks. 
+ */ + info->view.base_array_layer = 0; + info->view.array_len = MAX2(info->surf.logical_level0_px.depth, + info->surf.logical_level0_px.array_len); + info->z_offset = layer / layer_multiplier; + } else { + info->view.base_array_layer = layer / layer_multiplier; + info->view.array_len = 1; + info->z_offset = 0; + } +} + + +void +blorp_params_init(struct blorp_params *params) +{ + memset(params, 0, sizeof(*params)); + params->num_draw_buffers = 1; + params->num_layers = 1; +} + +void +brw_blorp_init_wm_prog_key(struct brw_wm_prog_key *wm_key) +{ + memset(wm_key, 0, sizeof(*wm_key)); + wm_key->nr_color_regions = 1; + for (int i = 0; i < MAX_SAMPLERS; i++) + wm_key->tex.swizzles[i] = SWIZZLE_XYZW; +} + +static int +nir_uniform_type_size(const struct glsl_type *type) +{ + /* Only very basic types are allowed */ + assert(glsl_type_is_vector_or_scalar(type)); + assert(glsl_get_bit_size(type) == 32); + + return glsl_get_vector_elements(type) * 4; +} + +const unsigned * +brw_blorp_compile_nir_shader(struct blorp_context *blorp, struct nir_shader *nir, + const struct brw_wm_prog_key *wm_key, + bool use_repclear, + struct brw_blorp_prog_data *prog_data, + unsigned *program_size) +{ + const struct brw_compiler *compiler = blorp->compiler; + + void *mem_ctx = ralloc_context(NULL); + + /* Calling brw_preprocess_nir and friends is destructive and, if cloning is + * enabled, may end up completely replacing the nir_shader. Therefore, we + * own it and might as well put it in our context for easy cleanup. + */ + ralloc_steal(mem_ctx, nir); + nir->options = + compiler->glsl_compiler_options[MESA_SHADER_FRAGMENT].NirOptions; + + struct brw_wm_prog_data wm_prog_data; + memset(&wm_prog_data, 0, sizeof(wm_prog_data)); + + wm_prog_data.base.nr_params = 0; + wm_prog_data.base.param = NULL; + + /* BLORP always just uses the first two binding table entries */ + wm_prog_data.binding_table.render_target_start = BLORP_RENDERBUFFER_BT_INDEX; + wm_prog_data.base.binding_table.texture_start = BLORP_TEXTURE_BT_INDEX; + + nir = brw_preprocess_nir(compiler, nir); + nir_remove_dead_variables(nir, nir_var_shader_in); + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + + /* Uniforms are required to be lowered before going into compile_fs. For + * BLORP, we'll assume that whoever builds the shader sets the location + * they want so we just need to lower them and figure out how many we have + * in total. 
+ */ + nir->num_uniforms = 0; + nir_foreach_variable(var, &nir->uniforms) { + var->data.driver_location = var->data.location; + unsigned end = var->data.location + nir_uniform_type_size(var->type); + nir->num_uniforms = MAX2(nir->num_uniforms, end); + } + nir_lower_io(nir, nir_var_uniform, nir_uniform_type_size); + + const unsigned *program = + brw_compile_fs(compiler, blorp->driver_ctx, mem_ctx, + wm_key, &wm_prog_data, nir, + NULL, -1, -1, false, use_repclear, program_size, NULL); + + /* Copy the relavent bits of wm_prog_data over into the blorp prog data */ + prog_data->dispatch_8 = wm_prog_data.dispatch_8; + prog_data->dispatch_16 = wm_prog_data.dispatch_16; + prog_data->first_curbe_grf_0 = wm_prog_data.base.dispatch_grf_start_reg; + prog_data->first_curbe_grf_2 = wm_prog_data.dispatch_grf_start_reg_2; + prog_data->ksp_offset_2 = wm_prog_data.prog_offset_2; + prog_data->persample_msaa_dispatch = wm_prog_data.persample_dispatch; + prog_data->flat_inputs = wm_prog_data.flat_inputs; + prog_data->num_varying_inputs = wm_prog_data.num_varying_inputs; + prog_data->inputs_read = nir->info.inputs_read; + + assert(wm_prog_data.base.nr_params == 0); + + return program; +} + +void +blorp_gen6_hiz_op(struct blorp_batch *batch, + struct blorp_surf *surf, unsigned level, unsigned layer, + enum blorp_hiz_op op) +{ + struct blorp_params params; + blorp_params_init(¶ms); + + params.hiz_op = op; + + brw_blorp_surface_info_init(batch->blorp, ¶ms.depth, surf, level, layer, + surf->surf->format, true); + + /* Align the rectangle primitive to 8x4 pixels. + * + * During fast depth clears, the emitted rectangle primitive must be + * aligned to 8x4 pixels. From the Ivybridge PRM, Vol 2 Part 1 Section + * 11.5.3.1 Depth Buffer Clear (and the matching section in the Sandybridge + * PRM): + * If Number of Multisamples is NUMSAMPLES_1, the rectangle must be + * aligned to an 8x4 pixel block relative to the upper left corner + * of the depth buffer [...] + * + * For hiz resolves, the rectangle must also be 8x4 aligned. Item + * WaHizAmbiguate8x4Aligned from the Haswell workarounds page and the + * Ivybridge simulator require the alignment. + * + * To be safe, let's just align the rect for all hiz operations and all + * hardware generations. + * + * However, for some miptree slices of a Z24 texture, emitting an 8x4 + * aligned rectangle that covers the slice may clobber adjacent slices if + * we strictly adhered to the texture alignments specified in the PRM. The + * Ivybridge PRM, Section "Alignment Unit Size", states that + * SURFACE_STATE.Surface_Horizontal_Alignment should be 4 for Z24 surfaces, + * not 8. But commit 1f112cc increased the alignment from 4 to 8, which + * prevents the clobbering. + */ + params.x1 = minify(params.depth.surf.logical_level0_px.width, + params.depth.view.base_level); + params.y1 = minify(params.depth.surf.logical_level0_px.height, + params.depth.view.base_level); + params.x1 = ALIGN(params.x1, 8); + params.y1 = ALIGN(params.y1, 4); + + if (params.depth.view.base_level == 0) { + /* TODO: What about MSAA? 
*/ + params.depth.surf.logical_level0_px.width = params.x1; + params.depth.surf.logical_level0_px.height = params.y1; + } + + params.dst.surf.samples = params.depth.surf.samples; + params.dst.surf.logical_level0_px = params.depth.surf.logical_level0_px; + params.depth_format = isl_format_get_depth_format(surf->surf->format, false); + + batch->blorp->exec(batch, ¶ms); +} diff --git a/src/intel/blorp/blorp.h b/src/intel/blorp/blorp.h new file mode 100644 index 00000000000..a4fcfdfcf70 --- /dev/null +++ b/src/intel/blorp/blorp.h @@ -0,0 +1,153 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#include <stdint.h> +#include <stdbool.h> + +#include "isl/isl.h" + +struct brw_context; +struct brw_wm_prog_key; + +#ifdef __cplusplus +extern "C" { +#endif + +struct blorp_batch; +struct blorp_params; + +struct blorp_context { + void *driver_ctx; + + const struct isl_device *isl_dev; + + const struct brw_compiler *compiler; + + struct { + uint32_t tex; + uint32_t rb; + uint32_t vb; + } mocs; + + bool (*lookup_shader)(struct blorp_context *blorp, + const void *key, uint32_t key_size, + uint32_t *kernel_out, void *prog_data_out); + void (*upload_shader)(struct blorp_context *blorp, + const void *key, uint32_t key_size, + const void *kernel, uint32_t kernel_size, + const void *prog_data, uint32_t prog_data_size, + uint32_t *kernel_out, void *prog_data_out); + void (*exec)(struct blorp_batch *batch, const struct blorp_params *params); +}; + +void blorp_init(struct blorp_context *blorp, void *driver_ctx, + struct isl_device *isl_dev); +void blorp_finish(struct blorp_context *blorp); + +struct blorp_batch { + struct blorp_context *blorp; + void *driver_batch; +}; + +void blorp_batch_init(struct blorp_context *blorp, struct blorp_batch *batch, + void *driver_batch); +void blorp_batch_finish(struct blorp_batch *batch); + +struct blorp_address { + void *buffer; + uint32_t read_domains; + uint32_t write_domain; + uint32_t offset; +}; + +struct blorp_surf +{ + const struct isl_surf *surf; + struct blorp_address addr; + + const struct isl_surf *aux_surf; + struct blorp_address aux_addr; + enum isl_aux_usage aux_usage; + + union isl_color_value clear_color; +}; + +void +blorp_blit(struct blorp_batch *batch, + const struct blorp_surf *src_surf, + unsigned src_level, unsigned src_layer, + enum isl_format src_format, int src_swizzle, + const struct blorp_surf *dst_surf, + unsigned dst_level, 
unsigned dst_layer, + enum isl_format dst_format, + float src_x0, float src_y0, + float src_x1, float src_y1, + float dst_x0, float dst_y0, + float dst_x1, float dst_y1, + uint32_t filter, bool mirror_x, bool mirror_y); + +void +blorp_fast_clear(struct blorp_batch *batch, + const struct blorp_surf *surf, + uint32_t level, uint32_t layer, + uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1); + +void +blorp_clear(struct blorp_batch *batch, + const struct blorp_surf *surf, + uint32_t level, uint32_t layer, + uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1, + enum isl_format format, union isl_color_value clear_color, + bool color_write_disable[4]); + +void +blorp_ccs_resolve(struct blorp_batch *batch, + struct blorp_surf *surf, enum isl_format format); + +/** + * For an overview of the HiZ operations, see the following sections of the + * Sandy Bridge PRM, Volume 1, Part2: + * - 7.5.3.1 Depth Buffer Clear + * - 7.5.3.2 Depth Buffer Resolve + * - 7.5.3.3 Hierarchical Depth Buffer Resolve + * + * Of these, two get entered in the resolve map as needing to be done to the + * buffer: depth resolve and hiz resolve. + */ +enum blorp_hiz_op { + BLORP_HIZ_OP_NONE, + BLORP_HIZ_OP_DEPTH_CLEAR, + BLORP_HIZ_OP_DEPTH_RESOLVE, + BLORP_HIZ_OP_HIZ_RESOLVE, +}; + +void +blorp_gen6_hiz_op(struct blorp_batch *batch, + struct blorp_surf *surf, unsigned level, unsigned layer, + enum blorp_hiz_op op); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif /* __cplusplus */ diff --git a/src/intel/blorp/blorp_blit.c b/src/intel/blorp/blorp_blit.c new file mode 100644 index 00000000000..170c3816e38 --- /dev/null +++ b/src/intel/blorp/blorp_blit.c @@ -0,0 +1,1649 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
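
The blorp.h interface above is all a driver sees: it fills in the lookup_shader/upload_shader/exec callbacks on struct blorp_context, wraps its command buffer in a struct blorp_batch, and calls the blit/clear/resolve entry points. A minimal usage sketch follows; the example_copy_rect helper, its driver_batch argument, and the 256x256 single-sample copy are illustrative assumptions, not part of this commit.

#include "blorp.h"
#include "program/prog_instruction.h" /* SWIZZLE_XYZW */

/* Hypothetical driver-side helper: copy a 256x256 region between two
 * surfaces the driver has already described with struct blorp_surf.
 * Assumes blorp_init() has been called and that the context's
 * lookup_shader/upload_shader/exec callbacks are set up.
 */
static void
example_copy_rect(struct blorp_context *blorp, void *driver_batch,
                  const struct blorp_surf *src, const struct blorp_surf *dst,
                  uint32_t filter /* the caller's filter token */)
{
   struct blorp_batch batch;
   blorp_batch_init(blorp, &batch, driver_batch);

   /* ISL_FORMAT_UNSUPPORTED lets brw_blorp_surface_info_init() fall back
    * to each surface's own format (see blorp.c above); SWIZZLE_XYZW is an
    * identity swizzle.
    */
   blorp_blit(&batch,
              src, 0 /* level */, 0 /* layer */,
              ISL_FORMAT_UNSUPPORTED, SWIZZLE_XYZW,
              dst, 0 /* level */, 0 /* layer */,
              ISL_FORMAT_UNSUPPORTED,
              0.0f, 0.0f, 256.0f, 256.0f,   /* source rectangle */
              0.0f, 0.0f, 256.0f, 256.0f,   /* destination rectangle */
              filter, false /* mirror_x */, false /* mirror_y */);

   blorp_batch_finish(&batch);
}

The same batch-wrapping pattern applies to blorp_clear, blorp_ccs_resolve and blorp_gen6_hiz_op; only the per-operation parameters change.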
+ */ + +#include "program/prog_instruction.h" +#include "compiler/nir/nir_builder.h" + +#include "blorp_priv.h" +#include "brw_meta_util.h" + +#define FILE_DEBUG_FLAG DEBUG_BLORP + +/** + * Enum to specify the order of arguments in a sampler message + */ +enum sampler_message_arg +{ + SAMPLER_MESSAGE_ARG_U_FLOAT, + SAMPLER_MESSAGE_ARG_V_FLOAT, + SAMPLER_MESSAGE_ARG_U_INT, + SAMPLER_MESSAGE_ARG_V_INT, + SAMPLER_MESSAGE_ARG_R_INT, + SAMPLER_MESSAGE_ARG_SI_INT, + SAMPLER_MESSAGE_ARG_MCS_INT, + SAMPLER_MESSAGE_ARG_ZERO_INT, +}; + +struct brw_blorp_blit_vars { + /* Input values from brw_blorp_wm_inputs */ + nir_variable *v_discard_rect; + nir_variable *v_rect_grid; + nir_variable *v_coord_transform; + nir_variable *v_src_z; + + /* gl_FragCoord */ + nir_variable *frag_coord; + + /* gl_FragColor */ + nir_variable *color_out; +}; + +static void +brw_blorp_blit_vars_init(nir_builder *b, struct brw_blorp_blit_vars *v, + const struct brw_blorp_blit_prog_key *key) +{ + /* Blended and scaled blits never use pixel discard. */ + assert(!key->use_kill || !(key->blend && key->blit_scaled)); + +#define LOAD_INPUT(name, type)\ + v->v_##name = nir_variable_create(b->shader, nir_var_shader_in, \ + type, #name); \ + v->v_##name->data.interpolation = INTERP_MODE_FLAT; \ + v->v_##name->data.location = VARYING_SLOT_VAR0 + \ + offsetof(struct brw_blorp_wm_inputs, name) / (4 * sizeof(float)); + + LOAD_INPUT(discard_rect, glsl_vec4_type()) + LOAD_INPUT(rect_grid, glsl_vec4_type()) + LOAD_INPUT(coord_transform, glsl_vec4_type()) + LOAD_INPUT(src_z, glsl_uint_type()) + +#undef LOAD_INPUT + + v->frag_coord = nir_variable_create(b->shader, nir_var_shader_in, + glsl_vec4_type(), "gl_FragCoord"); + v->frag_coord->data.location = VARYING_SLOT_POS; + v->frag_coord->data.origin_upper_left = true; + + v->color_out = nir_variable_create(b->shader, nir_var_shader_out, + glsl_vec4_type(), "gl_FragColor"); + v->color_out->data.location = FRAG_RESULT_COLOR; +} + +static nir_ssa_def * +blorp_blit_get_frag_coords(nir_builder *b, + const struct brw_blorp_blit_prog_key *key, + struct brw_blorp_blit_vars *v) +{ + nir_ssa_def *coord = nir_f2i(b, nir_load_var(b, v->frag_coord)); + + if (key->persample_msaa_dispatch) { + return nir_vec3(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1), + nir_load_sample_id(b)); + } else { + return nir_vec2(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1)); + } +} + +/** + * Emit code to translate from destination (X, Y) coordinates to source (X, Y) + * coordinates. 
+ */ +static nir_ssa_def * +blorp_blit_apply_transform(nir_builder *b, nir_ssa_def *src_pos, + struct brw_blorp_blit_vars *v) +{ + nir_ssa_def *coord_transform = nir_load_var(b, v->v_coord_transform); + + nir_ssa_def *offset = nir_vec2(b, nir_channel(b, coord_transform, 1), + nir_channel(b, coord_transform, 3)); + nir_ssa_def *mul = nir_vec2(b, nir_channel(b, coord_transform, 0), + nir_channel(b, coord_transform, 2)); + + return nir_ffma(b, src_pos, mul, offset); +} + +static inline void +blorp_nir_discard_if_outside_rect(nir_builder *b, nir_ssa_def *pos, + struct brw_blorp_blit_vars *v) +{ + nir_ssa_def *c0, *c1, *c2, *c3; + nir_ssa_def *discard_rect = nir_load_var(b, v->v_discard_rect); + nir_ssa_def *dst_x0 = nir_channel(b, discard_rect, 0); + nir_ssa_def *dst_x1 = nir_channel(b, discard_rect, 1); + nir_ssa_def *dst_y0 = nir_channel(b, discard_rect, 2); + nir_ssa_def *dst_y1 = nir_channel(b, discard_rect, 3); + + c0 = nir_ult(b, nir_channel(b, pos, 0), dst_x0); + c1 = nir_uge(b, nir_channel(b, pos, 0), dst_x1); + c2 = nir_ult(b, nir_channel(b, pos, 1), dst_y0); + c3 = nir_uge(b, nir_channel(b, pos, 1), dst_y1); + + nir_ssa_def *oob = nir_ior(b, nir_ior(b, c0, c1), nir_ior(b, c2, c3)); + + nir_intrinsic_instr *discard = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if); + discard->src[0] = nir_src_for_ssa(oob); + nir_builder_instr_insert(b, &discard->instr); +} + +static nir_tex_instr * +blorp_create_nir_tex_instr(nir_builder *b, struct brw_blorp_blit_vars *v, + nir_texop op, nir_ssa_def *pos, unsigned num_srcs, + nir_alu_type dst_type) +{ + nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs); + + tex->op = op; + + tex->dest_type = dst_type; + tex->is_array = false; + tex->is_shadow = false; + + /* Blorp only has one texture and it's bound at unit 0 */ + tex->texture = NULL; + tex->sampler = NULL; + tex->texture_index = 0; + tex->sampler_index = 0; + + /* To properly handle 3-D and 2-D array textures, we pull the Z component + * from an input. TODO: This is a bit magic; we should probably make this + * more explicit in the future. 
+ */ + assert(pos->num_components >= 2); + pos = nir_vec3(b, nir_channel(b, pos, 0), nir_channel(b, pos, 1), + nir_load_var(b, v->v_src_z)); + + tex->src[0].src_type = nir_tex_src_coord; + tex->src[0].src = nir_src_for_ssa(pos); + tex->coord_components = 3; + + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); + + return tex; +} + +static nir_ssa_def * +blorp_nir_tex(nir_builder *b, struct brw_blorp_blit_vars *v, + nir_ssa_def *pos, nir_alu_type dst_type) +{ + nir_tex_instr *tex = + blorp_create_nir_tex_instr(b, v, nir_texop_tex, pos, 2, dst_type); + + assert(pos->num_components == 2); + tex->sampler_dim = GLSL_SAMPLER_DIM_2D; + tex->src[1].src_type = nir_tex_src_lod; + tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0)); + + nir_builder_instr_insert(b, &tex->instr); + + return &tex->dest.ssa; +} + +static nir_ssa_def * +blorp_nir_txf(nir_builder *b, struct brw_blorp_blit_vars *v, + nir_ssa_def *pos, nir_alu_type dst_type) +{ + nir_tex_instr *tex = + blorp_create_nir_tex_instr(b, v, nir_texop_txf, pos, 2, dst_type); + + tex->sampler_dim = GLSL_SAMPLER_DIM_3D; + tex->src[1].src_type = nir_tex_src_lod; + tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0)); + + nir_builder_instr_insert(b, &tex->instr); + + return &tex->dest.ssa; +} + +static nir_ssa_def * +blorp_nir_txf_ms(nir_builder *b, struct brw_blorp_blit_vars *v, + nir_ssa_def *pos, nir_ssa_def *mcs, nir_alu_type dst_type) +{ + nir_tex_instr *tex = + blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms, pos, + mcs != NULL ? 3 : 2, dst_type); + + tex->sampler_dim = GLSL_SAMPLER_DIM_MS; + + tex->src[1].src_type = nir_tex_src_ms_index; + if (pos->num_components == 2) { + tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0)); + } else { + assert(pos->num_components == 3); + tex->src[1].src = nir_src_for_ssa(nir_channel(b, pos, 2)); + } + + if (mcs) { + tex->src[2].src_type = nir_tex_src_ms_mcs; + tex->src[2].src = nir_src_for_ssa(mcs); + } + + nir_builder_instr_insert(b, &tex->instr); + + return &tex->dest.ssa; +} + +static nir_ssa_def * +blorp_nir_txf_ms_mcs(nir_builder *b, struct brw_blorp_blit_vars *v, nir_ssa_def *pos) +{ + nir_tex_instr *tex = + blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms_mcs, + pos, 1, nir_type_int); + + tex->sampler_dim = GLSL_SAMPLER_DIM_MS; + + nir_builder_instr_insert(b, &tex->instr); + + return &tex->dest.ssa; +} + +static nir_ssa_def * +nir_mask_shift_or(struct nir_builder *b, nir_ssa_def *dst, nir_ssa_def *src, + uint32_t src_mask, int src_left_shift) +{ + nir_ssa_def *masked = nir_iand(b, src, nir_imm_int(b, src_mask)); + + nir_ssa_def *shifted; + if (src_left_shift > 0) { + shifted = nir_ishl(b, masked, nir_imm_int(b, src_left_shift)); + } else if (src_left_shift < 0) { + shifted = nir_ushr(b, masked, nir_imm_int(b, -src_left_shift)); + } else { + assert(src_left_shift == 0); + shifted = masked; + } + + return nir_ior(b, dst, shifted); +} + +/** + * Emit code to compensate for the difference between Y and W tiling. + * + * This code modifies the X and Y coordinates according to the formula: + * + * (X', Y', S') = detile(W-MAJOR, tile(Y-MAJOR, X, Y, S)) + * + * (See brw_blorp_build_nir_shader). + */ +static inline nir_ssa_def * +blorp_nir_retile_y_to_w(nir_builder *b, nir_ssa_def *pos) +{ + assert(pos->num_components == 2); + nir_ssa_def *x_Y = nir_channel(b, pos, 0); + nir_ssa_def *y_Y = nir_channel(b, pos, 1); + + /* Given X and Y coordinates that describe an address using Y tiling, + * translate to the X and Y coordinates that describe the same address + * using W tiling. 
+ * + * If we break down the low order bits of X and Y, using a + * single letter to represent each low-order bit: + * + * X = A << 7 | 0bBCDEFGH + * Y = J << 5 | 0bKLMNP (1) + * + * Then we can apply the Y tiling formula to see the memory offset being + * addressed: + * + * offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH (2) + * + * If we apply the W detiling formula to this memory location, that the + * corresponding X' and Y' coordinates are: + * + * X' = A << 6 | 0bBCDPFH (3) + * Y' = J << 6 | 0bKLMNEG + * + * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'), + * we need to make the following computation: + * + * X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1 (4) + * Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1 + */ + nir_ssa_def *x_W = nir_imm_int(b, 0); + x_W = nir_mask_shift_or(b, x_W, x_Y, 0xfffffff4, -1); + x_W = nir_mask_shift_or(b, x_W, y_Y, 0x1, 2); + x_W = nir_mask_shift_or(b, x_W, x_Y, 0x1, 0); + + nir_ssa_def *y_W = nir_imm_int(b, 0); + y_W = nir_mask_shift_or(b, y_W, y_Y, 0xfffffffe, 1); + y_W = nir_mask_shift_or(b, y_W, x_Y, 0x8, -2); + y_W = nir_mask_shift_or(b, y_W, x_Y, 0x2, -1); + + return nir_vec2(b, x_W, y_W); +} + +/** + * Emit code to compensate for the difference between Y and W tiling. + * + * This code modifies the X and Y coordinates according to the formula: + * + * (X', Y', S') = detile(Y-MAJOR, tile(W-MAJOR, X, Y, S)) + * + * (See brw_blorp_build_nir_shader). + */ +static inline nir_ssa_def * +blorp_nir_retile_w_to_y(nir_builder *b, nir_ssa_def *pos) +{ + assert(pos->num_components == 2); + nir_ssa_def *x_W = nir_channel(b, pos, 0); + nir_ssa_def *y_W = nir_channel(b, pos, 1); + + /* Applying the same logic as above, but in reverse, we obtain the + * formulas: + * + * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1 + * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2 + */ + nir_ssa_def *x_Y = nir_imm_int(b, 0); + x_Y = nir_mask_shift_or(b, x_Y, x_W, 0xfffffffa, 1); + x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x2, 2); + x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x1, 1); + x_Y = nir_mask_shift_or(b, x_Y, x_W, 0x1, 0); + + nir_ssa_def *y_Y = nir_imm_int(b, 0); + y_Y = nir_mask_shift_or(b, y_Y, y_W, 0xfffffffc, -1); + y_Y = nir_mask_shift_or(b, y_Y, x_W, 0x4, -2); + + return nir_vec2(b, x_Y, y_Y); +} + +/** + * Emit code to compensate for the difference between MSAA and non-MSAA + * surfaces. + * + * This code modifies the X and Y coordinates according to the formula: + * + * (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S) + * + * (See brw_blorp_blit_program). + */ +static inline nir_ssa_def * +blorp_nir_encode_msaa(nir_builder *b, nir_ssa_def *pos, + unsigned num_samples, enum isl_msaa_layout layout) +{ + assert(pos->num_components == 2 || pos->num_components == 3); + + switch (layout) { + case ISL_MSAA_LAYOUT_NONE: + assert(pos->num_components == 2); + return pos; + case ISL_MSAA_LAYOUT_ARRAY: + /* No translation needed */ + return pos; + case ISL_MSAA_LAYOUT_INTERLEAVED: { + nir_ssa_def *x_in = nir_channel(b, pos, 0); + nir_ssa_def *y_in = nir_channel(b, pos, 1); + nir_ssa_def *s_in = pos->num_components == 2 ? 
nir_imm_int(b, 0) : + nir_channel(b, pos, 2); + + nir_ssa_def *x_out = nir_imm_int(b, 0); + nir_ssa_def *y_out = nir_imm_int(b, 0); + switch (num_samples) { + case 2: + case 4: + /* encode_msaa(2, IMS, X, Y, S) = (X', Y', 0) + * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1) + * Y' = Y + * + * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0) + * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1) + * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1) + */ + x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 1); + x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1); + x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); + if (num_samples == 2) { + y_out = y_in; + } else { + y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1); + y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0); + y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); + } + break; + + case 8: + /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0) + * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 + * | (X & 0b1) + * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1) + */ + x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2); + x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0); + x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1); + x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); + y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1); + y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0); + y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); + break; + + case 16: + /* encode_msaa(16, IMS, X, Y, S) = (X', Y', 0) + * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 + * | (X & 0b1) + * Y' = (Y & ~0b1) << 2 | (S & 0b1000) >> 1 (S & 0b10) + * | (Y & 0b1) + */ + x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2); + x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0); + x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1); + x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); + y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 2); + y_out = nir_mask_shift_or(b, y_out, s_in, 0x8, -1); + y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0); + y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); + break; + + default: + unreachable("Invalid number of samples for IMS layout"); + } + + return nir_vec2(b, x_out, y_out); + } + + default: + unreachable("Invalid MSAA layout"); + } +} + +/** + * Emit code to compensate for the difference between MSAA and non-MSAA + * surfaces. + * + * This code modifies the X and Y coordinates according to the formula: + * + * (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S) + * + * (See brw_blorp_blit_program). + */ +static inline nir_ssa_def * +blorp_nir_decode_msaa(nir_builder *b, nir_ssa_def *pos, + unsigned num_samples, enum isl_msaa_layout layout) +{ + assert(pos->num_components == 2 || pos->num_components == 3); + + switch (layout) { + case ISL_MSAA_LAYOUT_NONE: + /* No translation necessary, and S should already be zero. */ + assert(pos->num_components == 2); + return pos; + case ISL_MSAA_LAYOUT_ARRAY: + /* No translation necessary. 
*/ + return pos; + case ISL_MSAA_LAYOUT_INTERLEAVED: { + assert(pos->num_components == 2); + + nir_ssa_def *x_in = nir_channel(b, pos, 0); + nir_ssa_def *y_in = nir_channel(b, pos, 1); + + nir_ssa_def *x_out = nir_imm_int(b, 0); + nir_ssa_def *y_out = nir_imm_int(b, 0); + nir_ssa_def *s_out = nir_imm_int(b, 0); + switch (num_samples) { + case 2: + case 4: + /* decode_msaa(2, IMS, X, Y, 0) = (X', Y', S) + * where X' = (X & ~0b11) >> 1 | (X & 0b1) + * S = (X & 0b10) >> 1 + * + * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S) + * where X' = (X & ~0b11) >> 1 | (X & 0b1) + * Y' = (Y & ~0b11) >> 1 | (Y & 0b1) + * S = (Y & 0b10) | (X & 0b10) >> 1 + */ + x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffc, -1); + x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); + if (num_samples == 2) { + y_out = y_in; + s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); + } else { + y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1); + y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); + s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); + s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0); + } + break; + + case 8: + /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S) + * where X' = (X & ~0b111) >> 2 | (X & 0b1) + * Y' = (Y & ~0b11) >> 1 | (Y & 0b1) + * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1 + */ + x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2); + x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); + y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1); + y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); + s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0); + s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0); + s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); + break; + + case 16: + /* decode_msaa(16, IMS, X, Y, 0) = (X', Y', S) + * where X' = (X & ~0b111) >> 2 | (X & 0b1) + * Y' = (Y & ~0b111) >> 2 | (Y & 0b1) + * S = (Y & 0b100) << 1 | (X & 0b100) | + * (Y & 0b10) | (X & 0b10) >> 1 + */ + x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2); + x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); + y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffff8, -2); + y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); + s_out = nir_mask_shift_or(b, s_out, y_in, 0x4, 1); + s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0); + s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0); + s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); + break; + + default: + unreachable("Invalid number of samples for IMS layout"); + } + + return nir_vec3(b, x_out, y_out, s_out); + } + + default: + unreachable("Invalid MSAA layout"); + } +} + +/** + * Count the number of trailing 1 bits in the given value. For example: + * + * count_trailing_one_bits(0) == 0 + * count_trailing_one_bits(7) == 3 + * count_trailing_one_bits(11) == 2 + */ +static inline int count_trailing_one_bits(unsigned value) +{ +#ifdef HAVE___BUILTIN_CTZ + return __builtin_ctz(~value); +#else + return _mesa_bitcount(value & ~(value + 1)); +#endif +} + +static nir_ssa_def * +blorp_nir_manual_blend_average(nir_builder *b, struct brw_blorp_blit_vars *v, + nir_ssa_def *pos, unsigned tex_samples, + enum isl_aux_usage tex_aux_usage, + nir_alu_type dst_type) +{ + /* If non-null, this is the outer-most if statement */ + nir_if *outer_if = NULL; + + nir_variable *color = + nir_local_variable_create(b->impl, glsl_vec4_type(), "color"); + + nir_ssa_def *mcs = NULL; + if (tex_aux_usage == ISL_AUX_USAGE_MCS) + mcs = blorp_nir_txf_ms_mcs(b, v, pos); + + /* We add together samples using a binary tree structure, e.g. 
for 4x MSAA: + * + * result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4 + * + * This ensures that when all samples have the same value, no numerical + * precision is lost, since each addition operation always adds two equal + * values, and summing two equal floating point values does not lose + * precision. + * + * We perform this computation by treating the texture_data array as a + * stack and performing the following operations: + * + * - push sample 0 onto stack + * - push sample 1 onto stack + * - add top two stack entries + * - push sample 2 onto stack + * - push sample 3 onto stack + * - add top two stack entries + * - add top two stack entries + * - divide top stack entry by 4 + * + * Note that after pushing sample i onto the stack, the number of add + * operations we do is equal to the number of trailing 1 bits in i. This + * works provided the total number of samples is a power of two, which it + * always is for i965. + * + * For integer formats, we replace the add operations with average + * operations and skip the final division. + */ + nir_ssa_def *texture_data[5]; + unsigned stack_depth = 0; + for (unsigned i = 0; i < tex_samples; ++i) { + assert(stack_depth == _mesa_bitcount(i)); /* Loop invariant */ + + /* Push sample i onto the stack */ + assert(stack_depth < ARRAY_SIZE(texture_data)); + + nir_ssa_def *ms_pos = nir_vec3(b, nir_channel(b, pos, 0), + nir_channel(b, pos, 1), + nir_imm_int(b, i)); + texture_data[stack_depth++] = blorp_nir_txf_ms(b, v, ms_pos, mcs, dst_type); + + if (i == 0 && tex_aux_usage == ISL_AUX_USAGE_MCS) { + /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface) + * suggests an optimization: + * + * "A simple optimization with probable large return in + * performance is to compare the MCS value to zero (indicating + * all samples are on sample slice 0), and sample only from + * sample slice 0 using ld2dss if MCS is zero." + * + * Note that in the case where the MCS value is zero, sampling from + * sample slice 0 using ld2dss and sampling from sample 0 using + * ld2dms are equivalent (since all samples are on sample slice 0). + * Since we have already sampled from sample 0, all we need to do is + * skip the remaining fetches and averaging if MCS is zero. + */ + nir_ssa_def *mcs_zero = + nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, 0)); + if (tex_samples == 16) { + mcs_zero = nir_iand(b, mcs_zero, + nir_ieq(b, nir_channel(b, mcs, 1), nir_imm_int(b, 0))); + } + + nir_if *if_stmt = nir_if_create(b->shader); + if_stmt->condition = nir_src_for_ssa(mcs_zero); + nir_cf_node_insert(b->cursor, &if_stmt->cf_node); + + b->cursor = nir_after_cf_list(&if_stmt->then_list); + nir_store_var(b, color, texture_data[0], 0xf); + + b->cursor = nir_after_cf_list(&if_stmt->else_list); + outer_if = if_stmt; + } + + for (int j = 0; j < count_trailing_one_bits(i); j++) { + assert(stack_depth >= 2); + --stack_depth; + + assert(dst_type == nir_type_float); + texture_data[stack_depth - 1] = + nir_fadd(b, texture_data[stack_depth - 1], + texture_data[stack_depth]); + } + } + + /* We should have just 1 sample on the stack now. 
*/ + assert(stack_depth == 1); + + texture_data[0] = nir_fmul(b, texture_data[0], + nir_imm_float(b, 1.0 / tex_samples)); + + nir_store_var(b, color, texture_data[0], 0xf); + + if (outer_if) + b->cursor = nir_after_cf_node(&outer_if->cf_node); + + return nir_load_var(b, color); +} + +static inline nir_ssa_def * +nir_imm_vec2(nir_builder *build, float x, float y) +{ + nir_const_value v; + + memset(&v, 0, sizeof(v)); + v.f32[0] = x; + v.f32[1] = y; + + return nir_build_imm(build, 4, 32, v); +} + +static nir_ssa_def * +blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos, + unsigned tex_samples, + const struct brw_blorp_blit_prog_key *key, + struct brw_blorp_blit_vars *v) +{ + nir_ssa_def *pos_xy = nir_channels(b, pos, 0x3); + nir_ssa_def *rect_grid = nir_load_var(b, v->v_rect_grid); + nir_ssa_def *scale = nir_imm_vec2(b, key->x_scale, key->y_scale); + + /* Translate coordinates to lay out the samples in a rectangular grid + * roughly corresponding to sample locations. + */ + pos_xy = nir_fmul(b, pos_xy, scale); + /* Adjust coordinates so that integers represent pixel centers rather + * than pixel edges. + */ + pos_xy = nir_fadd(b, pos_xy, nir_imm_float(b, -0.5)); + /* Clamp the X, Y texture coordinates to properly handle the sampling of + * texels on texture edges. + */ + pos_xy = nir_fmin(b, nir_fmax(b, pos_xy, nir_imm_float(b, 0.0)), + nir_vec2(b, nir_channel(b, rect_grid, 0), + nir_channel(b, rect_grid, 1))); + + /* Store the fractional parts to be used as bilinear interpolation + * coefficients. + */ + nir_ssa_def *frac_xy = nir_ffract(b, pos_xy); + /* Round the float coordinates down to nearest integer */ + pos_xy = nir_fdiv(b, nir_ftrunc(b, pos_xy), scale); + + nir_ssa_def *tex_data[4]; + for (unsigned i = 0; i < 4; ++i) { + float sample_off_x = (float)(i & 0x1) / key->x_scale; + float sample_off_y = (float)((i >> 1) & 0x1) / key->y_scale; + nir_ssa_def *sample_off = nir_imm_vec2(b, sample_off_x, sample_off_y); + + nir_ssa_def *sample_coords = nir_fadd(b, pos_xy, sample_off); + nir_ssa_def *sample_coords_int = nir_f2i(b, sample_coords); + + /* The MCS value we fetch has to match up with the pixel that we're + * sampling from. Since we sample from different pixels in each + * iteration of this "for" loop, the call to mcs_fetch() should be + * here inside the loop after computing the pixel coordinates. + */ + nir_ssa_def *mcs = NULL; + if (key->tex_aux_usage == ISL_AUX_USAGE_MCS) + mcs = blorp_nir_txf_ms_mcs(b, v, sample_coords_int); + + /* Compute sample index and map the sample index to a sample number. + * Sample index layout shows the numbering of slots in a rectangular + * grid of samples with in a pixel. Sample number layout shows the + * rectangular grid of samples roughly corresponding to the real sample + * locations with in a pixel. + * In case of 4x MSAA, layout of sample indices matches the layout of + * sample numbers: + * --------- + * | 0 | 1 | + * --------- + * | 2 | 3 | + * --------- + * + * In case of 8x MSAA the two layouts don't match. + * sample index layout : --------- sample number layout : --------- + * | 0 | 1 | | 3 | 7 | + * --------- --------- + * | 2 | 3 | | 5 | 0 | + * --------- --------- + * | 4 | 5 | | 1 | 2 | + * --------- --------- + * | 6 | 7 | | 4 | 6 | + * --------- --------- + * + * Fortunately, this can be done fairly easily as: + * S' = (0x17306425 >> (S * 4)) & 0xf + * + * In the case of 16x MSAA the two layouts don't match. 
+ * Sample index layout: Sample number layout: + * --------------------- --------------------- + * | 0 | 1 | 2 | 3 | | 15 | 10 | 9 | 7 | + * --------------------- --------------------- + * | 4 | 5 | 6 | 7 | | 4 | 1 | 3 | 13 | + * --------------------- --------------------- + * | 8 | 9 | 10 | 11 | | 12 | 2 | 0 | 6 | + * --------------------- --------------------- + * | 12 | 13 | 14 | 15 | | 11 | 8 | 5 | 14 | + * --------------------- --------------------- + * + * This is equivalent to + * S' = (0xe58b602cd31479af >> (S * 4)) & 0xf + */ + nir_ssa_def *frac = nir_ffract(b, sample_coords); + nir_ssa_def *sample = + nir_fdot2(b, frac, nir_imm_vec2(b, key->x_scale, + key->x_scale * key->y_scale)); + sample = nir_f2i(b, sample); + + if (tex_samples == 8) { + sample = nir_iand(b, nir_ishr(b, nir_imm_int(b, 0x64210573), + nir_ishl(b, sample, nir_imm_int(b, 2))), + nir_imm_int(b, 0xf)); + } else if (tex_samples == 16) { + nir_ssa_def *sample_low = + nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xd31479af), + nir_ishl(b, sample, nir_imm_int(b, 2))), + nir_imm_int(b, 0xf)); + nir_ssa_def *sample_high = + nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xe58b602c), + nir_ishl(b, nir_iadd(b, sample, + nir_imm_int(b, -8)), + nir_imm_int(b, 2))), + nir_imm_int(b, 0xf)); + + sample = nir_bcsel(b, nir_ilt(b, sample, nir_imm_int(b, 8)), + sample_low, sample_high); + } + nir_ssa_def *pos_ms = nir_vec3(b, nir_channel(b, sample_coords_int, 0), + nir_channel(b, sample_coords_int, 1), + sample); + tex_data[i] = blorp_nir_txf_ms(b, v, pos_ms, mcs, key->texture_data_type); + } + + nir_ssa_def *frac_x = nir_channel(b, frac_xy, 0); + nir_ssa_def *frac_y = nir_channel(b, frac_xy, 1); + return nir_flrp(b, nir_flrp(b, tex_data[0], tex_data[1], frac_x), + nir_flrp(b, tex_data[2], tex_data[3], frac_x), + frac_y); +} + +/** + * Generator for WM programs used in BLORP blits. + * + * The bulk of the work done by the WM program is to wrap and unwrap the + * coordinate transformations used by the hardware to store surfaces in + * memory. 
The hardware transforms a pixel location (X, Y, S) (where S is the + * sample index for a multisampled surface) to a memory offset by the + * following formulas: + * + * offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S)) + * (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset)) + * + * For a single-sampled surface, or for a multisampled surface using + * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity + * function: + * + * encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0) + * decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0) + * encode_msaa(n, UMS, X, Y, S) = (X, Y, S) + * decode_msaa(n, UMS, X, Y, S) = (X, Y, S) + * + * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa() + * embeds the sample number into bit 1 of the X and Y coordinates: + * + * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0) + * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1) + * Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1) + * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S) + * where X' = (X & ~0b11) >> 1 | (X & 0b1) + * Y' = (Y & ~0b11) >> 1 | (Y & 0b1) + * S = (Y & 0b10) | (X & 0b10) >> 1 + * + * For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa() + * embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of + * the Y coordinate: + * + * encode_msaa(8, IMS, X, Y, S) = (X', Y', 0) + * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 | (X & 0b1) + * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1) + * decode_msaa(8, IMS, X, Y, 0) = (X', Y', S) + * where X' = (X & ~0b111) >> 2 | (X & 0b1) + * Y' = (Y & ~0b11) >> 1 | (Y & 0b1) + * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1 + * + * For X tiling, tile() combines together the low-order bits of the X and Y + * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512 + * bytes wide and 8 rows high: + * + * tile(x_tiled, X, Y, S) = A + * where A = tile_num << 12 | offset + * tile_num = (Y' >> 3) * tile_pitch + (X' >> 9) + * offset = (Y' & 0b111) << 9 + * | (X & 0b111111111) + * X' = X * cpp + * Y' = Y + S * qpitch + * detile(x_tiled, A) = (X, Y, S) + * where X = X' / cpp + * Y = Y' % qpitch + * S = Y' / qpitch + * Y' = (tile_num / tile_pitch) << 3 + * | (A & 0b111000000000) >> 9 + * X' = (tile_num % tile_pitch) << 9 + * | (A & 0b111111111) + * + * (In all tiling formulas, cpp is the number of bytes occupied by a single + * sample ("chars per pixel"), tile_pitch is the number of 4k tiles required + * to fill the width of the surface, and qpitch is the spacing (in rows) + * between array slices). 
+ * + * For Y tiling, tile() combines together the low-order bits of the X and Y + * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128 + * bytes wide and 32 rows high: + * + * tile(y_tiled, X, Y, S) = A + * where A = tile_num << 12 | offset + * tile_num = (Y' >> 5) * tile_pitch + (X' >> 7) + * offset = (X' & 0b1110000) << 5 + * | (Y' & 0b11111) << 4 + * | (X' & 0b1111) + * X' = X * cpp + * Y' = Y + S * qpitch + * detile(y_tiled, A) = (X, Y, S) + * where X = X' / cpp + * Y = Y' % qpitch + * S = Y' / qpitch + * Y' = (tile_num / tile_pitch) << 5 + * | (A & 0b111110000) >> 4 + * X' = (tile_num % tile_pitch) << 7 + * | (A & 0b111000000000) >> 5 + * | (A & 0b1111) + * + * For W tiling, tile() combines together the low-order bits of the X and Y + * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64 + * bytes wide and 64 rows high (note that W tiling is only used for stencil + * buffers, which always have cpp = 1 and S=0): + * + * tile(w_tiled, X, Y, S) = A + * where A = tile_num << 12 | offset + * tile_num = (Y' >> 6) * tile_pitch + (X' >> 6) + * offset = (X' & 0b111000) << 6 + * | (Y' & 0b111100) << 3 + * | (X' & 0b100) << 2 + * | (Y' & 0b10) << 2 + * | (X' & 0b10) << 1 + * | (Y' & 0b1) << 1 + * | (X' & 0b1) + * X' = X * cpp = X + * Y' = Y + S * qpitch + * detile(w_tiled, A) = (X, Y, S) + * where X = X' / cpp = X' + * Y = Y' % qpitch = Y' + * S = Y / qpitch = 0 + * Y' = (tile_num / tile_pitch) << 6 + * | (A & 0b111100000) >> 3 + * | (A & 0b1000) >> 2 + * | (A & 0b10) >> 1 + * X' = (tile_num % tile_pitch) << 6 + * | (A & 0b111000000000) >> 6 + * | (A & 0b10000) >> 2 + * | (A & 0b100) >> 1 + * | (A & 0b1) + * + * Finally, for a non-tiled surface, tile() simply combines together the X and + * Y coordinates in the natural way: + * + * tile(untiled, X, Y, S) = A + * where A = Y * pitch + X' + * X' = X * cpp + * Y' = Y + S * qpitch + * detile(untiled, A) = (X, Y, S) + * where X = X' / cpp + * Y = Y' % qpitch + * S = Y' / qpitch + * X' = A % pitch + * Y' = A / pitch + * + * (In these formulas, pitch is the number of bytes occupied by a single row + * of samples). + */ +static nir_shader * +brw_blorp_build_nir_shader(struct blorp_context *blorp, + const struct brw_blorp_blit_prog_key *key) +{ + const struct brw_device_info *devinfo = blorp->isl_dev->info; + nir_ssa_def *src_pos, *dst_pos, *color; + + /* Sanity checks */ + if (key->dst_tiled_w && key->rt_samples > 1) { + /* If the destination image is W tiled and multisampled, then the thread + * must be dispatched once per sample, not once per pixel. This is + * necessary because after conversion between W and Y tiling, there's no + * guarantee that all samples corresponding to a single pixel will still + * be together. + */ + assert(key->persample_msaa_dispatch); + } + + if (key->blend) { + /* We are blending, which means we won't have an opportunity to + * translate the tiling and sample count for the texture surface. So + * the surface state for the texture must be configured with the correct + * tiling and sample count. + */ + assert(!key->src_tiled_w); + assert(key->tex_samples == key->src_samples); + assert(key->tex_layout == key->src_layout); + assert(key->tex_samples > 0); + } + + if (key->persample_msaa_dispatch) { + /* It only makes sense to do persample dispatch if the render target is + * configured as multisampled. 
+ */ + assert(key->rt_samples > 0); + } + + /* Make sure layout is consistent with sample count */ + assert((key->tex_layout == ISL_MSAA_LAYOUT_NONE) == + (key->tex_samples <= 1)); + assert((key->rt_layout == ISL_MSAA_LAYOUT_NONE) == + (key->rt_samples <= 1)); + assert((key->src_layout == ISL_MSAA_LAYOUT_NONE) == + (key->src_samples <= 1)); + assert((key->dst_layout == ISL_MSAA_LAYOUT_NONE) == + (key->dst_samples <= 1)); + + nir_builder b; + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL); + + struct brw_blorp_blit_vars v; + brw_blorp_blit_vars_init(&b, &v, key); + + dst_pos = blorp_blit_get_frag_coords(&b, key, &v); + + /* Render target and texture hardware don't support W tiling until Gen8. */ + const bool rt_tiled_w = false; + const bool tex_tiled_w = devinfo->gen >= 8 && key->src_tiled_w; + + /* The address that data will be written to is determined by the + * coordinates supplied to the WM thread and the tiling and sample count of + * the render target, according to the formula: + * + * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset)) + * + * If the actual tiling and sample count of the destination surface are not + * the same as the configuration of the render target, then these + * coordinates are wrong and we have to adjust them to compensate for the + * difference. + */ + if (rt_tiled_w != key->dst_tiled_w || + key->rt_samples != key->dst_samples || + key->rt_layout != key->dst_layout) { + dst_pos = blorp_nir_encode_msaa(&b, dst_pos, key->rt_samples, + key->rt_layout); + /* Now (X, Y, S) = detile(rt_tiling, offset) */ + if (rt_tiled_w != key->dst_tiled_w) + dst_pos = blorp_nir_retile_y_to_w(&b, dst_pos); + /* Now (X, Y, S) = detile(rt_tiling, offset) */ + dst_pos = blorp_nir_decode_msaa(&b, dst_pos, key->dst_samples, + key->dst_layout); + } + + /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)). + * + * That is: X, Y and S now contain the true coordinates and sample index of + * the data that the WM thread should output. + * + * If we need to kill pixels that are outside the destination rectangle, + * now is the time to do it. + */ + if (key->use_kill) { + assert(!(key->blend && key->blit_scaled)); + blorp_nir_discard_if_outside_rect(&b, dst_pos, &v); + } + + src_pos = blorp_blit_apply_transform(&b, nir_i2f(&b, dst_pos), &v); + if (dst_pos->num_components == 3) { + /* The sample coordinate is an integer that we want left alone but + * blorp_blit_apply_transform() blindly applies the transform to all + * three coordinates. Grab the original sample index. + */ + src_pos = nir_vec3(&b, nir_channel(&b, src_pos, 0), + nir_channel(&b, src_pos, 1), + nir_channel(&b, dst_pos, 2)); + } + + /* If the source image is not multisampled, then we want to fetch sample + * number 0, because that's the only sample there is. + */ + if (key->src_samples == 1) + src_pos = nir_channels(&b, src_pos, 0x3); + + /* X, Y, and S are now the coordinates of the pixel in the source image + * that we want to texture from. Exception: if we are blending, then S is + * irrelevant, because we are going to fetch all samples. + */ + if (key->blend && !key->blit_scaled) { + /* Resolves (effecively) use texelFetch, so we need integers and we + * don't care about the sample index if we got one. + */ + src_pos = nir_f2i(&b, nir_channels(&b, src_pos, 0x3)); + + if (devinfo->gen == 6) { + /* Because gen6 only supports 4x interleved MSAA, we can do all the + * blending we need with a single linear-interpolated texture lookup + * at the center of the sample. 
The texture coordinates to be odd + * integers so that they correspond to the center of a 2x2 block + * representing the four samples that maxe up a pixel. So we need + * to multiply our X and Y coordinates each by 2 and then add 1. + */ + src_pos = nir_ishl(&b, src_pos, nir_imm_int(&b, 1)); + src_pos = nir_iadd(&b, src_pos, nir_imm_int(&b, 1)); + src_pos = nir_i2f(&b, src_pos); + color = blorp_nir_tex(&b, &v, src_pos, key->texture_data_type); + } else { + /* Gen7+ hardware doesn't automaticaly blend. */ + color = blorp_nir_manual_blend_average(&b, &v, src_pos, key->src_samples, + key->tex_aux_usage, + key->texture_data_type); + } + } else if (key->blend && key->blit_scaled) { + assert(!key->use_kill); + color = blorp_nir_manual_blend_bilinear(&b, src_pos, key->src_samples, key, &v); + } else { + if (key->bilinear_filter) { + color = blorp_nir_tex(&b, &v, src_pos, key->texture_data_type); + } else { + /* We're going to use texelFetch, so we need integers */ + if (src_pos->num_components == 2) { + src_pos = nir_f2i(&b, src_pos); + } else { + assert(src_pos->num_components == 3); + src_pos = nir_vec3(&b, nir_channel(&b, nir_f2i(&b, src_pos), 0), + nir_channel(&b, nir_f2i(&b, src_pos), 1), + nir_channel(&b, src_pos, 2)); + } + + /* We aren't blending, which means we just want to fetch a single + * sample from the source surface. The address that we want to fetch + * from is related to the X, Y and S values according to the formula: + * + * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)). + * + * If the actual tiling and sample count of the source surface are + * not the same as the configuration of the texture, then we need to + * adjust the coordinates to compensate for the difference. + */ + if (tex_tiled_w != key->src_tiled_w || + key->tex_samples != key->src_samples || + key->tex_layout != key->src_layout) { + src_pos = blorp_nir_encode_msaa(&b, src_pos, key->src_samples, + key->src_layout); + /* Now (X, Y, S) = detile(src_tiling, offset) */ + if (tex_tiled_w != key->src_tiled_w) + src_pos = blorp_nir_retile_w_to_y(&b, src_pos); + /* Now (X, Y, S) = detile(tex_tiling, offset) */ + src_pos = blorp_nir_decode_msaa(&b, src_pos, key->tex_samples, + key->tex_layout); + } + + /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)). + * + * In other words: X, Y, and S now contain values which, when passed to + * the texturing unit, will cause data to be read from the correct + * memory location. So we can fetch the texel now. + */ + if (key->src_samples == 1) { + color = blorp_nir_txf(&b, &v, src_pos, key->texture_data_type); + } else { + nir_ssa_def *mcs = NULL; + if (key->tex_aux_usage == ISL_AUX_USAGE_MCS) + mcs = blorp_nir_txf_ms_mcs(&b, &v, src_pos); + + color = blorp_nir_txf_ms(&b, &v, src_pos, mcs, key->texture_data_type); + } + } + } + + nir_store_var(&b, v.color_out, color, 0xf); + + return b.shader; +} + +static void +brw_blorp_get_blit_kernel(struct blorp_context *blorp, + struct blorp_params *params, + const struct brw_blorp_blit_prog_key *prog_key) +{ + if (blorp->lookup_shader(blorp, prog_key, sizeof(*prog_key), + ¶ms->wm_prog_kernel, ¶ms->wm_prog_data)) + return; + + const unsigned *program; + unsigned program_size; + struct brw_blorp_prog_data prog_data; + + /* Try and compile with NIR first. If that fails, fall back to the old + * method of building shaders manually. 
+ */
+ nir_shader *nir = brw_blorp_build_nir_shader(blorp, prog_key);
+ struct brw_wm_prog_key wm_key;
+ brw_blorp_init_wm_prog_key(&wm_key);
+ wm_key.tex.compressed_multisample_layout_mask =
+ prog_key->tex_aux_usage == ISL_AUX_USAGE_MCS;
+ wm_key.tex.msaa_16 = prog_key->tex_samples == 16;
+ wm_key.multisample_fbo = prog_key->rt_samples > 1;
+
+ program = brw_blorp_compile_nir_shader(blorp, nir, &wm_key, false,
+ &prog_data, &program_size);
+
+ blorp->upload_shader(blorp, prog_key, sizeof(*prog_key),
+ program, program_size,
+ &prog_data, sizeof(prog_data),
+ &params->wm_prog_kernel, &params->wm_prog_data);
+}
+
+static void
+brw_blorp_setup_coord_transform(struct brw_blorp_coord_transform *xform,
+ GLfloat src0, GLfloat src1,
+ GLfloat dst0, GLfloat dst1,
+ bool mirror)
+{
+ float scale = (src1 - src0) / (dst1 - dst0);
+ if (!mirror) {
+ /* When not mirroring a coordinate (say, X), we need:
+ * src_x - src_x0 = (dst_x - dst_x0 + 0.5) * scale
+ * Therefore:
+ * src_x = src_x0 + (dst_x - dst_x0 + 0.5) * scale
+ *
+ * The blorp program uses "round toward zero" to convert the
+ * transformed floating point coordinates to integer coordinates,
+ * whereas the behaviour we actually want is "round to nearest",
+ * so 0.5 provides the necessary correction.
+ */
+ xform->multiplier = scale;
+ xform->offset = src0 + (-dst0 + 0.5f) * scale;
+ } else {
+ /* When mirroring X we need:
+ * src_x - src_x0 = (dst_x1 - dst_x - 0.5) * scale
+ * Therefore:
+ * src_x = src_x0 + (dst_x1 - dst_x - 0.5) * scale
+ */
+ xform->multiplier = -scale;
+ xform->offset = src0 + (dst1 - 0.5f) * scale;
+ }
+}
+
+/**
+ * Convert a swizzle enumeration (i.e. SWIZZLE_X) to one of the Gen7.5+
+ * "Shader Channel Select" enumerations (i.e. HSW_SCS_RED). The mappings are
+ *
+ * SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_ZERO, SWIZZLE_ONE
+ * 0 1 2 3 4 5
+ * 4 5 6 7 0 1
+ * SCS_RED, SCS_GREEN, SCS_BLUE, SCS_ALPHA, SCS_ZERO, SCS_ONE
+ *
+ * which is simply adding 4 then modding by 8 (or anding with 7).
+ *
+ * We then may need to apply workarounds for textureGather hardware bugs.
+ */
+static enum isl_channel_select
+swizzle_to_scs(GLenum swizzle)
+{
+ return (enum isl_channel_select)((swizzle + 4) & 7);
+}
+
+static void
+surf_convert_to_single_slice(const struct isl_device *isl_dev,
+ struct brw_blorp_surface_info *info)
+{
+ /* This only makes sense for a single level and array slice */
+ assert(info->view.levels == 1 && info->view.array_len == 1);
+
+ /* Just bail if we have nothing to do. */
+ if (info->surf.dim == ISL_SURF_DIM_2D &&
+ info->view.base_level == 0 && info->view.base_array_layer == 0 &&
+ info->surf.levels == 1 && info->surf.logical_level0_px.array_len == 1)
+ return;
+
+ uint32_t x_offset_sa, y_offset_sa;
+ isl_surf_get_image_offset_sa(&info->surf, info->view.base_level,
+ info->view.base_array_layer, 0,
+ &x_offset_sa, &y_offset_sa);
+
+ uint32_t byte_offset;
+ isl_tiling_get_intratile_offset_sa(isl_dev, info->surf.tiling,
+ info->view.format, info->surf.row_pitch,
+ x_offset_sa, y_offset_sa,
+ &byte_offset,
+ &info->tile_x_sa, &info->tile_y_sa);
+ info->addr.offset += byte_offset;
+
+ /* TODO: We should just use designated initializers here.
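
Editor's worked example for brw_blorp_setup_coord_transform() above (numbers are made up, not from the source): map the source X range [8, 16) onto the destination X range [0, 32) without mirroring.

   scale      = (16 - 8) / (32 - 0)     = 0.25
   multiplier = 0.25
   offset     = 8 + (-0 + 0.5) * 0.25   = 8.125

   dst_x = 0   ->  src_x = 0  * 0.25 + 8.125 = 8.125   ->  texel 8  (first)
   dst_x = 31  ->  src_x = 31 * 0.25 + 8.125 = 15.875  ->  texel 15 (last)

With the +0.5 correction, the round-toward-zero conversion done in the shader lands on the expected source texels across the whole destination range.
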
+ */ + struct isl_surf_init_info init_info = { 0, }; + + init_info.dim = ISL_SURF_DIM_2D; + init_info.format = ISL_FORMAT_R8_UINT; + init_info.width = + minify(info->surf.logical_level0_px.width, info->view.base_level); + init_info.height = + minify(info->surf.logical_level0_px.height, info->view.base_level); + init_info.depth = 1; + init_info.levels = 1; + init_info.array_len = 1; + init_info.samples = info->surf.samples; + init_info.min_pitch = info->surf.row_pitch; + init_info.usage = info->surf.usage; + init_info.tiling_flags = 1 << info->surf.tiling; + + isl_surf_init_s(isl_dev, &info->surf, &init_info); + assert(info->surf.row_pitch == init_info.min_pitch); + + /* The view is also different now. */ + info->view.base_level = 0; + info->view.levels = 1; + info->view.base_array_layer = 0; + info->view.array_len = 1; +} + +static void +surf_fake_interleaved_msaa(const struct isl_device *isl_dev, + struct brw_blorp_surface_info *info) +{ + assert(info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED); + + /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */ + surf_convert_to_single_slice(isl_dev, info); + + info->surf.logical_level0_px = info->surf.phys_level0_sa; + info->surf.samples = 1; + info->surf.msaa_layout = ISL_MSAA_LAYOUT_NONE; +} + +static void +surf_retile_w_to_y(const struct isl_device *isl_dev, + struct brw_blorp_surface_info *info) +{ + assert(info->surf.tiling == ISL_TILING_W); + + /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */ + surf_convert_to_single_slice(isl_dev, info); + + /* On gen7+, we don't have interleaved multisampling for color render + * targets so we have to fake it. + * + * TODO: Are we sure we don't also need to fake it on gen6? + */ + if (isl_dev->info->gen > 6 && + info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) { + info->surf.logical_level0_px = info->surf.phys_level0_sa; + info->surf.samples = 1; + info->surf.msaa_layout = ISL_MSAA_LAYOUT_NONE; + } + + if (isl_dev->info->gen == 6) { + /* Gen6 stencil buffers have a very large alignment coming in from the + * miptree. It's out-of-bounds for what the surface state can handle. + * Since we have a single layer and level, it doesn't really matter as + * long as we don't pass a bogus value into isl_surf_fill_state(). + */ + info->surf.image_alignment_el = isl_extent3d(4, 2, 1); + } + + /* Now that we've converted everything to a simple 2-D surface with only + * one miplevel, we can go about retiling it. + */ + const unsigned x_align = 8, y_align = info->surf.samples != 0 ? 
8 : 4; + info->surf.tiling = ISL_TILING_Y0; + info->surf.logical_level0_px.width = + ALIGN(info->surf.logical_level0_px.width, x_align) * 2; + info->surf.logical_level0_px.height = + ALIGN(info->surf.logical_level0_px.height, y_align) / 2; + info->tile_x_sa *= 2; + info->tile_y_sa /= 2; +} + +void +blorp_blit(struct blorp_batch *batch, + const struct blorp_surf *src_surf, + unsigned src_level, unsigned src_layer, + enum isl_format src_format, int src_swizzle, + const struct blorp_surf *dst_surf, + unsigned dst_level, unsigned dst_layer, + enum isl_format dst_format, + float src_x0, float src_y0, + float src_x1, float src_y1, + float dst_x0, float dst_y0, + float dst_x1, float dst_y1, + GLenum filter, bool mirror_x, bool mirror_y) +{ + const struct brw_device_info *devinfo = batch->blorp->isl_dev->info; + + struct blorp_params params; + blorp_params_init(¶ms); + + brw_blorp_surface_info_init(batch->blorp, ¶ms.src, src_surf, src_level, + src_layer, src_format, false); + brw_blorp_surface_info_init(batch->blorp, ¶ms.dst, dst_surf, dst_level, + dst_layer, dst_format, true); + + struct brw_blorp_blit_prog_key wm_prog_key; + memset(&wm_prog_key, 0, sizeof(wm_prog_key)); + + if (isl_format_has_sint_channel(params.src.view.format)) { + wm_prog_key.texture_data_type = nir_type_int; + } else if (isl_format_has_uint_channel(params.src.view.format)) { + wm_prog_key.texture_data_type = nir_type_uint; + } else { + wm_prog_key.texture_data_type = nir_type_float; + } + + /* Scaled blitting or not. */ + wm_prog_key.blit_scaled = + ((dst_x1 - dst_x0) == (src_x1 - src_x0) && + (dst_y1 - dst_y0) == (src_y1 - src_y0)) ? false : true; + + /* Scaling factors used for bilinear filtering in multisample scaled + * blits. + */ + if (params.src.surf.samples == 16) + wm_prog_key.x_scale = 4.0f; + else + wm_prog_key.x_scale = 2.0f; + wm_prog_key.y_scale = params.src.surf.samples / wm_prog_key.x_scale; + + if (filter == GL_LINEAR && + params.src.surf.samples <= 1 && params.dst.surf.samples <= 1) + wm_prog_key.bilinear_filter = true; + + if ((params.src.surf.usage & ISL_SURF_USAGE_DEPTH_BIT) == 0 && + (params.src.surf.usage & ISL_SURF_USAGE_STENCIL_BIT) == 0 && + !isl_format_has_int_channel(params.src.surf.format) && + params.src.surf.samples > 1 && params.dst.surf.samples <= 1) { + /* We are downsampling a non-integer color buffer, so blend. + * + * Regarding integer color buffers, the OpenGL ES 3.2 spec says: + * + * "If the source formats are integer types or stencil values, a + * single sample's value is selected for each pixel." + * + * This implies we should not blend in that case. + */ + wm_prog_key.blend = true; + } + + /* src_samples and dst_samples are the true sample counts */ + wm_prog_key.src_samples = params.src.surf.samples; + wm_prog_key.dst_samples = params.dst.surf.samples; + + wm_prog_key.tex_aux_usage = params.src.aux_usage; + + /* src_layout and dst_layout indicate the true MSAA layout used by src and + * dst. + */ + wm_prog_key.src_layout = params.src.surf.msaa_layout; + wm_prog_key.dst_layout = params.dst.surf.msaa_layout; + + /* Round floating point values to nearest integer to avoid "off by one texel" + * kind of errors when blitting. 
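
For orientation, this is roughly what a call into the entry point being built here looks like from a driver: an unscaled, unmirrored copy of a 256x256 region between two single-sampled RGBA8 surfaces with nearest filtering. The batch and surface variables are hypothetical driver-side objects; only blorp_blit() itself, the isl format, GL_NEAREST, and Mesa's identity swizzle macro are real names.

   blorp_blit(&batch, &src, 0 /* level */, 0 /* layer */,
              ISL_FORMAT_R8G8B8A8_UNORM, SWIZZLE_XYZW,
              &dst, 0 /* level */, 0 /* layer */,
              ISL_FORMAT_R8G8B8A8_UNORM,
              0.0f, 0.0f, 256.0f, 256.0f,   /* source rectangle */
              0.0f, 0.0f, 256.0f, 256.0f,   /* destination rectangle */
              GL_NEAREST, false, false);
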
+ */ + params.x0 = params.wm_inputs.discard_rect.x0 = roundf(dst_x0); + params.y0 = params.wm_inputs.discard_rect.y0 = roundf(dst_y0); + params.x1 = params.wm_inputs.discard_rect.x1 = roundf(dst_x1); + params.y1 = params.wm_inputs.discard_rect.y1 = roundf(dst_y1); + + params.wm_inputs.rect_grid.x1 = + minify(params.src.surf.logical_level0_px.width, src_level) * + wm_prog_key.x_scale - 1.0f; + params.wm_inputs.rect_grid.y1 = + minify(params.src.surf.logical_level0_px.height, src_level) * + wm_prog_key.y_scale - 1.0f; + + brw_blorp_setup_coord_transform(¶ms.wm_inputs.coord_transform[0], + src_x0, src_x1, dst_x0, dst_x1, mirror_x); + brw_blorp_setup_coord_transform(¶ms.wm_inputs.coord_transform[1], + src_y0, src_y1, dst_y0, dst_y1, mirror_y); + + /* For some texture types, we need to pass the layer through the sampler. */ + params.wm_inputs.src_z = params.src.z_offset; + + if (devinfo->gen > 6 && + params.dst.surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) { + assert(params.dst.surf.samples > 1); + + /* We must expand the rectangle we send through the rendering pipeline, + * to account for the fact that we are mapping the destination region as + * single-sampled when it is in fact multisampled. We must also align + * it to a multiple of the multisampling pattern, because the + * differences between multisampled and single-sampled surface formats + * will mean that pixels are scrambled within the multisampling pattern. + * TODO: what if this makes the coordinates too large? + * + * Note: this only works if the destination surface uses the IMS layout. + * If it's UMS, then we have no choice but to set up the rendering + * pipeline as multisampled. + */ + switch (params.dst.surf.samples) { + case 2: + params.x0 = ROUND_DOWN_TO(params.x0 * 2, 4); + params.y0 = ROUND_DOWN_TO(params.y0, 4); + params.x1 = ALIGN(params.x1 * 2, 4); + params.y1 = ALIGN(params.y1, 4); + break; + case 4: + params.x0 = ROUND_DOWN_TO(params.x0 * 2, 4); + params.y0 = ROUND_DOWN_TO(params.y0 * 2, 4); + params.x1 = ALIGN(params.x1 * 2, 4); + params.y1 = ALIGN(params.y1 * 2, 4); + break; + case 8: + params.x0 = ROUND_DOWN_TO(params.x0 * 4, 8); + params.y0 = ROUND_DOWN_TO(params.y0 * 2, 4); + params.x1 = ALIGN(params.x1 * 4, 8); + params.y1 = ALIGN(params.y1 * 2, 4); + break; + case 16: + params.x0 = ROUND_DOWN_TO(params.x0 * 4, 8); + params.y0 = ROUND_DOWN_TO(params.y0 * 4, 8); + params.x1 = ALIGN(params.x1 * 4, 8); + params.y1 = ALIGN(params.y1 * 4, 8); + break; + default: + unreachable("Unrecognized sample count in brw_blorp_blit_params ctor"); + } + + surf_fake_interleaved_msaa(batch->blorp->isl_dev, ¶ms.dst); + + wm_prog_key.use_kill = true; + } + + if (params.dst.surf.tiling == ISL_TILING_W) { + /* We must modify the rectangle we send through the rendering pipeline + * (and the size and x/y offset of the destination surface), to account + * for the fact that we are mapping it as Y-tiled when it is in fact + * W-tiled. + * + * Both Y tiling and W tiling can be understood as organizations of + * 32-byte sub-tiles; within each 32-byte sub-tile, the layout of pixels + * is different, but the layout of the 32-byte sub-tiles within the 4k + * tile is the same (8 sub-tiles across by 16 sub-tiles down, in + * column-major order). In Y tiling, the sub-tiles are 16 bytes wide + * and 2 rows high; in W tiling, they are 8 bytes wide and 4 rows high. 
+ * + * Therefore, to account for the layout differences within the 32-byte + * sub-tiles, we must expand the rectangle so the X coordinates of its + * edges are multiples of 8 (the W sub-tile width), and its Y + * coordinates of its edges are multiples of 4 (the W sub-tile height). + * Then we need to scale the X and Y coordinates of the rectangle to + * account for the differences in aspect ratio between the Y and W + * sub-tiles. We need to modify the layer width and height similarly. + * + * A correction needs to be applied when MSAA is in use: since + * INTEL_MSAA_LAYOUT_IMS uses an interleaving pattern whose height is 4, + * we need to align the Y coordinates to multiples of 8, so that when + * they are divided by two they are still multiples of 4. + * + * Note: Since the x/y offset of the surface will be applied using the + * SURFACE_STATE command packet, it will be invisible to the swizzling + * code in the shader; therefore it needs to be in a multiple of the + * 32-byte sub-tile size. Fortunately it is, since the sub-tile is 8 + * pixels wide and 4 pixels high (when viewed as a W-tiled stencil + * buffer), and the miplevel alignment used for stencil buffers is 8 + * pixels horizontally and either 4 or 8 pixels vertically (see + * intel_horizontal_texture_alignment_unit() and + * intel_vertical_texture_alignment_unit()). + * + * Note: Also, since the SURFACE_STATE command packet can only apply + * offsets that are multiples of 4 pixels horizontally and 2 pixels + * vertically, it is important that the offsets will be multiples of + * these sizes after they are converted into Y-tiled coordinates. + * Fortunately they will be, since we know from above that the offsets + * are a multiple of the 32-byte sub-tile size, and in Y-tiled + * coordinates the sub-tile is 16 pixels wide and 2 pixels high. + * + * TODO: what if this makes the coordinates (or the texture size) too + * large? + */ + const unsigned x_align = 8, y_align = params.dst.surf.samples != 0 ? 8 : 4; + params.x0 = ROUND_DOWN_TO(params.x0, x_align) * 2; + params.y0 = ROUND_DOWN_TO(params.y0, y_align) / 2; + params.x1 = ALIGN(params.x1, x_align) * 2; + params.y1 = ALIGN(params.y1, y_align) / 2; + + /* Retile the surface to Y-tiled */ + surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms.dst); + + wm_prog_key.dst_tiled_w = true; + wm_prog_key.use_kill = true; + + if (params.dst.surf.samples > 1) { + /* If the destination surface is a W-tiled multisampled stencil + * buffer that we're mapping as Y tiled, then we need to arrange for + * the WM program to run once per sample rather than once per pixel, + * because the memory layout of related samples doesn't match between + * W and Y tiling. + */ + wm_prog_key.persample_msaa_dispatch = true; + } + } + + if (devinfo->gen < 8 && params.src.surf.tiling == ISL_TILING_W) { + /* On Haswell and earlier, we have to fake W-tiled sources as Y-tiled. + * Broadwell adds support for sampling from stencil. + * + * See the comments above concerning x/y offset alignment for the + * destination surface. + * + * TODO: what if this makes the texture size too large? + */ + surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms.src); + + wm_prog_key.src_tiled_w = true; + } + + /* tex_samples and rt_samples are the sample counts that are set up in + * SURFACE_STATE. + */ + wm_prog_key.tex_samples = params.src.surf.samples; + wm_prog_key.rt_samples = params.dst.surf.samples; + + /* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will + * use to access the source and destination surfaces. 
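
Editor's note: a concrete instance of the rectangle adjustment described above. For a single-sampled W-tiled stencil destination (sub-tile alignment 8x4, so y_align = 4 per the comment), a destination rectangle of (0, 0)-(70, 70) becomes:

   x1 = ALIGN(70, 8) * 2 = 72 * 2 = 144
   y1 = ALIGN(70, 4) / 2 = 72 / 2 =  36

so the rendering pipeline is pointed at a (0, 0)-(144, 36) rectangle in the Y-tiled view of the same memory.
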
+ */ + wm_prog_key.tex_layout = params.src.surf.msaa_layout; + wm_prog_key.rt_layout = params.dst.surf.msaa_layout; + + if (params.src.surf.samples > 0 && params.dst.surf.samples > 1) { + /* We are blitting from a multisample buffer to a multisample buffer, so + * we must preserve samples within a pixel. This means we have to + * arrange for the WM program to run once per sample rather than once + * per pixel. + */ + wm_prog_key.persample_msaa_dispatch = true; + } + + brw_blorp_get_blit_kernel(batch->blorp, ¶ms, &wm_prog_key); + + for (unsigned i = 0; i < 4; i++) { + params.src.view.channel_select[i] = + swizzle_to_scs(GET_SWZ(src_swizzle, i)); + } + + batch->blorp->exec(batch, ¶ms); +} diff --git a/src/intel/blorp/blorp_clear.c b/src/intel/blorp/blorp_clear.c new file mode 100644 index 00000000000..a371dfd31ef --- /dev/null +++ b/src/intel/blorp/blorp_clear.c @@ -0,0 +1,344 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "util/ralloc.h" + +#include "blorp_priv.h" +#include "brw_defines.h" + +#include "compiler/nir/nir_builder.h" + +#define FILE_DEBUG_FLAG DEBUG_BLORP + +struct brw_blorp_const_color_prog_key +{ + bool use_simd16_replicated_data; + bool pad[3]; +}; + +static void +blorp_params_get_clear_kernel(struct blorp_context *blorp, + struct blorp_params *params, + bool use_replicated_data) +{ + struct brw_blorp_const_color_prog_key blorp_key; + memset(&blorp_key, 0, sizeof(blorp_key)); + blorp_key.use_simd16_replicated_data = use_replicated_data; + + if (blorp->lookup_shader(blorp, &blorp_key, sizeof(blorp_key), + ¶ms->wm_prog_kernel, ¶ms->wm_prog_data)) + return; + + void *mem_ctx = ralloc_context(NULL); + + nir_builder b; + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL); + b.shader->info.name = ralloc_strdup(b.shader, "BLORP-clear"); + + nir_variable *v_color = nir_variable_create(b.shader, nir_var_shader_in, + glsl_vec4_type(), "v_color"); + v_color->data.location = VARYING_SLOT_VAR0; + v_color->data.interpolation = INTERP_MODE_FLAT; + + nir_variable *frag_color = nir_variable_create(b.shader, nir_var_shader_out, + glsl_vec4_type(), + "gl_FragColor"); + frag_color->data.location = FRAG_RESULT_COLOR; + + nir_copy_var(&b, frag_color, v_color); + + struct brw_wm_prog_key wm_key; + brw_blorp_init_wm_prog_key(&wm_key); + + struct brw_blorp_prog_data prog_data; + unsigned program_size; + const unsigned *program = + brw_blorp_compile_nir_shader(blorp, b.shader, &wm_key, use_replicated_data, + &prog_data, &program_size); + + blorp->upload_shader(blorp, &blorp_key, sizeof(blorp_key), + program, program_size, + &prog_data, sizeof(prog_data), + ¶ms->wm_prog_kernel, ¶ms->wm_prog_data); + + ralloc_free(mem_ctx); +} + +/* The x0, y0, x1, and y1 parameters must already be populated with the render + * area of the framebuffer to be cleared. + */ +static void +get_fast_clear_rect(const struct isl_device *dev, + const struct isl_surf *aux_surf, + unsigned *x0, unsigned *y0, + unsigned *x1, unsigned *y1) +{ + unsigned int x_align, y_align; + unsigned int x_scaledown, y_scaledown; + + /* Only single sampled surfaces need to (and actually can) be resolved. */ + if (aux_surf->usage == ISL_SURF_USAGE_CCS_BIT) { + /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render + * Target(s)", beneath the "Fast Color Clear" bullet (p327): + * + * Clear pass must have a clear rectangle that must follow + * alignment rules in terms of pixels and lines as shown in the + * table below. Further, the clear-rectangle height and width + * must be multiple of the following dimensions. If the height + * and width of the render target being cleared do not meet these + * requirements, an MCS buffer can be created such that it + * follows the requirement and covers the RT. + * + * The alignment size in the table that follows is related to the + * alignment size that is baked into the CCS surface format but with X + * alignment multiplied by 16 and Y alignment multiplied by 32. + */ + x_align = isl_format_get_layout(aux_surf->format)->bw; + y_align = isl_format_get_layout(aux_surf->format)->bh; + + x_align *= 16; + + /* SKL+ line alignment requirement for Y-tiled are half those of the prior + * generations. 
+ */ + if (dev->info->gen >= 9) + y_align *= 16; + else + y_align *= 32; + + /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render + * Target(s)", beneath the "Fast Color Clear" bullet (p327): + * + * In order to optimize the performance MCS buffer (when bound to + * 1X RT) clear similarly to MCS buffer clear for MSRT case, + * clear rect is required to be scaled by the following factors + * in the horizontal and vertical directions: + * + * The X and Y scale down factors in the table that follows are each + * equal to half the alignment value computed above. + */ + x_scaledown = x_align / 2; + y_scaledown = y_align / 2; + + /* From BSpec: 3D-Media-GPGPU Engine > 3D Pipeline > Pixel > Pixel + * Backend > MCS Buffer for Render Target(s) [DevIVB+] > Table "Color + * Clear of Non-MultiSampled Render Target Restrictions": + * + * Clear rectangle must be aligned to two times the number of + * pixels in the table shown below due to 16x16 hashing across the + * slice. + */ + x_align *= 2; + y_align *= 2; + } else { + assert(aux_surf->usage == ISL_SURF_USAGE_MCS_BIT); + + /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render + * Target(s)", beneath the "MSAA Compression" bullet (p326): + * + * Clear pass for this case requires that scaled down primitive + * is sent down with upper left co-ordinate to coincide with + * actual rectangle being cleared. For MSAA, clear rectangle’s + * height and width need to as show in the following table in + * terms of (width,height) of the RT. + * + * MSAA Width of Clear Rect Height of Clear Rect + * 2X Ceil(1/8*width) Ceil(1/2*height) + * 4X Ceil(1/8*width) Ceil(1/2*height) + * 8X Ceil(1/2*width) Ceil(1/2*height) + * 16X width Ceil(1/2*height) + * + * The text "with upper left co-ordinate to coincide with actual + * rectangle being cleared" is a little confusing--it seems to imply + * that to clear a rectangle from (x,y) to (x+w,y+h), one needs to + * feed the pipeline using the rectangle (x,y) to + * (x+Ceil(w/N),y+Ceil(h/2)), where N is either 2 or 8 depending on + * the number of samples. Experiments indicate that this is not + * quite correct; actually, what the hardware appears to do is to + * align whatever rectangle is sent down the pipeline to the nearest + * multiple of 2x2 blocks, and then scale it up by a factor of N + * horizontally and 2 vertically. So the resulting alignment is 4 + * vertically and either 4 or 16 horizontally, and the scaledown + * factor is 2 vertically and either 2 or 8 horizontally. 
+ */ + switch (aux_surf->format) { + case ISL_FORMAT_MCS_2X: + case ISL_FORMAT_MCS_4X: + x_scaledown = 8; + break; + case ISL_FORMAT_MCS_8X: + x_scaledown = 2; + break; + case ISL_FORMAT_MCS_16X: + x_scaledown = 1; + break; + default: + unreachable("Unexpected MCS format for fast clear"); + } + y_scaledown = 2; + x_align = x_scaledown * 2; + y_align = y_scaledown * 2; + } + + *x0 = ROUND_DOWN_TO(*x0, x_align) / x_scaledown; + *y0 = ROUND_DOWN_TO(*y0, y_align) / y_scaledown; + *x1 = ALIGN(*x1, x_align) / x_scaledown; + *y1 = ALIGN(*y1, y_align) / y_scaledown; +} + +void +blorp_fast_clear(struct blorp_batch *batch, + const struct blorp_surf *surf, + uint32_t level, uint32_t layer, + uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1) +{ + struct blorp_params params; + blorp_params_init(¶ms); + + params.x0 = x0; + params.y0 = y0; + params.x1 = x1; + params.y1 = y1; + + memset(¶ms.wm_inputs, 0xff, 4*sizeof(float)); + params.fast_clear_op = BLORP_FAST_CLEAR_OP_CLEAR; + + get_fast_clear_rect(batch->blorp->isl_dev, surf->aux_surf, + ¶ms.x0, ¶ms.y0, ¶ms.x1, ¶ms.y1); + + blorp_params_get_clear_kernel(batch->blorp, ¶ms, true); + + brw_blorp_surface_info_init(batch->blorp, ¶ms.dst, surf, level, layer, + surf->surf->format, true); + + batch->blorp->exec(batch, ¶ms); +} + + +void +blorp_clear(struct blorp_batch *batch, + const struct blorp_surf *surf, + uint32_t level, uint32_t layer, + uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1, + enum isl_format format, union isl_color_value clear_color, + bool color_write_disable[4]) +{ + struct blorp_params params; + blorp_params_init(¶ms); + + params.x0 = x0; + params.y0 = y0; + params.x1 = x1; + params.y1 = y1; + + memcpy(¶ms.wm_inputs, clear_color.f32, sizeof(float) * 4); + + bool use_simd16_replicated_data = true; + + /* From the SNB PRM (Vol4_Part1): + * + * "Replicated data (Message Type = 111) is only supported when + * accessing tiled memory. Using this Message Type to access linear + * (untiled) memory is UNDEFINED." + */ + if (surf->surf->tiling == ISL_TILING_LINEAR) + use_simd16_replicated_data = false; + + /* Constant color writes ignore everyting in blend and color calculator + * state. This is not documented. + */ + for (unsigned i = 0; i < 4; i++) { + params.color_write_disable[i] = color_write_disable[i]; + if (color_write_disable[i]) + use_simd16_replicated_data = false; + } + + blorp_params_get_clear_kernel(batch->blorp, ¶ms, + use_simd16_replicated_data); + + brw_blorp_surface_info_init(batch->blorp, ¶ms.dst, surf, level, layer, + format, true); + + batch->blorp->exec(batch, ¶ms); +} + +void +blorp_ccs_resolve(struct blorp_batch *batch, + struct blorp_surf *surf, enum isl_format format) +{ + struct blorp_params params; + blorp_params_init(¶ms); + + brw_blorp_surface_info_init(batch->blorp, ¶ms.dst, surf, + 0 /* level */, 0 /* layer */, format, true); + + /* From the Ivy Bridge PRM, Vol2 Part1 11.9 "Render Target Resolve": + * + * A rectangle primitive must be scaled down by the following factors + * with respect to render target being resolved. + * + * The scaledown factors in the table that follows are related to the block + * size of the CCS format. For IVB and HSW, we divide by two, for BDW we + * multiply by 8 and 16. On Sky Lake, we multiply by 8. 
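
Editor's worked example for the MCS branch of get_fast_clear_rect() above, with made-up numbers: an 8x MSAA surface (ISL_FORMAT_MCS_8X gives x_scaledown = y_scaledown = 2, hence x_align = y_align = 4) and a render area of (10, 10)-(100, 100):

   x0 = ROUND_DOWN_TO(10, 4) / 2 =   8 / 2 =  4
   y0 = ROUND_DOWN_TO(10, 4) / 2 =   8 / 2 =  4
   x1 = ALIGN(100, 4)        / 2 = 100 / 2 = 50
   y1 = ALIGN(100, 4)        / 2 = 100 / 2 = 50

The hardware scales the (4, 4)-(50, 50) rectangle back up by 2x2, so the region actually cleared, (8, 8)-(100, 100), covers the requested render area.
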
+ */ + const struct isl_format_layout *aux_fmtl = + isl_format_get_layout(params.dst.aux_surf.format); + assert(aux_fmtl->txc == ISL_TXC_CCS); + + unsigned x_scaledown, y_scaledown; + if (ISL_DEV_GEN(batch->blorp->isl_dev) >= 9) { + x_scaledown = aux_fmtl->bw * 8; + y_scaledown = aux_fmtl->bh * 8; + } else if (ISL_DEV_GEN(batch->blorp->isl_dev) >= 8) { + x_scaledown = aux_fmtl->bw * 8; + y_scaledown = aux_fmtl->bh * 16; + } else { + x_scaledown = aux_fmtl->bw / 2; + y_scaledown = aux_fmtl->bh / 2; + } + params.x0 = params.y0 = 0; + params.x1 = params.dst.aux_surf.logical_level0_px.width; + params.y1 = params.dst.aux_surf.logical_level0_px.height; + params.x1 = ALIGN(params.x1, x_scaledown) / x_scaledown; + params.y1 = ALIGN(params.y1, y_scaledown) / y_scaledown; + + if (batch->blorp->isl_dev->info->gen >= 9) { + if (params.dst.aux_usage == ISL_AUX_USAGE_CCS_E) + params.fast_clear_op = BLORP_FAST_CLEAR_OP_RESOLVE_FULL; + else + params.fast_clear_op = BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL; + } else { + /* Broadwell and earlier do not have a partial resolve */ + params.fast_clear_op = BLORP_FAST_CLEAR_OP_RESOLVE_FULL; + } + + /* Note: there is no need to initialize push constants because it doesn't + * matter what data gets dispatched to the render target. However, we must + * ensure that the fragment shader delivers the data using the "replicated + * color" message. + */ + + blorp_params_get_clear_kernel(batch->blorp, ¶ms, true); + + batch->blorp->exec(batch, ¶ms); +} diff --git a/src/intel/blorp/blorp_genX_exec.h b/src/intel/blorp/blorp_genX_exec.h new file mode 100644 index 00000000000..f44076e129f --- /dev/null +++ b/src/intel/blorp/blorp_genX_exec.h @@ -0,0 +1,1176 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "blorp_priv.h" +#include "brw_device_info.h" +#include "intel_aub.h" + +/** + * This file provides the blorp pipeline setup and execution functionality. + * It defines the following function: + * + * static void + * blorp_exec(struct blorp_context *blorp, void *batch_data, + * const struct blorp_params *params); + * + * It is the job of whoever includes this header to wrap this in something + * to get an externally visible symbol. + * + * In order for the blorp_exec function to work, the driver must provide + * implementations of the following static helper functions. 
+ */ + +static void * +blorp_emit_dwords(struct blorp_batch *batch, unsigned n); + +static uint64_t +blorp_emit_reloc(struct blorp_batch *batch, + void *location, struct blorp_address address, uint32_t delta); + +static void * +blorp_alloc_dynamic_state(struct blorp_batch *batch, + enum aub_state_struct_type type, + uint32_t size, + uint32_t alignment, + uint32_t *offset); +static void * +blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size, + struct blorp_address *addr); + +static void +blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries, + unsigned state_size, unsigned state_alignment, + uint32_t *bt_offset, uint32_t **bt_map, + void **surface_maps); +static void +blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset, + struct blorp_address address, uint32_t delta); + +static void +blorp_emit_urb_config(struct blorp_batch *batch, unsigned vs_entry_size); +static void +blorp_emit_3dstate_multisample(struct blorp_batch *batch, unsigned samples); + +/***** BEGIN blorp_exec implementation ******/ + +#include "genxml/gen_macros.h" + +#define __gen_address_type struct blorp_address +#define __gen_user_data struct blorp_batch + +static uint64_t +__gen_combine_address(struct blorp_batch *batch, void *location, + struct blorp_address address, uint32_t delta) +{ + if (address.buffer == NULL) { + return address.offset + delta; + } else { + return blorp_emit_reloc(batch, location, address, delta); + } +} + +#include "genxml/genX_pack.h" + +#define _blorp_cmd_length(cmd) cmd ## _length +#define _blorp_cmd_length_bias(cmd) cmd ## _length_bias +#define _blorp_cmd_header(cmd) cmd ## _header +#define _blorp_cmd_pack(cmd) cmd ## _pack + +#define blorp_emit(batch, cmd, name) \ + for (struct cmd name = { _blorp_cmd_header(cmd) }, \ + *_dst = blorp_emit_dwords(batch, _blorp_cmd_length(cmd)); \ + __builtin_expect(_dst != NULL, 1); \ + _blorp_cmd_pack(cmd)(batch, (void *)_dst, &name), \ + _dst = NULL) + +#define blorp_emitn(batch, cmd, n) ({ \ + uint32_t *_dw = blorp_emit_dwords(batch, n); \ + struct cmd template = { \ + _blorp_cmd_header(cmd), \ + .DWordLength = n - _blorp_cmd_length_bias(cmd), \ + }; \ + _blorp_cmd_pack(cmd)(batch, _dw, &template); \ + _dw + 1; /* Array starts at dw[1] */ \ + }) + +/* Once vertex fetcher has written full VUE entries with complete + * header the space requirement is as follows per vertex (in bytes): + * + * Header Position Program constants + * +--------+------------+-------------------+ + * | 16 | 16 | n x 16 | + * +--------+------------+-------------------+ + * + * where 'n' stands for number of varying inputs expressed as vec4s. + * + * The URB size is in turn expressed in 64 bytes (512 bits). + */ +static inline unsigned +gen7_blorp_get_vs_entry_size(const struct blorp_params *params) +{ + const unsigned num_varyings = + params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0; + const unsigned total_needed = 16 + 16 + num_varyings * 16; + + return DIV_ROUND_UP(total_needed, 64); +} + +/* 3DSTATE_URB + * 3DSTATE_URB_VS + * 3DSTATE_URB_HS + * 3DSTATE_URB_DS + * 3DSTATE_URB_GS + * + * Assign the entire URB to the VS. Even though the VS disabled, URB space + * is still needed because the clipper loads the VUE's from the URB. From + * the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE, + * Dword 1.15:0 "VS Number of URB Entries": + * This field is always used (even if VS Function Enable is DISABLED). 
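
An aside on the blorp_emit() macro defined above: the named struct is filled in inside the braced block and packed into freshly reserved batch dwords when the block exits. A representative use looks like the snippet below; the real drawing-rectangle emission lives later in this header, so the exact command and field values here are only an illustration.

   blorp_emit(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
      rect.ClippedDrawingRectangleXMax = params->x1 - 1;
      rect.ClippedDrawingRectangleYMax = params->y1 - 1;
   }
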
+ * + * The warning below appears in the PRM (Section 3DSTATE_URB), but we can + * safely ignore it because this batch contains only one draw call. + * Because of URB corruption caused by allocating a previous GS unit + * URB entry to the VS unit, software is required to send a “GS NULL + * Fence” (Send URB fence with VS URB size == 1 and GS URB size == 0) + * plus a dummy DRAW call before any case where VS will be taking over + * GS URB space. + * + * If the 3DSTATE_URB_VS is emitted, than the others must be also. + * From the Ivybridge PRM, Volume 2 Part 1, section 1.7.1 3DSTATE_URB_VS: + * + * 3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be + * programmed in order for the programming of this state to be + * valid. + */ +static void +emit_urb_config(struct blorp_batch *batch, + const struct blorp_params *params) +{ + blorp_emit_urb_config(batch, gen7_blorp_get_vs_entry_size(params)); +} + +static void +blorp_emit_vertex_data(struct blorp_batch *batch, + const struct blorp_params *params, + struct blorp_address *addr, + uint32_t *size) +{ + const float vertices[] = { + /* v0 */ (float)params->x0, (float)params->y1, + /* v1 */ (float)params->x1, (float)params->y1, + /* v2 */ (float)params->x0, (float)params->y0, + }; + + void *data = blorp_alloc_vertex_buffer(batch, sizeof(vertices), addr); + memcpy(data, vertices, sizeof(vertices)); + *size = sizeof(vertices); +} + +static void +blorp_emit_input_varying_data(struct blorp_batch *batch, + const struct blorp_params *params, + struct blorp_address *addr, + uint32_t *size) +{ + const unsigned vec4_size_in_bytes = 4 * sizeof(float); + const unsigned max_num_varyings = + DIV_ROUND_UP(sizeof(params->wm_inputs), vec4_size_in_bytes); + const unsigned num_varyings = params->wm_prog_data->num_varying_inputs; + + *size = num_varyings * vec4_size_in_bytes; + + const float *const inputs_src = (const float *)¶ms->wm_inputs; + float *inputs = blorp_alloc_vertex_buffer(batch, *size, addr); + + /* Walk over the attribute slots, determine if the attribute is used by + * the program and when necessary copy the values from the input storage to + * the vertex data buffer. 
+ */ + for (unsigned i = 0; i < max_num_varyings; i++) { + const gl_varying_slot attr = VARYING_SLOT_VAR0 + i; + + if (!(params->wm_prog_data->inputs_read & (1ull << attr))) + continue; + + memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes); + + inputs += 4; + } +} + +static void +blorp_emit_vertex_buffers(struct blorp_batch *batch, + const struct blorp_params *params) +{ + struct GENX(VERTEX_BUFFER_STATE) vb[2]; + memset(vb, 0, sizeof(vb)); + + unsigned num_buffers = 1; + + uint32_t size; + blorp_emit_vertex_data(batch, params, &vb[0].BufferStartingAddress, &size); + vb[0].VertexBufferIndex = 0; + vb[0].BufferPitch = 2 * sizeof(float); + vb[0].VertexBufferMOCS = batch->blorp->mocs.vb; +#if GEN_GEN >= 7 + vb[0].AddressModifyEnable = true; +#endif +#if GEN_GEN >= 8 + vb[0].BufferSize = size; +#else + vb[0].BufferAccessType = VERTEXDATA; + vb[0].EndAddress = vb[0].BufferStartingAddress; + vb[0].EndAddress.offset += size - 1; +#endif + + if (params->wm_prog_data && params->wm_prog_data->num_varying_inputs) { + blorp_emit_input_varying_data(batch, params, + &vb[1].BufferStartingAddress, &size); + vb[1].VertexBufferIndex = 1; + vb[1].BufferPitch = 0; + vb[1].VertexBufferMOCS = batch->blorp->mocs.vb; +#if GEN_GEN >= 7 + vb[1].AddressModifyEnable = true; +#endif +#if GEN_GEN >= 8 + vb[1].BufferSize = size; +#else + vb[1].BufferAccessType = INSTANCEDATA; + vb[1].EndAddress = vb[1].BufferStartingAddress; + vb[1].EndAddress.offset += size - 1; +#endif + num_buffers++; + } + + const unsigned num_dwords = + 1 + GENX(VERTEX_BUFFER_STATE_length) * num_buffers; + uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords); + + for (unsigned i = 0; i < num_buffers; i++) { + GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]); + dw += GENX(VERTEX_BUFFER_STATE_length); + } +} + +static void +blorp_emit_vertex_elements(struct blorp_batch *batch, + const struct blorp_params *params) +{ + const unsigned num_varyings = + params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0; + const unsigned num_elements = 2 + num_varyings; + + struct GENX(VERTEX_ELEMENT_STATE) ve[num_elements]; + memset(ve, 0, num_elements * sizeof(*ve)); + + /* Setup VBO for the rectangle primitive.. + * + * A rectangle primitive (3DPRIM_RECTLIST) consists of only three + * vertices. The vertices reside in screen space with DirectX + * coordinates (that is, (0, 0) is the upper left corner). + * + * v2 ------ implied + * | | + * | | + * v0 ----- v1 + * + * Since the VS is disabled, the clipper loads each VUE directly from + * the URB. This is controlled by the 3DSTATE_VERTEX_BUFFERS and + * 3DSTATE_VERTEX_ELEMENTS packets below. The VUE contents are as follows: + * dw0: Reserved, MBZ. + * dw1: Render Target Array Index. The HiZ op does not use indexed + * vertices, so set the dword to 0. + * dw2: Viewport Index. The HiZ op disables viewport mapping and + * scissoring, so set the dword to 0. + * dw3: Point Width: The HiZ op does not emit the POINTLIST primitive, + * so set the dword to 0. + * dw4: Vertex Position X. + * dw5: Vertex Position Y. + * dw6: Vertex Position Z. + * dw7: Vertex Position W. + * + * dw8: Flat vertex input 0 + * dw9: Flat vertex input 1 + * ... + * dwn: Flat vertex input n - 8 + * + * For details, see the Sandybridge PRM, Volume 2, Part 1, Section 1.5.1 + * "Vertex URB Entry (VUE) Formats". + * + * Only vertex position X and Y are going to be variable, Z is fixed to + * zero and W to one. Header words dw0-3 are all zero. There is no need to + * include the fixed values in the vertex buffer. 
Vertex fetcher can be + * instructed to fill vertex elements with constant values of one and zero + * instead of reading them from the buffer. + * Flat inputs are program constants that are not interpolated. Moreover + * their values will be the same between vertices. + * + * See the vertex element setup below. + */ + ve[0].VertexBufferIndex = 0; + ve[0].Valid = true; + ve[0].SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT; + ve[0].SourceElementOffset = 0; + ve[0].Component0Control = VFCOMP_STORE_0; + ve[0].Component1Control = VFCOMP_STORE_0; + ve[0].Component2Control = VFCOMP_STORE_0; + ve[0].Component3Control = VFCOMP_STORE_0; + + ve[1].VertexBufferIndex = 0; + ve[1].Valid = true; + ve[1].SourceElementFormat = ISL_FORMAT_R32G32_FLOAT; + ve[1].SourceElementOffset = 0; + ve[1].Component0Control = VFCOMP_STORE_SRC; + ve[1].Component1Control = VFCOMP_STORE_SRC; + ve[1].Component2Control = VFCOMP_STORE_0; + ve[1].Component3Control = VFCOMP_STORE_1_FP; + + for (unsigned i = 0; i < num_varyings; ++i) { + ve[i + 2].VertexBufferIndex = 1; + ve[i + 2].Valid = true; + ve[i + 2].SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT; + ve[i + 2].SourceElementOffset = i * 4 * sizeof(float); + ve[i + 2].Component0Control = VFCOMP_STORE_SRC; + ve[i + 2].Component1Control = VFCOMP_STORE_SRC; + ve[i + 2].Component2Control = VFCOMP_STORE_SRC; + ve[i + 2].Component3Control = VFCOMP_STORE_SRC; + } + + const unsigned num_dwords = + 1 + GENX(VERTEX_ELEMENT_STATE_length) * num_elements; + uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_ELEMENTS), num_dwords); + + for (unsigned i = 0; i < num_elements; i++) { + GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw, &ve[i]); + dw += GENX(VERTEX_ELEMENT_STATE_length); + } + +#if GEN_GEN >= 8 + blorp_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs); + + for (unsigned i = 0; i < num_elements; i++) { + blorp_emit(batch, GENX(3DSTATE_VF_INSTANCING), vf) { + vf.VertexElementIndex = i; + vf.InstancingEnable = false; + } + } + + blorp_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) { + topo.PrimitiveTopologyType = _3DPRIM_RECTLIST; + } +#endif +} + +static void +blorp_emit_sf_config(struct blorp_batch *batch, + const struct blorp_params *params) +{ + const struct brw_blorp_prog_data *prog_data = params->wm_prog_data; + + /* 3DSTATE_SF + * + * Disable ViewportTransformEnable (dw2.1) + * + * From the SandyBridge PRM, Volume 2, Part 1, Section 1.3, "3D + * Primitives Overview": + * RECTLIST: Viewport Mapping must be DISABLED (as is typical with the + * use of screen- space coordinates). + * + * A solid rectangle must be rendered, so set FrontFaceFillMode (dw2.4:3) + * and BackFaceFillMode (dw2.5:6) to SOLID(0). + * + * From the Sandy Bridge PRM, Volume 2, Part 1, Section + * 6.4.1.1 3DSTATE_SF, Field FrontFaceFillMode: + * SOLID: Any triangle or rectangle object found to be front-facing + * is rendered as a solid object. This setting is required when + * (rendering rectangle (RECTLIST) objects. 
+ */ + +#if GEN_GEN >= 8 + + blorp_emit(batch, GENX(3DSTATE_SF), sf); + + blorp_emit(batch, GENX(3DSTATE_RASTER), raster) { + raster.CullMode = CULLMODE_NONE; + } + + blorp_emit(batch, GENX(3DSTATE_SBE), sbe) { + sbe.VertexURBEntryReadOffset = 1; + sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs; + sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data); + sbe.ForceVertexURBEntryReadLength = true; + sbe.ForceVertexURBEntryReadOffset = true; + sbe.ConstantInterpolationEnable = prog_data->flat_inputs; + +#if GEN_GEN >= 9 + for (unsigned i = 0; i < 32; i++) + sbe.AttributeActiveComponentFormat[i] = ACF_XYZW; +#endif + } + +#elif GEN_GEN >= 7 + + blorp_emit(batch, GENX(3DSTATE_SF), sf) { + sf.FrontFaceFillMode = FILL_MODE_SOLID; + sf.BackFaceFillMode = FILL_MODE_SOLID; + + sf.MultisampleRasterizationMode = params->dst.surf.samples > 1 ? + MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL; + +#if GEN_GEN == 7 + sf.DepthBufferSurfaceFormat = params->depth_format; +#endif + } + + blorp_emit(batch, GENX(3DSTATE_SBE), sbe) { + sbe.VertexURBEntryReadOffset = 1; + if (prog_data) { + sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs; + sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data); + sbe.ConstantInterpolationEnable = prog_data->flat_inputs; + } else { + sbe.NumberofSFOutputAttributes = 0; + sbe.VertexURBEntryReadLength = 1; + } + } + +#else /* GEN_GEN <= 6 */ + + blorp_emit(batch, GENX(3DSTATE_SF), sf) { + sf.FrontFaceFillMode = FILL_MODE_SOLID; + sf.BackFaceFillMode = FILL_MODE_SOLID; + + sf.MultisampleRasterizationMode = params->dst.surf.samples > 1 ? + MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL; + + sf.VertexURBEntryReadOffset = 1; + if (prog_data) { + sf.NumberofSFOutputAttributes = prog_data->num_varying_inputs; + sf.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data); + sf.ConstantInterpolationEnable = prog_data->flat_inputs; + } else { + sf.NumberofSFOutputAttributes = 0; + sf.VertexURBEntryReadLength = 1; + } + } + +#endif /* GEN_GEN */ +} + +static void +blorp_emit_ps_config(struct blorp_batch *batch, + const struct blorp_params *params) +{ + const struct brw_blorp_prog_data *prog_data = params->wm_prog_data; + + /* Even when thread dispatch is disabled, max threads (dw5.25:31) must be + * nonzero to prevent the GPU from hanging. While the documentation doesn't + * mention this explicitly, it notes that the valid range for the field is + * [1,39] = [2,40] threads, which excludes zero. + * + * To be safe (and to minimize extraneous code) we go ahead and fully + * configure the WM state whether or not there is a WM program. + */ + +#if GEN_GEN >= 8 + + blorp_emit(batch, GENX(3DSTATE_WM), wm); + + blorp_emit(batch, GENX(3DSTATE_PS), ps) { + if (params->src.addr.buffer) { + ps.SamplerCount = 1; /* Up to 4 samplers */ + ps.BindingTableEntryCount = 2; + } else { + ps.BindingTableEntryCount = 1; + } + + ps.DispatchGRFStartRegisterForConstantSetupData0 = + prog_data->first_curbe_grf_0; + ps.DispatchGRFStartRegisterForConstantSetupData2 = + prog_data->first_curbe_grf_2; + + ps._8PixelDispatchEnable = prog_data->dispatch_8; + ps._16PixelDispatchEnable = prog_data->dispatch_16; + + ps.KernelStartPointer0 = params->wm_prog_kernel; + ps.KernelStartPointer2 = + params->wm_prog_kernel + prog_data->ksp_offset_2; + + /* 3DSTATE_PS expects the number of threads per PSD, which is always 64; + * it implicitly scales for different GT levels (which have some # of + * PSDs). + * + * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1. 
+ */ + if (GEN_GEN >= 9) + ps.MaximumNumberofThreadsPerPSD = 64 - 1; + else + ps.MaximumNumberofThreadsPerPSD = 64 - 2; + + switch (params->fast_clear_op) { + case BLORP_FAST_CLEAR_OP_NONE: + break; +#if GEN_GEN >= 9 + case BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL: + ps.RenderTargetResolveType = RESOLVE_PARTIAL; + break; + case BLORP_FAST_CLEAR_OP_RESOLVE_FULL: + ps.RenderTargetResolveType = RESOLVE_FULL; + break; +#else + case BLORP_FAST_CLEAR_OP_RESOLVE_FULL: + ps.RenderTargetResolveEnable = true; + break; +#endif + case BLORP_FAST_CLEAR_OP_CLEAR: + ps.RenderTargetFastClearEnable = true; + break; + default: + unreachable("Invalid fast clear op"); + } + } + + blorp_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) { + psx.PixelShaderValid = true; + + if (params->src.addr.buffer) + psx.PixelShaderKillsPixel = true; + + psx.AttributeEnable = prog_data->num_varying_inputs > 0; + + if (prog_data && prog_data->persample_msaa_dispatch) + psx.PixelShaderIsPerSample = true; + } + +#elif GEN_GEN >= 7 + + blorp_emit(batch, GENX(3DSTATE_WM), wm) { + switch (params->hiz_op) { + case BLORP_HIZ_OP_DEPTH_CLEAR: + wm.DepthBufferClear = true; + break; + case BLORP_HIZ_OP_DEPTH_RESOLVE: + wm.DepthBufferResolveEnable = true; + break; + case BLORP_HIZ_OP_HIZ_RESOLVE: + wm.HierarchicalDepthBufferResolveEnable = true; + break; + case BLORP_HIZ_OP_NONE: + break; + default: + unreachable("not reached"); + } + + if (prog_data) + wm.ThreadDispatchEnable = true; + + if (params->src.addr.buffer) + wm.PixelShaderKillPixel = true; + + if (params->dst.surf.samples > 1) { + wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN; + wm.MultisampleDispatchMode = + (prog_data && prog_data->persample_msaa_dispatch) ? + MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL; + } else { + wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL; + wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; + } + } + + blorp_emit(batch, GENX(3DSTATE_PS), ps) { + ps.MaximumNumberofThreads = + batch->blorp->isl_dev->info->max_wm_threads - 1; + +#if GEN_IS_HASWELL + ps.SampleMask = 1; +#endif + + if (prog_data) { + ps.DispatchGRFStartRegisterforConstantSetupData0 = + prog_data->first_curbe_grf_0; + ps.DispatchGRFStartRegisterforConstantSetupData2 = + prog_data->first_curbe_grf_2; + + ps.KernelStartPointer0 = params->wm_prog_kernel; + ps.KernelStartPointer2 = + params->wm_prog_kernel + prog_data->ksp_offset_2; + + ps._8PixelDispatchEnable = prog_data->dispatch_8; + ps._16PixelDispatchEnable = prog_data->dispatch_16; + + ps.AttributeEnable = prog_data->num_varying_inputs > 0; + } else { + /* Gen7 hardware gets angry if we don't enable at least one dispatch + * mode, so just enable 16-pixel dispatch if we don't have a program. 
+ */ + ps._16PixelDispatchEnable = true; + } + + if (params->src.addr.buffer) + ps.SamplerCount = 1; /* Up to 4 samplers */ + + switch (params->fast_clear_op) { + case BLORP_FAST_CLEAR_OP_NONE: + break; + case BLORP_FAST_CLEAR_OP_RESOLVE_FULL: + ps.RenderTargetResolveEnable = true; + break; + case BLORP_FAST_CLEAR_OP_CLEAR: + ps.RenderTargetFastClearEnable = true; + break; + default: + unreachable("Invalid fast clear op"); + } + } + +#else /* GEN_GEN <= 6 */ + + blorp_emit(batch, GENX(3DSTATE_WM), wm) { + wm.MaximumNumberofThreads = + batch->blorp->isl_dev->info->max_wm_threads - 1; + + switch (params->hiz_op) { + case BLORP_HIZ_OP_DEPTH_CLEAR: + wm.DepthBufferClear = true; + break; + case BLORP_HIZ_OP_DEPTH_RESOLVE: + wm.DepthBufferResolveEnable = true; + break; + case BLORP_HIZ_OP_HIZ_RESOLVE: + wm.HierarchicalDepthBufferResolveEnable = true; + break; + case BLORP_HIZ_OP_NONE: + break; + default: + unreachable("not reached"); + } + + if (prog_data) { + wm.ThreadDispatchEnable = true; + + wm.DispatchGRFStartRegisterforConstantSetupData0 = + prog_data->first_curbe_grf_0; + wm.DispatchGRFStartRegisterforConstantSetupData2 = + prog_data->first_curbe_grf_2; + + wm.KernelStartPointer0 = params->wm_prog_kernel; + wm.KernelStartPointer2 = + params->wm_prog_kernel + prog_data->ksp_offset_2; + + wm._8PixelDispatchEnable = prog_data->dispatch_8; + wm._16PixelDispatchEnable = prog_data->dispatch_16; + + wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs; + } + + if (params->src.addr.buffer) { + wm.SamplerCount = 1; /* Up to 4 samplers */ + wm.PixelShaderKillPixel = true; /* TODO: temporarily smash on */ + } + + if (params->dst.surf.samples > 1) { + wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN; + wm.MultisampleDispatchMode = + (prog_data && prog_data->persample_msaa_dispatch) ? 
+ MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL; + } else { + wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL; + wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; + } + } + +#endif /* GEN_GEN */ +} + + +static void +blorp_emit_depth_stencil_config(struct blorp_batch *batch, + const struct blorp_params *params) +{ +#if GEN_GEN >= 7 + const uint32_t mocs = 1; /* GEN7_MOCS_L3 */ +#else + const uint32_t mocs = 0; +#endif + + blorp_emit(batch, GENX(3DSTATE_DEPTH_BUFFER), db) { + switch (params->depth.surf.dim) { + case ISL_SURF_DIM_1D: + db.SurfaceType = SURFTYPE_1D; + break; + case ISL_SURF_DIM_2D: + db.SurfaceType = SURFTYPE_2D; + break; + case ISL_SURF_DIM_3D: + db.SurfaceType = SURFTYPE_3D; + break; + } + + db.SurfaceFormat = params->depth_format; + +#if GEN_GEN >= 7 + db.DepthWriteEnable = true; +#endif + +#if GEN_GEN <= 6 + db.TiledSurface = true; + db.TileWalk = TILEWALK_YMAJOR; + db.MIPMapLayoutMode = MIPLAYOUT_BELOW; + db.SeparateStencilBufferEnable = true; +#endif + + db.HierarchicalDepthBufferEnable = true; + + db.Width = params->depth.surf.logical_level0_px.width - 1; + db.Height = params->depth.surf.logical_level0_px.height - 1; + db.RenderTargetViewExtent = db.Depth = + MAX2(params->depth.surf.logical_level0_px.depth, + params->depth.surf.logical_level0_px.array_len) - 1; + + db.LOD = params->depth.view.base_level; + db.MinimumArrayElement = params->depth.view.base_array_layer; + + db.SurfacePitch = params->depth.surf.row_pitch - 1; + db.SurfaceBaseAddress = params->depth.addr; + db.DepthBufferMOCS = mocs; + } + + blorp_emit(batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hiz) { + hiz.SurfacePitch = params->depth.aux_surf.row_pitch - 1; + hiz.SurfaceBaseAddress = params->depth.aux_addr; + hiz.HierarchicalDepthBufferMOCS = mocs; + } + + blorp_emit(batch, GENX(3DSTATE_STENCIL_BUFFER), sb); +} + +static uint32_t +blorp_emit_blend_state(struct blorp_batch *batch, + const struct blorp_params *params) +{ + struct GENX(BLEND_STATE) blend; + memset(&blend, 0, sizeof(blend)); + + for (unsigned i = 0; i < params->num_draw_buffers; ++i) { + blend.Entry[i].PreBlendColorClampEnable = true; + blend.Entry[i].PostBlendColorClampEnable = true; + blend.Entry[i].ColorClampRange = COLORCLAMP_RTFORMAT; + + blend.Entry[i].WriteDisableRed = params->color_write_disable[0]; + blend.Entry[i].WriteDisableGreen = params->color_write_disable[1]; + blend.Entry[i].WriteDisableBlue = params->color_write_disable[2]; + blend.Entry[i].WriteDisableAlpha = params->color_write_disable[3]; + } + + uint32_t offset; + void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_BLEND_STATE, + GENX(BLEND_STATE_length) * 4, + 64, &offset); + GENX(BLEND_STATE_pack)(NULL, state, &blend); + +#if GEN_GEN >= 7 + blorp_emit(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), sp) { + sp.BlendStatePointer = offset; +#if GEN_GEN >= 8 + sp.BlendStatePointerValid = true; +#endif + } +#endif + +#if GEN_GEN >= 8 + blorp_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) { + ps_blend.HasWriteableRT = true; + } +#endif + + return offset; +} + +static uint32_t +blorp_emit_color_calc_state(struct blorp_batch *batch, + const struct blorp_params *params) +{ + uint32_t offset; + void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_CC_STATE, + GENX(COLOR_CALC_STATE_length) * 4, + 64, &offset); + memset(state, 0, GENX(COLOR_CALC_STATE_length) * 4); + +#if GEN_GEN >= 7 + blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), sp) { + sp.ColorCalcStatePointer = offset; +#if GEN_GEN >= 8 + sp.ColorCalcStatePointerValid = true; +#endif + } +#endif + + return offset; +} 
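
The state helpers above and below all follow the same pattern: blorp_alloc_dynamic_state(), a GENX(*_pack)() call, then a 3DSTATE_*_POINTERS command. As a sketch of the driver side of that contract, here is one way the hook could be backed by a simple bump allocator. The pool structure and how it hangs off the batch are hypothetical; only the hook's signature is taken from the declarations earlier in this header.

   struct my_state_pool {          /* hypothetical driver allocator */
      char *map;                   /* CPU mapping of the dynamic state buffer */
      uint32_t next;               /* first free byte */
   };

   static void *
   blorp_alloc_dynamic_state(struct blorp_batch *batch,
                             enum aub_state_struct_type type,
                             uint32_t size,
                             uint32_t alignment,
                             uint32_t *offset)
   {
      struct my_state_pool *pool = batch->driver_batch;  /* hypothetical wiring */
      uint32_t start = ALIGN(pool->next, alignment);     /* aligned carve-out */

      pool->next = start + size;
      *offset = start;             /* offset from the dynamic state base address */
      (void)type;                  /* only interesting for aub annotation */
      return pool->map + start;    /* CPU pointer for the pack function to fill */
   }
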
+
+static uint32_t
+blorp_emit_depth_stencil_state(struct blorp_batch *batch,
+                               const struct blorp_params *params)
+{
+#if GEN_GEN >= 8
+
+   /* On gen8+, DEPTH_STENCIL state is simply an instruction */
+   blorp_emit(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds);
+   return 0;
+
+#else /* GEN_GEN <= 7 */
+
+   /* See the following sections of the Sandy Bridge PRM, Volume 1, Part 2:
+    *  - 7.5.3.1 Depth Buffer Clear
+    *  - 7.5.3.2 Depth Buffer Resolve
+    *  - 7.5.3.3 Hierarchical Depth Buffer Resolve
+    */
+   struct GENX(DEPTH_STENCIL_STATE) ds = {
+      .DepthBufferWriteEnable = true,
+   };
+
+   if (params->hiz_op == BLORP_HIZ_OP_DEPTH_RESOLVE) {
+      ds.DepthTestEnable = true;
+      ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
+   }
+
+   uint32_t offset;
+   void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_DEPTH_STENCIL_STATE,
+                                           GENX(DEPTH_STENCIL_STATE_length) * 4,
+                                           64, &offset);
+   GENX(DEPTH_STENCIL_STATE_pack)(NULL, state, &ds);
+
+#if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), sp) {
+      sp.PointertoDEPTH_STENCIL_STATE = offset;
+   }
+#endif
+
+   return offset;
+
+#endif /* GEN_GEN */
+}
+
+struct surface_state_info {
+   unsigned num_dwords;
+   unsigned ss_align; /* Required alignment of RENDER_SURFACE_STATE in bytes */
+   unsigned reloc_dw;
+   unsigned aux_reloc_dw;
+};
+
+static const struct surface_state_info surface_state_infos[] = {
+   [6] = {6,  32, 1,  0},
+   [7] = {8,  32, 1,  6},
+   [8] = {13, 64, 8, 10},
+   [9] = {16, 64, 8, 10},
+};
+
+static void
+blorp_emit_surface_state(struct blorp_batch *batch,
+                         const struct brw_blorp_surface_info *surface,
+                         uint32_t *state, uint32_t state_offset,
+                         bool is_render_target)
+{
+   const struct surface_state_info ss_info = surface_state_infos[GEN_GEN];
+
+   struct isl_surf surf = surface->surf;
+
+   if (surf.dim == ISL_SURF_DIM_1D &&
+       surf.dim_layout == ISL_DIM_LAYOUT_GEN4_2D) {
+      assert(surf.logical_level0_px.height == 1);
+      surf.dim = ISL_SURF_DIM_2D;
+   }
+
+   /* Blorp doesn't support HiZ in any of the blit or slow-clear paths */
+   enum isl_aux_usage aux_usage = surface->aux_usage;
+   if (aux_usage == ISL_AUX_USAGE_HIZ)
+      aux_usage = ISL_AUX_USAGE_NONE;
+
+   const uint32_t mocs =
+      is_render_target ? batch->blorp->mocs.rb : batch->blorp->mocs.tex;
+
+   isl_surf_fill_state(batch->blorp->isl_dev, state,
+                       .surf = &surf, .view = &surface->view,
+                       .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
+                       .mocs = mocs, .clear_color = surface->clear_color,
+                       .x_offset_sa = surface->tile_x_sa,
+                       .y_offset_sa = surface->tile_y_sa);
+
+   blorp_surface_reloc(batch, state_offset + ss_info.reloc_dw * 4,
+                       surface->addr, 0);
+
+   if (aux_usage != ISL_AUX_USAGE_NONE) {
+      /* On gen7 and prior, the bottom 12 bits of the MCS base address are
+       * used to store other information. This should be ok, however, because
+       * surface buffer addresses are always 4K page aligned.
+       */
+      assert((surface->aux_addr.offset & 0xfff) == 0);
+      blorp_surface_reloc(batch, state_offset + ss_info.aux_reloc_dw * 4,
+                          surface->aux_addr, state[ss_info.aux_reloc_dw]);
+   }
+}
+
+static void
+blorp_emit_surface_states(struct blorp_batch *batch,
+                          const struct blorp_params *params)
+{
+   uint32_t bind_offset, *bind_map;
+   void *surface_maps[2];
+
+   const unsigned ss_size = GENX(RENDER_SURFACE_STATE_length) * 4;
+   const unsigned ss_align = GENX(RENDER_SURFACE_STATE_length) > 8 ?
+                             64 : 32;
+
+   unsigned num_surfaces = 1 + (params->src.addr.buffer != NULL);
+   blorp_alloc_binding_table(batch, num_surfaces, ss_size, ss_align,
+                             &bind_offset, &bind_map, surface_maps);
+
+   blorp_emit_surface_state(batch, &params->dst,
+                            surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
+                            bind_map[BLORP_RENDERBUFFER_BT_INDEX], true);
+   if (params->src.addr.buffer) {
+      blorp_emit_surface_state(batch, &params->src,
+                               surface_maps[BLORP_TEXTURE_BT_INDEX],
+                               bind_map[BLORP_TEXTURE_BT_INDEX], false);
+   }
+
+#if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), bt) {
+      bt.PointertoPSBindingTable = bind_offset;
+   }
+#else
+   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), bt) {
+      bt.PSBindingTableChange = true;
+      bt.PointertoPSBindingTable = bind_offset;
+   }
+#endif
+}
+
+static void
+blorp_emit_sampler_state(struct blorp_batch *batch,
+                         const struct blorp_params *params)
+{
+   struct GENX(SAMPLER_STATE) sampler = {
+      .MipModeFilter = MIPFILTER_NONE,
+      .MagModeFilter = MAPFILTER_LINEAR,
+      .MinModeFilter = MAPFILTER_LINEAR,
+      .MinLOD = 0,
+      .MaxLOD = 0,
+      .TCXAddressControlMode = TCM_CLAMP,
+      .TCYAddressControlMode = TCM_CLAMP,
+      .TCZAddressControlMode = TCM_CLAMP,
+      .MaximumAnisotropy = RATIO21,
+      .RAddressMinFilterRoundingEnable = true,
+      .RAddressMagFilterRoundingEnable = true,
+      .VAddressMinFilterRoundingEnable = true,
+      .VAddressMagFilterRoundingEnable = true,
+      .UAddressMinFilterRoundingEnable = true,
+      .UAddressMagFilterRoundingEnable = true,
+      .NonnormalizedCoordinateEnable = true,
+   };
+
+   uint32_t offset;
+   void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_SAMPLER_STATE,
+                                           GENX(SAMPLER_STATE_length) * 4,
+                                           32, &offset);
+   GENX(SAMPLER_STATE_pack)(NULL, state, &sampler);
+
+#if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_PS), ssp) {
+      ssp.PointertoPSSamplerState = offset;
+   }
+#else
+   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ssp) {
+      ssp.VSSamplerStateChange = true;
+      ssp.GSSamplerStateChange = true;
+      ssp.PSSamplerStateChange = true;
+      ssp.PointertoPSSamplerState = offset;
+   }
+#endif
+}
+
+/* 3DSTATE_VIEWPORT_STATE_POINTERS */
+static void
+blorp_emit_viewport_state(struct blorp_batch *batch,
+                          const struct blorp_params *params)
+{
+   uint32_t cc_vp_offset;
+
+   void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_CC_VP_STATE,
+                                           GENX(CC_VIEWPORT_length) * 4, 32,
+                                           &cc_vp_offset);
+
+   GENX(CC_VIEWPORT_pack)(batch, state,
+                          &(struct GENX(CC_VIEWPORT)) {
+                             .MinimumDepth = 0.0,
+                             .MaximumDepth = 1.0,
+                          });
+
+#if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), vsp) {
+      vsp.CCViewportPointer = cc_vp_offset;
+   }
+#else
+   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vsp) {
+      vsp.CCViewportStateChange = true;
+      vsp.PointertoCC_VIEWPORT = cc_vp_offset;
+   }
+#endif
+}
+
+
+/**
+ * \brief Execute a blit or render pass operation.
+ *
+ * To execute the operation, this function manually constructs and emits a
+ * batch to draw a rectangle primitive. The batchbuffer is flushed before
+ * constructing and after emitting the batch.
+ *
+ * This function alters no GL state.
+ */
+static void
+blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
+{
+   uint32_t blend_state_offset = 0;
+   uint32_t color_calc_state_offset = 0;
+   uint32_t depth_stencil_state_offset;
+
+   blorp_emit_vertex_buffers(batch, params);
+   blorp_emit_vertex_elements(batch, params);
+
+   emit_urb_config(batch, params);
+
+   if (params->wm_prog_data) {
+      blend_state_offset = blorp_emit_blend_state(batch, params);
+      color_calc_state_offset = blorp_emit_color_calc_state(batch, params);
+   }
+   depth_stencil_state_offset = blorp_emit_depth_stencil_state(batch, params);
+
+#if GEN_GEN <= 6
+   /* 3DSTATE_CC_STATE_POINTERS
+    *
+    * The pointer offsets are relative to
+    * CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
+    *
+    * The HiZ op doesn't use BLEND_STATE or COLOR_CALC_STATE.
+    *
+    * The dynamic state emit helpers emit their own STATE_POINTERS packets on
+    * gen7+. However, on gen6 and earlier, they're all lumped together in
+    * one CC_STATE_POINTERS packet so we have to emit that here.
+    */
+   blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), cc) {
+      cc.BLEND_STATEChange = true;
+      cc.COLOR_CALC_STATEChange = true;
+      cc.DEPTH_STENCIL_STATEChange = true;
+      cc.PointertoBLEND_STATE = blend_state_offset;
+      cc.PointertoCOLOR_CALC_STATE = color_calc_state_offset;
+      cc.PointertoDEPTH_STENCIL_STATE = depth_stencil_state_offset;
+   }
+#else
+   (void)blend_state_offset;
+   (void)color_calc_state_offset;
+   (void)depth_stencil_state_offset;
+#endif
+
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), vs);
+#if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), hs);
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_DS), DS);
+#endif
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), gs);
+   blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), ps);
+
+   if (params->wm_prog_data)
+      blorp_emit_surface_states(batch, params);
+
+   if (params->src.addr.buffer)
+      blorp_emit_sampler_state(batch, params);
+
+   blorp_emit_3dstate_multisample(batch, params->dst.surf.samples);
+
+   blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
+      mask.SampleMask = (1 << params->dst.surf.samples) - 1;
+   }
+
+   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
+    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
+    *
+    *    [DevSNB] A pipeline flush must be programmed prior to a
+    *    3DSTATE_VS command that causes the VS Function Enable to
+    *    toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
+    *    command with CS stall bit set and a post sync operation.
+    *
+    * We've already done one at the start of the BLORP operation.
+ */
+   blorp_emit(batch, GENX(3DSTATE_VS), vs);
+#if GEN_GEN >= 7
+   blorp_emit(batch, GENX(3DSTATE_HS), hs);
+   blorp_emit(batch, GENX(3DSTATE_TE), te);
+   blorp_emit(batch, GENX(3DSTATE_DS), DS);
+   blorp_emit(batch, GENX(3DSTATE_STREAMOUT), so);
+#endif
+   blorp_emit(batch, GENX(3DSTATE_GS), gs);
+
+   blorp_emit(batch, GENX(3DSTATE_CLIP), clip) {
+      clip.PerspectiveDivideDisable = true;
+   }
+
+   blorp_emit_sf_config(batch, params);
+   blorp_emit_ps_config(batch, params);
+
+   blorp_emit_viewport_state(batch, params);
+
+   if (params->depth.addr.buffer) {
+      blorp_emit_depth_stencil_config(batch, params);
+   } else {
+      blorp_emit(batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
+         db.SurfaceType = SURFTYPE_NULL;
+         db.SurfaceFormat = D32_FLOAT;
+      }
+      blorp_emit(batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hiz);
+      blorp_emit(batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
+   }
+
+   /* 3DSTATE_CLEAR_PARAMS
+    *
+    * From the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE_CLEAR_PARAMS:
+    *    [DevSNB] 3DSTATE_CLEAR_PARAMS packet must follow the DEPTH_BUFFER_STATE
+    *    packet when HiZ is enabled and the DEPTH_BUFFER_STATE changes.
+    */
+   blorp_emit(batch, GENX(3DSTATE_CLEAR_PARAMS), clear) {
+      clear.DepthClearValueValid = true;
+      clear.DepthClearValue = params->depth.clear_color.u32[0];
+   }
+
+   blorp_emit(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
+      rect.ClippedDrawingRectangleXMax = MAX2(params->x1, params->x0) - 1;
+      rect.ClippedDrawingRectangleYMax = MAX2(params->y1, params->y0) - 1;
+   }
+
+   blorp_emit(batch, GENX(3DPRIMITIVE), prim) {
+      prim.VertexAccessType = SEQUENTIAL;
+      prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
+      prim.VertexCountPerInstance = 3;
+      prim.InstanceCount = params->num_layers;
+   }
+}
diff --git a/src/intel/blorp/blorp_priv.h b/src/intel/blorp/blorp_priv.h
new file mode 100644
index 00000000000..33f197b523d
--- /dev/null
+++ b/src/intel/blorp/blorp_priv.h
@@ -0,0 +1,291 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "compiler/nir/nir.h"
+#include "brw_compiler.h"
+
+#include "blorp.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Binding table indices used by BLORP.
+ */
+enum {
+   BLORP_RENDERBUFFER_BT_INDEX,
+   BLORP_TEXTURE_BT_INDEX,
+   BLORP_NUM_BT_ENTRIES
+};
+
+enum blorp_fast_clear_op {
+   BLORP_FAST_CLEAR_OP_NONE = 0,
+   BLORP_FAST_CLEAR_OP_CLEAR,
+   BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL,
+   BLORP_FAST_CLEAR_OP_RESOLVE_FULL,
+};
+
+struct brw_blorp_surface_info
+{
+   struct isl_surf surf;
+   struct blorp_address addr;
+
+   struct isl_surf aux_surf;
+   struct blorp_address aux_addr;
+   enum isl_aux_usage aux_usage;
+
+   union isl_color_value clear_color;
+
+   struct isl_view view;
+
+   /* Z offset into a 3-D texture or slice of a 2-D array texture. */
+   uint32_t z_offset;
+
+   uint32_t tile_x_sa, tile_y_sa;
+};
+
+void
+brw_blorp_surface_info_init(struct blorp_context *blorp,
+                            struct brw_blorp_surface_info *info,
+                            const struct blorp_surf *surf,
+                            unsigned int level, unsigned int layer,
+                            enum isl_format format, bool is_render_target);
+
+
+struct brw_blorp_coord_transform
+{
+   float multiplier;
+   float offset;
+};
+
+/**
+ * Bounding rectangle telling pixel discard which pixels are not to be
+ * touched. This is needed when surfaces are configured as something other
+ * than what they really are:
+ *
+ *  - writing W-tiled stencil as Y-tiled
+ *  - writing interleaved multisampled as single sampled.
+ *
+ * See blorp_nir_discard_if_outside_rect().
+ */
+struct brw_blorp_discard_rect
+{
+   uint32_t x0;
+   uint32_t x1;
+   uint32_t y0;
+   uint32_t y1;
+};
+
+/**
+ * Grid needed for blended and scaled blits of integer formats, see
+ * blorp_nir_manual_blend_bilinear().
+ */
+struct brw_blorp_rect_grid
+{
+   float x1;
+   float y1;
+   float pad[2];
+};
+
+struct brw_blorp_wm_inputs
+{
+   struct brw_blorp_discard_rect discard_rect;
+   struct brw_blorp_rect_grid rect_grid;
+   struct brw_blorp_coord_transform coord_transform[2];
+
+   /* Minimum layer setting works for all the texture types but texture_3d
+    * for which the setting has no effect. Use the z-coordinate instead.
+    */
+   uint32_t src_z;
+
+   /* Pad out to an integral number of registers */
+   uint32_t pad[3];
+};
+
+struct brw_blorp_prog_data
+{
+   bool dispatch_8;
+   bool dispatch_16;
+
+   uint8_t first_curbe_grf_0;
+   uint8_t first_curbe_grf_2;
+
+   uint32_t ksp_offset_2;
+
+   /**
+    * True if the WM program should be run in MSDISPMODE_PERSAMPLE with more
+    * than one sample per pixel.
+    */
+   bool persample_msaa_dispatch;
+
+   /**
+    * Mask of which FS inputs are marked flat by the shader source. This is
+    * needed for setting up 3DSTATE_SF/SBE.
+    */
+   uint32_t flat_inputs;
+   unsigned num_varying_inputs;
+   uint64_t inputs_read;
+};
+
+static inline unsigned
+brw_blorp_get_urb_length(const struct brw_blorp_prog_data *prog_data)
+{
+   if (prog_data == NULL)
+      return 1;
+
+   /* From the BSpec: 3D Pipeline - Strips and Fans - 3DSTATE_SBE
+    *
+    * read_length = ceiling((max_source_attr+1)/2)
+    */
+   return MAX2((prog_data->num_varying_inputs + 1) / 2, 1);
+}
+
+struct blorp_params
+{
+   uint32_t x0;
+   uint32_t y0;
+   uint32_t x1;
+   uint32_t y1;
+   struct brw_blorp_surface_info depth;
+   uint32_t depth_format;
+   struct brw_blorp_surface_info src;
+   struct brw_blorp_surface_info dst;
+   enum blorp_hiz_op hiz_op;
+   enum blorp_fast_clear_op fast_clear_op;
+   bool color_write_disable[4];
+   struct brw_blorp_wm_inputs wm_inputs;
+   unsigned num_draw_buffers;
+   unsigned num_layers;
+   uint32_t wm_prog_kernel;
+   struct brw_blorp_prog_data *wm_prog_data;
+};
+
+void blorp_params_init(struct blorp_params *params);
+
+struct brw_blorp_blit_prog_key
+{
+   /* Number of samples per pixel that have been configured in the surface
+    * state for texturing from.
+    */
+   unsigned tex_samples;
+
+   /* MSAA layout that has been configured in the surface state for texturing
+    * from.
+    */
+   enum isl_msaa_layout tex_layout;
+
+   enum isl_aux_usage tex_aux_usage;
+
+   /* Actual number of samples per pixel in the source image. */
+   unsigned src_samples;
+
+   /* Actual MSAA layout used by the source image. */
+   enum isl_msaa_layout src_layout;
+
+   /* Number of samples per pixel that have been configured in the render
+    * target.
+    */
+   unsigned rt_samples;
+
+   /* MSAA layout that has been configured in the render target. */
+   enum isl_msaa_layout rt_layout;
+
+   /* Actual number of samples per pixel in the destination image. */
+   unsigned dst_samples;
+
+   /* Actual MSAA layout used by the destination image. */
+   enum isl_msaa_layout dst_layout;
+
+   /* Type of the data to be read from the texture (one of
+    * nir_type_(int|uint|float)).
+    */
+   nir_alu_type texture_data_type;
+
+   /* True if the source image is W tiled. If true, the surface state for the
+    * source image must be configured as Y tiled, and tex_samples must be 0.
+    */
+   bool src_tiled_w;
+
+   /* True if the destination image is W tiled. If true, the surface state
+    * for the render target must be configured as Y tiled, and rt_samples must
+    * be 0.
+    */
+   bool dst_tiled_w;
+
+   /* True if all source samples should be blended together to produce each
+    * destination pixel. If true, src_tiled_w must be false, tex_samples must
+    * equal src_samples, and tex_samples must be nonzero.
+    */
+   bool blend;
+
+   /* True if the rectangle being sent through the rendering pipeline might be
+    * larger than the destination rectangle, so the WM program should kill any
+    * pixels that are outside the destination rectangle.
+    */
+   bool use_kill;
+
+   /**
+    * True if the WM program should be run in MSDISPMODE_PERSAMPLE with more
+    * than one sample per pixel.
+    */
+   bool persample_msaa_dispatch;
+
+   /* True for scaled blitting. */
+   bool blit_scaled;
+
+   /* Scale factors between the pixel grid and the grid of samples. We're
+    * using the grid of samples for bilinear filtering in multisample scaled
+    * blits.
+    */
+   float x_scale;
+   float y_scale;
+
+   /* True for blits with filter = GL_LINEAR. */
+   bool bilinear_filter;
+};
+
+/**
+ * \name BLORP internals
+ * \{
+ *
+ * Used internally by gen6_blorp_exec() and gen7_blorp_exec().
+ */
+
+void brw_blorp_init_wm_prog_key(struct brw_wm_prog_key *wm_key);
+
+const unsigned *
+brw_blorp_compile_nir_shader(struct blorp_context *blorp, struct nir_shader *nir,
+                             const struct brw_wm_prog_key *wm_key,
+                             bool use_repclear,
+                             struct brw_blorp_prog_data *prog_data,
+                             unsigned *program_size);
+
+/** \} */
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif /* __cplusplus */
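Two of the declarations above are worth a small worked illustration. My reading, which is an assumption on my part since the code that fills these structures lives in blorp_blit.c and is not part of this hunk, is that brw_blorp_coord_transform is a per-axis linear map from destination to source coordinates, and that brw_blorp_get_urb_length() simply clamps (num_varying_inputs + 1) / 2 to at least one URB read for 3DSTATE_SBE. The sketch below is self-contained and uses illustrative names (coord_transform, apply_transform, urb_read_length) rather than the real BLORP entry points.

#include <stdio.h>

/* Mirrors struct brw_blorp_coord_transform from blorp_priv.h. */
struct coord_transform {
   float multiplier;
   float offset;
};

/* Assumed semantics: map a destination coordinate to the corresponding
 * source coordinate with a per-axis linear transform. */
static float
apply_transform(const struct coord_transform *xf, float dst_coord)
{
   return dst_coord * xf->multiplier + xf->offset;
}

/* Same rule as brw_blorp_get_urb_length(): (num_varying_inputs + 1) / 2,
 * clamped to a minimum read length of 1. */
static unsigned
urb_read_length(unsigned num_varying_inputs)
{
   unsigned len = (num_varying_inputs + 1) / 2;
   return len > 1 ? len : 1;
}

int
main(void)
{
   /* Example numbers only: a 2x upscaling blit along x whose source region
    * starts at x = 8 would use multiplier 0.5 and offset 8. */
   struct coord_transform x_xf = { .multiplier = 0.5f, .offset = 8.0f };
   printf("dst x = 100 -> src x = %.1f\n", apply_transform(&x_xf, 100.0f));

   printf("URB read length for 0 varyings: %u\n", urb_read_length(0));
   printf("URB read length for 3 varyings: %u\n", urb_read_length(3));
   return 0;
}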