aboutsummaryrefslogtreecommitdiffstats
path: root/src/intel/blorp
diff options
context:
space:
mode:
authorJason Ekstrand <[email protected]>2016-08-19 04:27:18 -0700
committerJason Ekstrand <[email protected]>2016-08-29 12:17:34 -0700
commit348509269ead23cb7f953c174d400e6e3d17d723 (patch)
treeb72ea058928189d5f56b605ca21447206df23834 /src/intel/blorp
parent8bd35d8bd2bff51b39baf559efd9f3a0e20fd2b0 (diff)
i965: Move blorp into src/intel/blorp
At this point, blorp is completely driver agnostic and can be safely moved into its own folder. Soon, we hope to start using it for doing blits in the Vulkan driver. Signed-off-by: Jason Ekstrand <[email protected]> Reviewed-by: Topi Pohjolainen <[email protected]>
Diffstat (limited to 'src/intel/blorp')
-rw-r--r--src/intel/blorp/blorp.c292
-rw-r--r--src/intel/blorp/blorp.h153
-rw-r--r--src/intel/blorp/blorp_blit.c1649
-rw-r--r--src/intel/blorp/blorp_clear.c344
-rw-r--r--src/intel/blorp/blorp_genX_exec.h1176
-rw-r--r--src/intel/blorp/blorp_priv.h291
6 files changed, 3905 insertions, 0 deletions
diff --git a/src/intel/blorp/blorp.c b/src/intel/blorp/blorp.c
new file mode 100644
index 00000000000..4dbba017489
--- /dev/null
+++ b/src/intel/blorp/blorp.c
@@ -0,0 +1,292 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+
+#include "program/prog_instruction.h"
+
+#include "blorp_priv.h"
+#include "brw_compiler.h"
+#include "brw_nir.h"
+
+void
+blorp_init(struct blorp_context *blorp, void *driver_ctx,
+ struct isl_device *isl_dev)
+{
+ blorp->driver_ctx = driver_ctx;
+ blorp->isl_dev = isl_dev;
+}
+
+void
+blorp_finish(struct blorp_context *blorp)
+{
+ blorp->driver_ctx = NULL;
+}
+
+void
+blorp_batch_init(struct blorp_context *blorp,
+ struct blorp_batch *batch, void *driver_batch)
+{
+ batch->blorp = blorp;
+ batch->driver_batch = driver_batch;
+}
+
+void
+blorp_batch_finish(struct blorp_batch *batch)
+{
+ batch->blorp = NULL;
+}
+
/* Fill out a brw_blorp_surface_info from a driver-provided blorp_surf.
 *
 * \param level             miplevel the view is based at
 * \param layer             *physical* array layer; for interleaved-array
 *                          MSAA surfaces this is pre-multiplied by the
 *                          sample count (see assertion below)
 * \param format            format override, or ISL_FORMAT_UNSUPPORTED to
 *                          use the surface's own format
 * \param is_render_target  selects render-target vs. texture view usage
 */
void
brw_blorp_surface_info_init(struct blorp_context *blorp,
                            struct brw_blorp_surface_info *info,
                            const struct blorp_surf *surf,
                            unsigned int level, unsigned int layer,
                            enum isl_format format, bool is_render_target)
{
   /* Layer is a physical layer, so if this is a 2D multisample array texture
    * using INTEL_MSAA_LAYOUT_UMS or INTEL_MSAA_LAYOUT_CMS, then it had better
    * be a multiple of num_samples.
    */
   unsigned layer_multiplier = 1;
   if (surf->surf->msaa_layout == ISL_MSAA_LAYOUT_ARRAY) {
      assert(layer % surf->surf->samples == 0);
      layer_multiplier = surf->surf->samples;
   }

   if (format == ISL_FORMAT_UNSUPPORTED)
      format = surf->surf->format;

   if (format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) {
      /* Unfortunately, ISL_FORMAT_R24_UNORM_X8_TYPELESS isn't supported as
       * a render target, which would prevent us from blitting to 24-bit
       * depth.  The miptree consists of 32 bits per pixel, arranged as 24-bit
       * depth values interleaved with 8 "don't care" bits.  Since depth
       * values don't require any blending, it doesn't matter how we interpret
       * the bit pattern as long as we copy the right amount of data, so just
       * map it as 8-bit BGRA.
       */
      format = ISL_FORMAT_B8G8R8A8_UNORM;
   } else if (surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT) {
      assert(surf->surf->format == ISL_FORMAT_R8_UINT);
      /* Prior to Broadwell, we can't render to R8_UINT */
      if (blorp->isl_dev->info->gen < 8)
         format = ISL_FORMAT_R8_UNORM;
   }

   info->surf = *surf->surf;
   info->addr = surf->addr;

   /* Only copy the auxiliary surface description when one is actually in
    * use; otherwise info->aux_surf/aux_addr are left as-is.
    */
   info->aux_usage = surf->aux_usage;
   if (info->aux_usage != ISL_AUX_USAGE_NONE) {
      info->aux_surf = *surf->aux_surf;
      info->aux_addr = surf->aux_addr;
   }

   info->clear_color = surf->clear_color;

   info->view = (struct isl_view) {
      .usage = is_render_target ? ISL_SURF_USAGE_RENDER_TARGET_BIT :
                                  ISL_SURF_USAGE_TEXTURE_BIT,
      .format = format,
      .base_level = level,
      .levels = 1,
      .channel_select = {
         ISL_CHANNEL_SELECT_RED,
         ISL_CHANNEL_SELECT_GREEN,
         ISL_CHANNEL_SELECT_BLUE,
         ISL_CHANNEL_SELECT_ALPHA,
      },
   };

   if (!is_render_target &&
       (info->surf.dim == ISL_SURF_DIM_3D ||
        info->surf.msaa_layout == ISL_MSAA_LAYOUT_ARRAY)) {
      /* 3-D textures don't support base_array layer and neither do 2-D
       * multisampled textures on IVB so we need to pass it through the
       * sampler in those cases.  These are also two cases where we are
       * guaranteed that we won't be doing any funny surface hacks.
       */
      info->view.base_array_layer = 0;
      info->view.array_len = MAX2(info->surf.logical_level0_px.depth,
                                  info->surf.logical_level0_px.array_len);
      info->z_offset = layer / layer_multiplier;
   } else {
      info->view.base_array_layer = layer / layer_multiplier;
      info->view.array_len = 1;
      info->z_offset = 0;
   }
}
+
+
+void
+blorp_params_init(struct blorp_params *params)
+{
+ memset(params, 0, sizeof(*params));
+ params->num_draw_buffers = 1;
+ params->num_layers = 1;
+}
+
+void
+brw_blorp_init_wm_prog_key(struct brw_wm_prog_key *wm_key)
+{
+ memset(wm_key, 0, sizeof(*wm_key));
+ wm_key->nr_color_regions = 1;
+ for (int i = 0; i < MAX_SAMPLERS; i++)
+ wm_key->tex.swizzles[i] = SWIZZLE_XYZW;
+}
+
/* Size callback for nir_lower_io: returns the number of bytes a uniform of
 * the given type occupies (4 bytes per 32-bit component).
 */
static int
nir_uniform_type_size(const struct glsl_type *type)
{
   /* BLORP uniforms are restricted to 32-bit scalars and vectors */
   assert(glsl_type_is_vector_or_scalar(type));
   assert(glsl_get_bit_size(type) == 32);

   const unsigned components = glsl_get_vector_elements(type);
   return components * 4;
}
+
/* Compile a blorp fragment shader from NIR to native code.
 *
 * Takes ownership of \p nir: it is stolen into a local ralloc context and
 * may be replaced wholesale by brw_preprocess_nir.  On return, the relevant
 * fields of the compiler's wm_prog_data have been copied into \p prog_data
 * and *program_size holds the kernel size in bytes.
 *
 * NOTE(review): mem_ctx is never freed here and the returned kernel is
 * allocated out of it -- confirm that callers copy the kernel (e.g. into a
 * program cache) and whether this context should be freed afterwards.
 */
const unsigned *
brw_blorp_compile_nir_shader(struct blorp_context *blorp, struct nir_shader *nir,
                             const struct brw_wm_prog_key *wm_key,
                             bool use_repclear,
                             struct brw_blorp_prog_data *prog_data,
                             unsigned *program_size)
{
   const struct brw_compiler *compiler = blorp->compiler;

   void *mem_ctx = ralloc_context(NULL);

   /* Calling brw_preprocess_nir and friends is destructive and, if cloning is
    * enabled, may end up completely replacing the nir_shader. Therefore, we
    * own it and might as well put it in our context for easy cleanup.
    */
   ralloc_steal(mem_ctx, nir);
   nir->options =
      compiler->glsl_compiler_options[MESA_SHADER_FRAGMENT].NirOptions;

   struct brw_wm_prog_data wm_prog_data;
   memset(&wm_prog_data, 0, sizeof(wm_prog_data));

   /* BLORP shaders carry no push constants */
   wm_prog_data.base.nr_params = 0;
   wm_prog_data.base.param = NULL;

   /* BLORP always just uses the first two binding table entries */
   wm_prog_data.binding_table.render_target_start = BLORP_RENDERBUFFER_BT_INDEX;
   wm_prog_data.base.binding_table.texture_start = BLORP_TEXTURE_BT_INDEX;

   nir = brw_preprocess_nir(compiler, nir);
   nir_remove_dead_variables(nir, nir_var_shader_in);
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Uniforms are required to be lowered before going into compile_fs. For
    * BLORP, we'll assume that whoever builds the shader sets the location
    * they want so we just need to lower them and figure out how many we have
    * in total.
    */
   nir->num_uniforms = 0;
   nir_foreach_variable(var, &nir->uniforms) {
      var->data.driver_location = var->data.location;
      unsigned end = var->data.location + nir_uniform_type_size(var->type);
      nir->num_uniforms = MAX2(nir->num_uniforms, end);
   }
   nir_lower_io(nir, nir_var_uniform, nir_uniform_type_size);

   const unsigned *program =
      brw_compile_fs(compiler, blorp->driver_ctx, mem_ctx,
                     wm_key, &wm_prog_data, nir,
                     NULL, -1, -1, false, use_repclear, program_size, NULL);

   /* Copy the relevant bits of wm_prog_data over into the blorp prog data */
   prog_data->dispatch_8 = wm_prog_data.dispatch_8;
   prog_data->dispatch_16 = wm_prog_data.dispatch_16;
   prog_data->first_curbe_grf_0 = wm_prog_data.base.dispatch_grf_start_reg;
   prog_data->first_curbe_grf_2 = wm_prog_data.dispatch_grf_start_reg_2;
   prog_data->ksp_offset_2 = wm_prog_data.prog_offset_2;
   prog_data->persample_msaa_dispatch = wm_prog_data.persample_dispatch;
   prog_data->flat_inputs = wm_prog_data.flat_inputs;
   prog_data->num_varying_inputs = wm_prog_data.num_varying_inputs;
   prog_data->inputs_read = nir->info.inputs_read;

   /* Double-check that no uniforms were promoted to push constants */
   assert(wm_prog_data.base.nr_params == 0);

   return program;
}
+
/* Emit a HiZ operation (fast depth clear, depth resolve, or HiZ resolve)
 * on the given miplevel/layer of a depth surface.  See the comment block
 * below for the required 8x4 rectangle alignment.
 */
void
blorp_gen6_hiz_op(struct blorp_batch *batch,
                  struct blorp_surf *surf, unsigned level, unsigned layer,
                  enum blorp_hiz_op op)
{
   struct blorp_params params;
   blorp_params_init(&params);

   params.hiz_op = op;

   brw_blorp_surface_info_init(batch->blorp, &params.depth, surf, level, layer,
                               surf->surf->format, true);

   /* Align the rectangle primitive to 8x4 pixels.
    *
    * During fast depth clears, the emitted rectangle primitive must be
    * aligned to 8x4 pixels.  From the Ivybridge PRM, Vol 2 Part 1 Section
    * 11.5.3.1 Depth Buffer Clear (and the matching section in the Sandybridge
    * PRM):
    *     If Number of Multisamples is NUMSAMPLES_1, the rectangle must be
    *     aligned to an 8x4 pixel block relative to the upper left corner
    *     of the depth buffer [...]
    *
    * For hiz resolves, the rectangle must also be 8x4 aligned. Item
    * WaHizAmbiguate8x4Aligned from the Haswell workarounds page and the
    * Ivybridge simulator require the alignment.
    *
    * To be safe, let's just align the rect for all hiz operations and all
    * hardware generations.
    *
    * However, for some miptree slices of a Z24 texture, emitting an 8x4
    * aligned rectangle that covers the slice may clobber adjacent slices if
    * we strictly adhered to the texture alignments specified in the PRM.  The
    * Ivybridge PRM, Section "Alignment Unit Size", states that
    * SURFACE_STATE.Surface_Horizontal_Alignment should be 4 for Z24 surfaces,
    * not 8. But commit 1f112cc increased the alignment from 4 to 8, which
    * prevents the clobbering.
    */
   params.x1 = minify(params.depth.surf.logical_level0_px.width,
                      params.depth.view.base_level);
   params.y1 = minify(params.depth.surf.logical_level0_px.height,
                      params.depth.view.base_level);
   params.x1 = ALIGN(params.x1, 8);
   params.y1 = ALIGN(params.y1, 4);

   if (params.depth.view.base_level == 0) {
      /* TODO: What about MSAA? */
      params.depth.surf.logical_level0_px.width = params.x1;
      params.depth.surf.logical_level0_px.height = params.y1;
   }

   /* The destination carries only the dimensions/samples; the actual work
    * happens through the depth surface.
    */
   params.dst.surf.samples = params.depth.surf.samples;
   params.dst.surf.logical_level0_px = params.depth.surf.logical_level0_px;
   params.depth_format = isl_format_get_depth_format(surf->surf->format, false);

   batch->blorp->exec(batch, &params);
}
diff --git a/src/intel/blorp/blorp.h b/src/intel/blorp/blorp.h
new file mode 100644
index 00000000000..a4fcfdfcf70
--- /dev/null
+++ b/src/intel/blorp/blorp.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "isl/isl.h"
+
+struct brw_context;
+struct brw_wm_prog_key;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
struct blorp_batch;
struct blorp_params;

struct blorp_context {
   /* Opaque driver context, passed back through the callbacks below */
   void *driver_ctx;

   const struct isl_device *isl_dev;

   const struct brw_compiler *compiler;

   /* Memory object control state values for textures, render buffers and
    * vertex buffers.  NOTE(review): presumably hardware-generation-specific
    * MOCS encodings filled in by the driver -- confirm with the driver's
    * setup code.
    */
   struct {
      uint32_t tex;
      uint32_t rb;
      uint32_t vb;
   } mocs;

   /* Driver callback: look up a previously compiled shader by cache key.
    * NOTE(review): the bool return presumably indicates a cache hit --
    * confirm against a driver implementation.
    */
   bool (*lookup_shader)(struct blorp_context *blorp,
                         const void *key, uint32_t key_size,
                         uint32_t *kernel_out, void *prog_data_out);
   /* Driver callback: insert a freshly compiled shader into the cache */
   void (*upload_shader)(struct blorp_context *blorp,
                         const void *key, uint32_t key_size,
                         const void *kernel, uint32_t kernel_size,
                         const void *prog_data, uint32_t prog_data_size,
                         uint32_t *kernel_out, void *prog_data_out);
   /* Driver callback: emit the state/draw that executes \p params */
   void (*exec)(struct blorp_batch *batch, const struct blorp_params *params);
};

/* Context lifetime: see blorp.c */
void blorp_init(struct blorp_context *blorp, void *driver_ctx,
                struct isl_device *isl_dev);
void blorp_finish(struct blorp_context *blorp);
+
/* Ties a blorp context to the driver's batch/command-buffer object for the
 * duration of a group of blorp operations.
 */
struct blorp_batch {
   struct blorp_context *blorp;
   void *driver_batch;   /* opaque driver batch, handed to the exec callback */
};

void blorp_batch_init(struct blorp_context *blorp, struct blorp_batch *batch,
                      void *driver_batch);
void blorp_batch_finish(struct blorp_batch *batch);
+
/* A driver-interpreted GPU address: an opaque buffer object plus a byte
 * offset.  NOTE(review): read_domains/write_domain look like GEM relocation
 * domains -- confirm against the driver's surface-emission code.
 */
struct blorp_address {
   void *buffer;
   uint32_t read_domains;
   uint32_t write_domain;
   uint32_t offset;
};
+
/* Description of a source or destination surface for a blorp operation */
struct blorp_surf
{
   const struct isl_surf *surf;
   struct blorp_address addr;

   /* Optional auxiliary surface; only read when aux_usage is not
    * ISL_AUX_USAGE_NONE (see brw_blorp_surface_info_init).
    */
   const struct isl_surf *aux_surf;
   struct blorp_address aux_addr;
   enum isl_aux_usage aux_usage;

   union isl_color_value clear_color;
};
+
/* Blit a (possibly scaled/mirrored/filtered) rectangle from src to dst.
 * Coordinates are floats in pixels; mirror_x/mirror_y flip the respective
 * axis.  NOTE(review): filter is presumably a GL filter enum -- confirm
 * against blorp_blit.c.
 */
void
blorp_blit(struct blorp_batch *batch,
           const struct blorp_surf *src_surf,
           unsigned src_level, unsigned src_layer,
           enum isl_format src_format, int src_swizzle,
           const struct blorp_surf *dst_surf,
           unsigned dst_level, unsigned dst_layer,
           enum isl_format dst_format,
           float src_x0, float src_y0,
           float src_x1, float src_y1,
           float dst_x0, float dst_y0,
           float dst_x1, float dst_y1,
           uint32_t filter, bool mirror_x, bool mirror_y);

/* Fast-clear a rectangle of a surface using its clear color */
void
blorp_fast_clear(struct blorp_batch *batch,
                 const struct blorp_surf *surf,
                 uint32_t level, uint32_t layer,
                 uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1);

/* Slow (shader-based) clear of a rectangle to the given color; individual
 * channels can be masked off via color_write_disable.
 */
void
blorp_clear(struct blorp_batch *batch,
            const struct blorp_surf *surf,
            uint32_t level, uint32_t layer,
            uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1,
            enum isl_format format, union isl_color_value clear_color,
            bool color_write_disable[4]);

/* Resolve a CCS-compressed color surface to its resolved state */
void
blorp_ccs_resolve(struct blorp_batch *batch,
                  struct blorp_surf *surf, enum isl_format format);
+
/**
 * For an overview of the HiZ operations, see the following sections of the
 * Sandy Bridge PRM, Volume 1, Part 2:
 *   - 7.5.3.1 Depth Buffer Clear
 *   - 7.5.3.2 Depth Buffer Resolve
 *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
 *
 * Of these, two get entered in the resolve map as needing to be done to the
 * buffer: depth resolve and hiz resolve.
 */
enum blorp_hiz_op {
   BLORP_HIZ_OP_NONE,
   BLORP_HIZ_OP_DEPTH_CLEAR,
   BLORP_HIZ_OP_DEPTH_RESOLVE,
   BLORP_HIZ_OP_HIZ_RESOLVE,
};

void
blorp_gen6_hiz_op(struct blorp_batch *batch,
                  struct blorp_surf *surf, unsigned level, unsigned layer,
                  enum blorp_hiz_op op);
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif /* __cplusplus */
diff --git a/src/intel/blorp/blorp_blit.c b/src/intel/blorp/blorp_blit.c
new file mode 100644
index 00000000000..170c3816e38
--- /dev/null
+++ b/src/intel/blorp/blorp_blit.c
@@ -0,0 +1,1649 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "program/prog_instruction.h"
+#include "compiler/nir/nir_builder.h"
+
+#include "blorp_priv.h"
+#include "brw_meta_util.h"
+
+#define FILE_DEBUG_FLAG DEBUG_BLORP
+
/**
 * Enum to specify the order of arguments in a sampler message.
 * U/V/R are texel coordinates; the remaining entries follow the Intel
 * sampler payload layout (sample index, MCS data, or a zero filler).
 */
enum sampler_message_arg
{
   SAMPLER_MESSAGE_ARG_U_FLOAT,
   SAMPLER_MESSAGE_ARG_V_FLOAT,
   SAMPLER_MESSAGE_ARG_U_INT,
   SAMPLER_MESSAGE_ARG_V_INT,
   SAMPLER_MESSAGE_ARG_R_INT,
   SAMPLER_MESSAGE_ARG_SI_INT,
   SAMPLER_MESSAGE_ARG_MCS_INT,
   SAMPLER_MESSAGE_ARG_ZERO_INT,
};
+
/* NIR variables shared by the blit shader-building helpers below */
struct brw_blorp_blit_vars {
   /* Input values from brw_blorp_wm_inputs */
   nir_variable *v_discard_rect;      /* dst-space rect for kill-based clipping */
   nir_variable *v_rect_grid;
   nir_variable *v_coord_transform;   /* dst->src x/y scale and offset */
   nir_variable *v_src_z;             /* Z / array-layer for 3-D and array sources */

   /* gl_FragCoord */
   nir_variable *frag_coord;

   /* gl_FragColor */
   nir_variable *color_out;
};
+
/* Create the NIR variables backing struct brw_blorp_blit_vars.
 *
 * Each brw_blorp_wm_inputs field becomes a flat-interpolated shader input
 * whose varying slot is derived from the field's byte offset within the
 * struct (one vec4 slot per 16 bytes).
 */
static void
brw_blorp_blit_vars_init(nir_builder *b, struct brw_blorp_blit_vars *v,
                         const struct brw_blorp_blit_prog_key *key)
{
   /* Blended and scaled blits never use pixel discard. */
   assert(!key->use_kill || !(key->blend && key->blit_scaled));

#define LOAD_INPUT(name, type)\
   v->v_##name = nir_variable_create(b->shader, nir_var_shader_in, \
                                     type, #name); \
   v->v_##name->data.interpolation = INTERP_MODE_FLAT; \
   v->v_##name->data.location = VARYING_SLOT_VAR0 + \
      offsetof(struct brw_blorp_wm_inputs, name) / (4 * sizeof(float));

   LOAD_INPUT(discard_rect, glsl_vec4_type())
   LOAD_INPUT(rect_grid, glsl_vec4_type())
   LOAD_INPUT(coord_transform, glsl_vec4_type())
   LOAD_INPUT(src_z, glsl_uint_type())

#undef LOAD_INPUT

   v->frag_coord = nir_variable_create(b->shader, nir_var_shader_in,
                                       glsl_vec4_type(), "gl_FragCoord");
   v->frag_coord->data.location = VARYING_SLOT_POS;
   v->frag_coord->data.origin_upper_left = true;

   v->color_out = nir_variable_create(b->shader, nir_var_shader_out,
                                      glsl_vec4_type(), "gl_FragColor");
   v->color_out->data.location = FRAG_RESULT_COLOR;
}
+
+static nir_ssa_def *
+blorp_blit_get_frag_coords(nir_builder *b,
+ const struct brw_blorp_blit_prog_key *key,
+ struct brw_blorp_blit_vars *v)
+{
+ nir_ssa_def *coord = nir_f2i(b, nir_load_var(b, v->frag_coord));
+
+ if (key->persample_msaa_dispatch) {
+ return nir_vec3(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1),
+ nir_load_sample_id(b));
+ } else {
+ return nir_vec2(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1));
+ }
+}
+
+/**
+ * Emit code to translate from destination (X, Y) coordinates to source (X, Y)
+ * coordinates.
+ */
+static nir_ssa_def *
+blorp_blit_apply_transform(nir_builder *b, nir_ssa_def *src_pos,
+ struct brw_blorp_blit_vars *v)
+{
+ nir_ssa_def *coord_transform = nir_load_var(b, v->v_coord_transform);
+
+ nir_ssa_def *offset = nir_vec2(b, nir_channel(b, coord_transform, 1),
+ nir_channel(b, coord_transform, 3));
+ nir_ssa_def *mul = nir_vec2(b, nir_channel(b, coord_transform, 0),
+ nir_channel(b, coord_transform, 2));
+
+ return nir_ffma(b, src_pos, mul, offset);
+}
+
+static inline void
+blorp_nir_discard_if_outside_rect(nir_builder *b, nir_ssa_def *pos,
+ struct brw_blorp_blit_vars *v)
+{
+ nir_ssa_def *c0, *c1, *c2, *c3;
+ nir_ssa_def *discard_rect = nir_load_var(b, v->v_discard_rect);
+ nir_ssa_def *dst_x0 = nir_channel(b, discard_rect, 0);
+ nir_ssa_def *dst_x1 = nir_channel(b, discard_rect, 1);
+ nir_ssa_def *dst_y0 = nir_channel(b, discard_rect, 2);
+ nir_ssa_def *dst_y1 = nir_channel(b, discard_rect, 3);
+
+ c0 = nir_ult(b, nir_channel(b, pos, 0), dst_x0);
+ c1 = nir_uge(b, nir_channel(b, pos, 0), dst_x1);
+ c2 = nir_ult(b, nir_channel(b, pos, 1), dst_y0);
+ c3 = nir_uge(b, nir_channel(b, pos, 1), dst_y1);
+
+ nir_ssa_def *oob = nir_ior(b, nir_ior(b, c0, c1), nir_ior(b, c2, c3));
+
+ nir_intrinsic_instr *discard =
+ nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if);
+ discard->src[0] = nir_src_for_ssa(oob);
+ nir_builder_instr_insert(b, &discard->instr);
+}
+
/* Common setup for all of blorp's texture instructions.
 *
 * Allocates a nir_tex_instr with \p num_srcs sources and fills in the
 * fields shared by every blorp texture op; src[0] is always the (x, y, z)
 * coordinate, with z pulled from the src_z input.  The caller fills in the
 * remaining sources and inserts the instruction.
 */
static nir_tex_instr *
blorp_create_nir_tex_instr(nir_builder *b, struct brw_blorp_blit_vars *v,
                           nir_texop op, nir_ssa_def *pos, unsigned num_srcs,
                           nir_alu_type dst_type)
{
   nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);

   tex->op = op;

   tex->dest_type = dst_type;
   tex->is_array = false;
   tex->is_shadow = false;

   /* Blorp only has one texture and it's bound at unit 0 */
   tex->texture = NULL;
   tex->sampler = NULL;
   tex->texture_index = 0;
   tex->sampler_index = 0;

   /* To properly handle 3-D and 2-D array textures, we pull the Z component
    * from an input. TODO: This is a bit magic; we should probably make this
    * more explicit in the future.
    */
   assert(pos->num_components >= 2);
   pos = nir_vec3(b, nir_channel(b, pos, 0), nir_channel(b, pos, 1),
                  nir_load_var(b, v->v_src_z));

   tex->src[0].src_type = nir_tex_src_coord;
   tex->src[0].src = nir_src_for_ssa(pos);
   tex->coord_components = 3;

   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);

   return tex;
}
+
/* Emit an ordinary texture sample (texop_tex) at LOD 0 from a
 * single-sampled 2-D surface; returns the 4-component result.
 */
static nir_ssa_def *
blorp_nir_tex(nir_builder *b, struct brw_blorp_blit_vars *v,
              nir_ssa_def *pos, nir_alu_type dst_type)
{
   nir_tex_instr *tex =
      blorp_create_nir_tex_instr(b, v, nir_texop_tex, pos, 2, dst_type);

   assert(pos->num_components == 2);
   tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
   tex->src[1].src_type = nir_tex_src_lod;
   tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));

   nir_builder_instr_insert(b, &tex->instr);

   return &tex->dest.ssa;
}
+
/* Emit a texel fetch (texop_txf) at LOD 0.  Uses a 3-D sampler dim so the
 * Z coordinate supplied by blorp_create_nir_tex_instr is honored.
 */
static nir_ssa_def *
blorp_nir_txf(nir_builder *b, struct brw_blorp_blit_vars *v,
              nir_ssa_def *pos, nir_alu_type dst_type)
{
   nir_tex_instr *tex =
      blorp_create_nir_tex_instr(b, v, nir_texop_txf, pos, 2, dst_type);

   tex->sampler_dim = GLSL_SAMPLER_DIM_3D;
   tex->src[1].src_type = nir_tex_src_lod;
   tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));

   nir_builder_instr_insert(b, &tex->instr);

   return &tex->dest.ssa;
}
+
/* Emit a multisampled texel fetch (texop_txf_ms).  The sample index comes
 * from pos[2] when pos has three components (zero otherwise), and
 * previously fetched MCS data may be supplied via \p mcs.
 */
static nir_ssa_def *
blorp_nir_txf_ms(nir_builder *b, struct brw_blorp_blit_vars *v,
                 nir_ssa_def *pos, nir_ssa_def *mcs, nir_alu_type dst_type)
{
   nir_tex_instr *tex =
      blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms, pos,
                                 mcs != NULL ? 3 : 2, dst_type);

   tex->sampler_dim = GLSL_SAMPLER_DIM_MS;

   tex->src[1].src_type = nir_tex_src_ms_index;
   if (pos->num_components == 2) {
      tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
   } else {
      assert(pos->num_components == 3);
      tex->src[1].src = nir_src_for_ssa(nir_channel(b, pos, 2));
   }

   if (mcs) {
      tex->src[2].src_type = nir_tex_src_ms_mcs;
      tex->src[2].src = nir_src_for_ssa(mcs);
   }

   nir_builder_instr_insert(b, &tex->instr);

   return &tex->dest.ssa;
}
+
/* Fetch the MCS (multisample control surface) value for a pixel, for later
 * use with blorp_nir_txf_ms.
 */
static nir_ssa_def *
blorp_nir_txf_ms_mcs(nir_builder *b, struct brw_blorp_blit_vars *v, nir_ssa_def *pos)
{
   nir_tex_instr *tex =
      blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms_mcs,
                                 pos, 1, nir_type_int);

   tex->sampler_dim = GLSL_SAMPLER_DIM_MS;

   nir_builder_instr_insert(b, &tex->instr);

   return &tex->dest.ssa;
}
+
+static nir_ssa_def *
+nir_mask_shift_or(struct nir_builder *b, nir_ssa_def *dst, nir_ssa_def *src,
+ uint32_t src_mask, int src_left_shift)
+{
+ nir_ssa_def *masked = nir_iand(b, src, nir_imm_int(b, src_mask));
+
+ nir_ssa_def *shifted;
+ if (src_left_shift > 0) {
+ shifted = nir_ishl(b, masked, nir_imm_int(b, src_left_shift));
+ } else if (src_left_shift < 0) {
+ shifted = nir_ushr(b, masked, nir_imm_int(b, -src_left_shift));
+ } else {
+ assert(src_left_shift == 0);
+ shifted = masked;
+ }
+
+ return nir_ior(b, dst, shifted);
+}
+
/**
 * Emit code to compensate for the difference between Y and W tiling.
 *
 * This code modifies the X and Y coordinates according to the formula:
 *
 *   (X', Y', S') = detile(W-MAJOR, tile(Y-MAJOR, X, Y, S))
 *
 * (See brw_blorp_build_nir_shader).
 */
static inline nir_ssa_def *
blorp_nir_retile_y_to_w(nir_builder *b, nir_ssa_def *pos)
{
   assert(pos->num_components == 2);
   nir_ssa_def *x_Y = nir_channel(b, pos, 0);
   nir_ssa_def *y_Y = nir_channel(b, pos, 1);

   /* Given X and Y coordinates that describe an address using Y tiling,
    * translate to the X and Y coordinates that describe the same address
    * using W tiling.
    *
    * If we break down the low order bits of X and Y, using a
    * single letter to represent each low-order bit:
    *
    *   X = A << 7 | 0bBCDEFGH
    *   Y = J << 5 | 0bKLMNP                                       (1)
    *
    * Then we can apply the Y tiling formula to see the memory offset being
    * addressed:
    *
    *   offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH       (2)
    *
    * If we apply the W detiling formula to this memory location, we find
    * that the corresponding X' and Y' coordinates are:
    *
    *   X' = A << 6 | 0bBCDPFH                                     (3)
    *   Y' = J << 6 | 0bKLMNEG
    *
    * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'),
    * we need to make the following computation:
    *
    *   X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1         (4)
    *   Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1
    */
   nir_ssa_def *x_W = nir_imm_int(b, 0);
   x_W = nir_mask_shift_or(b, x_W, x_Y, 0xfffffff4, -1);
   x_W = nir_mask_shift_or(b, x_W, y_Y, 0x1, 2);
   x_W = nir_mask_shift_or(b, x_W, x_Y, 0x1, 0);

   nir_ssa_def *y_W = nir_imm_int(b, 0);
   y_W = nir_mask_shift_or(b, y_W, y_Y, 0xfffffffe, 1);
   y_W = nir_mask_shift_or(b, y_W, x_Y, 0x8, -2);
   y_W = nir_mask_shift_or(b, y_W, x_Y, 0x2, -1);

   return nir_vec2(b, x_W, y_W);
}
+
/**
 * Emit code to compensate for the difference between Y and W tiling.
 *
 * This code modifies the X and Y coordinates according to the formula:
 *
 *   (X', Y', S') = detile(Y-MAJOR, tile(W-MAJOR, X, Y, S))
 *
 * (See brw_blorp_build_nir_shader).  This is the exact inverse of
 * blorp_nir_retile_y_to_w above.
 */
static inline nir_ssa_def *
blorp_nir_retile_w_to_y(nir_builder *b, nir_ssa_def *pos)
{
   assert(pos->num_components == 2);
   nir_ssa_def *x_W = nir_channel(b, pos, 0);
   nir_ssa_def *y_W = nir_channel(b, pos, 1);

   /* Applying the same logic as above, but in reverse, we obtain the
    * formulas:
    *
    * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1
    * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2
    */
   nir_ssa_def *x_Y = nir_imm_int(b, 0);
   x_Y = nir_mask_shift_or(b, x_Y, x_W, 0xfffffffa, 1);
   x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x2, 2);
   x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x1, 1);
   x_Y = nir_mask_shift_or(b, x_Y, x_W, 0x1, 0);

   nir_ssa_def *y_Y = nir_imm_int(b, 0);
   y_Y = nir_mask_shift_or(b, y_Y, y_W, 0xfffffffc, -1);
   y_Y = nir_mask_shift_or(b, y_Y, x_W, 0x4, -2);

   return nir_vec2(b, x_Y, y_Y);
}
+
/**
 * Emit code to compensate for the difference between MSAA and non-MSAA
 * surfaces.
 *
 * This code modifies the X and Y coordinates according to the formula:
 *
 *   (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S)
 *
 * (See brw_blorp_blit_program).  For the interleaved (IMS) layout, the
 * sample index is folded into the X/Y bits per the per-sample-count
 * formulas below; other layouts need no coordinate translation.
 */
static inline nir_ssa_def *
blorp_nir_encode_msaa(nir_builder *b, nir_ssa_def *pos,
                      unsigned num_samples, enum isl_msaa_layout layout)
{
   assert(pos->num_components == 2 || pos->num_components == 3);

   switch (layout) {
   case ISL_MSAA_LAYOUT_NONE:
      assert(pos->num_components == 2);
      return pos;
   case ISL_MSAA_LAYOUT_ARRAY:
      /* No translation needed */
      return pos;
   case ISL_MSAA_LAYOUT_INTERLEAVED: {
      nir_ssa_def *x_in = nir_channel(b, pos, 0);
      nir_ssa_def *y_in = nir_channel(b, pos, 1);
      /* With only (X, Y) given, the sample index is implicitly zero */
      nir_ssa_def *s_in = pos->num_components == 2 ? nir_imm_int(b, 0) :
                                                     nir_channel(b, pos, 2);

      nir_ssa_def *x_out = nir_imm_int(b, 0);
      nir_ssa_def *y_out = nir_imm_int(b, 0);
      switch (num_samples) {
      case 2:
      case 4:
         /* encode_msaa(2, IMS, X, Y, S) = (X', Y', 0)
          *   where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
          *         Y' = Y
          *
          * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
          *   where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
          *         Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
          */
         x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 1);
         x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
         x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
         if (num_samples == 2) {
            y_out = y_in;
         } else {
            y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1);
            y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
            y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
         }
         break;

      case 8:
         /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
          *   where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
          *              | (X & 0b1)
          *         Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
          */
         x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2);
         x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0);
         x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
         x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
         y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1);
         y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
         y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
         break;

      case 16:
         /* encode_msaa(16, IMS, X, Y, S) = (X', Y', 0)
          *   where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
          *              | (X & 0b1)
          *         Y' = (Y & ~0b1) << 2 | (S & 0b1000) >> 1 | (S & 0b10)
          *              | (Y & 0b1)
          */
         x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2);
         x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0);
         x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
         x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
         y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 2);
         y_out = nir_mask_shift_or(b, y_out, s_in, 0x8, -1);
         y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
         y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
         break;

      default:
         unreachable("Invalid number of samples for IMS layout");
      }

      return nir_vec2(b, x_out, y_out);
   }

   default:
      unreachable("Invalid MSAA layout");
   }
}
+
/**
 * Emit code to compensate for the difference between MSAA and non-MSAA
 * surfaces.
 *
 * This code modifies the X and Y coordinates according to the formula:
 *
 * (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S)
 *
 * (See brw_blorp_blit_program).
 */
static inline nir_ssa_def *
blorp_nir_decode_msaa(nir_builder *b, nir_ssa_def *pos,
                      unsigned num_samples, enum isl_msaa_layout layout)
{
   /* Input is (X, Y) for interleaved layouts and (X, Y, S) otherwise. */
   assert(pos->num_components == 2 || pos->num_components == 3);

   switch (layout) {
   case ISL_MSAA_LAYOUT_NONE:
      /* No translation necessary, and S should already be zero. */
      assert(pos->num_components == 2);
      return pos;
   case ISL_MSAA_LAYOUT_ARRAY:
      /* No translation necessary. */
      return pos;
   case ISL_MSAA_LAYOUT_INTERLEAVED: {
      /* IMS surfaces interleave all samples in the X/Y plane, so the
       * incoming position is 2-D; the sample index S is reconstructed from
       * the low-order coordinate bits below.
       */
      assert(pos->num_components == 2);

      nir_ssa_def *x_in = nir_channel(b, pos, 0);
      nir_ssa_def *y_in = nir_channel(b, pos, 1);

      /* Accumulators: each nir_mask_shift_or(b, dst, src, mask, shift) ORs
       * (src & mask), shifted left by `shift` (negative = right), into dst.
       */
      nir_ssa_def *x_out = nir_imm_int(b, 0);
      nir_ssa_def *y_out = nir_imm_int(b, 0);
      nir_ssa_def *s_out = nir_imm_int(b, 0);
      switch (num_samples) {
      case 2:
      case 4:
         /* decode_msaa(2, IMS, X, Y, 0) = (X', Y', S)
          * where X' = (X & ~0b11) >> 1 | (X & 0b1)
          * S = (X & 0b10) >> 1
          *
          * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
          * where X' = (X & ~0b11) >> 1 | (X & 0b1)
          * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
          * S = (Y & 0b10) | (X & 0b10) >> 1
          */
         x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffc, -1);
         x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
         if (num_samples == 2) {
            /* 2x IMS interleaves only in X; Y passes through unchanged. */
            y_out = y_in;
            s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
         } else {
            y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1);
            y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
            s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
            s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
         }
         break;

      case 8:
         /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
          * where X' = (X & ~0b111) >> 2 | (X & 0b1)
          * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
          * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
          */
         x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2);
         x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
         y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1);
         y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
         s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0);
         s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
         s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
         break;

      case 16:
         /* decode_msaa(16, IMS, X, Y, 0) = (X', Y', S)
          * where X' = (X & ~0b111) >> 2 | (X & 0b1)
          * Y' = (Y & ~0b111) >> 2 | (Y & 0b1)
          * S = (Y & 0b100) << 1 | (X & 0b100) |
          * (Y & 0b10) | (X & 0b10) >> 1
          */
         x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2);
         x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
         y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffff8, -2);
         y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
         s_out = nir_mask_shift_or(b, s_out, y_in, 0x4, 1);
         s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0);
         s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
         s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
         break;

      default:
         unreachable("Invalid number of samples for IMS layout");
      }

      return nir_vec3(b, x_out, y_out, s_out);
   }

   default:
      unreachable("Invalid MSAA layout");
   }
}
+
/**
 * Count the number of trailing 1 bits in the given value.  For example:
 *
 * count_trailing_one_bits(0) == 0
 * count_trailing_one_bits(7) == 3
 * count_trailing_one_bits(11) == 2
 */
static inline int count_trailing_one_bits(unsigned value)
{
#ifdef HAVE___BUILTIN_CTZ
   /* __builtin_ctz(0) is undefined behavior, so the all-ones input (where
    * ~value == 0) must be handled explicitly: every bit is a trailing one.
    */
   return ~value ? __builtin_ctz(~value) : (int)(8 * sizeof(value));
#else
   /* value & ~(value + 1) isolates the trailing run of 1 bits; counting
    * the set bits of that mask gives the run length.  This form already
    * handles the all-ones input correctly.
    */
   return _mesa_bitcount(value & ~(value + 1));
#endif
}
+
/* Fetch every sample of a multisampled texel and emit code that averages
 * them, storing the blended result in a local "color" variable which is
 * returned.  Used for non-scaled multisample resolves where the hardware
 * cannot blend for us.
 */
static nir_ssa_def *
blorp_nir_manual_blend_average(nir_builder *b, struct brw_blorp_blit_vars *v,
                               nir_ssa_def *pos, unsigned tex_samples,
                               enum isl_aux_usage tex_aux_usage,
                               nir_alu_type dst_type)
{
   /* If non-null, this is the outer-most if statement */
   nir_if *outer_if = NULL;

   nir_variable *color =
      nir_local_variable_create(b->impl, glsl_vec4_type(), "color");

   /* The MCS value is only needed (and only fetched once, up front) when
    * the texture is MCS-compressed.
    */
   nir_ssa_def *mcs = NULL;
   if (tex_aux_usage == ISL_AUX_USAGE_MCS)
      mcs = blorp_nir_txf_ms_mcs(b, v, pos);

   /* We add together samples using a binary tree structure, e.g. for 4x MSAA:
    *
    *   result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4
    *
    * This ensures that when all samples have the same value, no numerical
    * precision is lost, since each addition operation always adds two equal
    * values, and summing two equal floating point values does not lose
    * precision.
    *
    * We perform this computation by treating the texture_data array as a
    * stack and performing the following operations:
    *
    * - push sample 0 onto stack
    * - push sample 1 onto stack
    * - add top two stack entries
    * - push sample 2 onto stack
    * - push sample 3 onto stack
    * - add top two stack entries
    * - add top two stack entries
    * - divide top stack entry by 4
    *
    * Note that after pushing sample i onto the stack, the number of add
    * operations we do is equal to the number of trailing 1 bits in i.  This
    * works provided the total number of samples is a power of two, which it
    * always is for i965.
    *
    * For integer formats, we replace the add operations with average
    * operations and skip the final division.
    */
   nir_ssa_def *texture_data[5];
   unsigned stack_depth = 0;
   for (unsigned i = 0; i < tex_samples; ++i) {
      assert(stack_depth == _mesa_bitcount(i)); /* Loop invariant */

      /* Push sample i onto the stack */
      assert(stack_depth < ARRAY_SIZE(texture_data));

      nir_ssa_def *ms_pos = nir_vec3(b, nir_channel(b, pos, 0),
                                     nir_channel(b, pos, 1),
                                     nir_imm_int(b, i));
      texture_data[stack_depth++] = blorp_nir_txf_ms(b, v, ms_pos, mcs, dst_type);

      if (i == 0 && tex_aux_usage == ISL_AUX_USAGE_MCS) {
         /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface)
          * suggests an optimization:
          *
          *     "A simple optimization with probable large return in
          *     performance is to compare the MCS value to zero (indicating
          *     all samples are on sample slice 0), and sample only from
          *     sample slice 0 using ld2dss if MCS is zero."
          *
          * Note that in the case where the MCS value is zero, sampling from
          * sample slice 0 using ld2dss and sampling from sample 0 using
          * ld2dms are equivalent (since all samples are on sample slice 0).
          * Since we have already sampled from sample 0, all we need to do is
          * skip the remaining fetches and averaging if MCS is zero.
          */
         nir_ssa_def *mcs_zero =
            nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, 0));
         if (tex_samples == 16) {
            /* 16x MSAA uses two MCS dwords; both must be zero. */
            mcs_zero = nir_iand(b, mcs_zero,
               nir_ieq(b, nir_channel(b, mcs, 1), nir_imm_int(b, 0)));
         }

         /* Build the if statement by hand so we can keep a handle on it:
          * the fast path stores sample 0 directly, the else branch (where
          * the builder cursor is left) holds all remaining fetches/adds.
          */
         nir_if *if_stmt = nir_if_create(b->shader);
         if_stmt->condition = nir_src_for_ssa(mcs_zero);
         nir_cf_node_insert(b->cursor, &if_stmt->cf_node);

         b->cursor = nir_after_cf_list(&if_stmt->then_list);
         nir_store_var(b, color, texture_data[0], 0xf);

         b->cursor = nir_after_cf_list(&if_stmt->else_list);
         outer_if = if_stmt;
      }

      /* Collapse the stack: one add per trailing 1 bit of i (see above). */
      for (int j = 0; j < count_trailing_one_bits(i); j++) {
         assert(stack_depth >= 2);
         --stack_depth;

         assert(dst_type == nir_type_float);
         texture_data[stack_depth - 1] =
            nir_fadd(b, texture_data[stack_depth - 1],
                        texture_data[stack_depth]);
      }
   }

   /* We should have just 1 sample on the stack now. */
   assert(stack_depth == 1);

   /* Final division by the sample count completes the average. */
   texture_data[0] = nir_fmul(b, texture_data[0],
                              nir_imm_float(b, 1.0 / tex_samples));

   nir_store_var(b, color, texture_data[0], 0xf);

   /* Resume emitting after the whole if/else construct (if we made one). */
   if (outer_if)
      b->cursor = nir_after_cf_node(&outer_if->cf_node);

   return nir_load_var(b, color);
}
+
+static inline nir_ssa_def *
+nir_imm_vec2(nir_builder *build, float x, float y)
+{
+ nir_const_value v;
+
+ memset(&v, 0, sizeof(v));
+ v.f32[0] = x;
+ v.f32[1] = y;
+
+ return nir_build_imm(build, 4, 32, v);
+}
+
/* Blend samples for a scaled multisample blit by laying the samples out on
 * a rectangular grid, fetching the four nearest samples, and bilinearly
 * interpolating between them using the fractional grid coordinates.
 */
static nir_ssa_def *
blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos,
                                unsigned tex_samples,
                                const struct brw_blorp_blit_prog_key *key,
                                struct brw_blorp_blit_vars *v)
{
   nir_ssa_def *pos_xy = nir_channels(b, pos, 0x3);
   nir_ssa_def *rect_grid = nir_load_var(b, v->v_rect_grid);
   nir_ssa_def *scale = nir_imm_vec2(b, key->x_scale, key->y_scale);

   /* Translate coordinates to lay out the samples in a rectangular grid
    * roughly corresponding to sample locations.
    */
   pos_xy = nir_fmul(b, pos_xy, scale);
   /* Adjust coordinates so that integers represent pixel centers rather
    * than pixel edges.
    */
   pos_xy = nir_fadd(b, pos_xy, nir_imm_float(b, -0.5));
   /* Clamp the X, Y texture coordinates to properly handle the sampling of
    * texels on texture edges.
    */
   pos_xy = nir_fmin(b, nir_fmax(b, pos_xy, nir_imm_float(b, 0.0)),
                        nir_vec2(b, nir_channel(b, rect_grid, 0),
                                    nir_channel(b, rect_grid, 1)));

   /* Store the fractional parts to be used as bilinear interpolation
    * coefficients.
    */
   nir_ssa_def *frac_xy = nir_ffract(b, pos_xy);
   /* Round the float coordinates down to nearest integer */
   pos_xy = nir_fdiv(b, nir_ftrunc(b, pos_xy), scale);

   /* Fetch the four grid neighbors; i's low bit selects left/right, the
    * next bit selects top/bottom.
    */
   nir_ssa_def *tex_data[4];
   for (unsigned i = 0; i < 4; ++i) {
      float sample_off_x = (float)(i & 0x1) / key->x_scale;
      float sample_off_y = (float)((i >> 1) & 0x1) / key->y_scale;
      nir_ssa_def *sample_off = nir_imm_vec2(b, sample_off_x, sample_off_y);

      nir_ssa_def *sample_coords = nir_fadd(b, pos_xy, sample_off);
      nir_ssa_def *sample_coords_int = nir_f2i(b, sample_coords);

      /* The MCS value we fetch has to match up with the pixel that we're
       * sampling from. Since we sample from different pixels in each
       * iteration of this "for" loop, the call to mcs_fetch() should be
       * here inside the loop after computing the pixel coordinates.
       */
      nir_ssa_def *mcs = NULL;
      if (key->tex_aux_usage == ISL_AUX_USAGE_MCS)
         mcs = blorp_nir_txf_ms_mcs(b, v, sample_coords_int);

      /* Compute sample index and map the sample index to a sample number.
       * Sample index layout shows the numbering of slots in a rectangular
       * grid of samples within a pixel. Sample number layout shows the
       * rectangular grid of samples roughly corresponding to the real sample
       * locations within a pixel.
       * In case of 4x MSAA, layout of sample indices matches the layout of
       * sample numbers:
       *           ---------
       *           | 0 | 1 |
       *           ---------
       *           | 2 | 3 |
       *           ---------
       *
       * In case of 8x MSAA the two layouts don't match.
       * sample index layout :  ---------    sample number layout :  ---------
       *                        | 0 | 1 |                            | 3 | 7 |
       *                        ---------                            ---------
       *                        | 2 | 3 |                            | 5 | 0 |
       *                        ---------                            ---------
       *                        | 4 | 5 |                            | 1 | 2 |
       *                        ---------                            ---------
       *                        | 6 | 7 |                            | 4 | 6 |
       *                        ---------                            ---------
       *
       * Fortunately, this can be done fairly easily as:
       * S' = (0x64210573 >> (S * 4)) & 0xf
       *
       * In the case of 16x MSAA the two layouts don't match.
       * Sample index layout:               Sample number layout:
       * ---------------------              ---------------------
       * |  0 |  1 |  2 |  3 |              | 15 | 10 |  9 |  7 |
       * ---------------------              ---------------------
       * |  4 |  5 |  6 |  7 |              |  4 |  1 |  3 | 13 |
       * ---------------------              ---------------------
       * |  8 |  9 | 10 | 11 |              | 12 |  2 |  0 |  6 |
       * ---------------------              ---------------------
       * | 12 | 13 | 14 | 15 |              | 11 |  8 |  5 | 14 |
       * ---------------------              ---------------------
       *
       * This is equivalent to
       * S' = (0xe58b602cd31479af >> (S * 4)) & 0xf
       */
      nir_ssa_def *frac = nir_ffract(b, sample_coords);
      nir_ssa_def *sample =
         nir_fdot2(b, frac, nir_imm_vec2(b, key->x_scale,
                                         key->x_scale * key->y_scale));
      sample = nir_f2i(b, sample);

      if (tex_samples == 8) {
         sample = nir_iand(b, nir_ishr(b, nir_imm_int(b, 0x64210573),
                                       nir_ishl(b, sample, nir_imm_int(b, 2))),
                           nir_imm_int(b, 0xf));
      } else if (tex_samples == 16) {
         /* The 64-bit lookup table is split into two 32-bit halves,
          * selected by whether the index is below 8.
          */
         nir_ssa_def *sample_low =
            nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xd31479af),
                                 nir_ishl(b, sample, nir_imm_int(b, 2))),
                     nir_imm_int(b, 0xf));
         nir_ssa_def *sample_high =
            nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xe58b602c),
                                 nir_ishl(b, nir_iadd(b, sample,
                                                      nir_imm_int(b, -8)),
                                          nir_imm_int(b, 2))),
                     nir_imm_int(b, 0xf));

         sample = nir_bcsel(b, nir_ilt(b, sample, nir_imm_int(b, 8)),
                            sample_low, sample_high);
      }
      nir_ssa_def *pos_ms = nir_vec3(b, nir_channel(b, sample_coords_int, 0),
                                     nir_channel(b, sample_coords_int, 1),
                                     sample);
      tex_data[i] = blorp_nir_txf_ms(b, v, pos_ms, mcs, key->texture_data_type);
   }

   /* Standard bilinear filter: lerp horizontally twice, then vertically. */
   nir_ssa_def *frac_x = nir_channel(b, frac_xy, 0);
   nir_ssa_def *frac_y = nir_channel(b, frac_xy, 1);
   return nir_flrp(b, nir_flrp(b, tex_data[0], tex_data[1], frac_x),
                   nir_flrp(b, tex_data[2], tex_data[3], frac_x),
                   frac_y);
}
+
+/**
+ * Generator for WM programs used in BLORP blits.
+ *
+ * The bulk of the work done by the WM program is to wrap and unwrap the
+ * coordinate transformations used by the hardware to store surfaces in
+ * memory. The hardware transforms a pixel location (X, Y, S) (where S is the
+ * sample index for a multisampled surface) to a memory offset by the
+ * following formulas:
+ *
+ * offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S))
+ * (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset))
+ *
+ * For a single-sampled surface, or for a multisampled surface using
+ * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity
+ * function:
+ *
+ * encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
+ * decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
+ * encode_msaa(n, UMS, X, Y, S) = (X, Y, S)
+ * decode_msaa(n, UMS, X, Y, S) = (X, Y, S)
+ *
+ * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
+ * embeds the sample number into bit 1 of the X and Y coordinates:
+ *
+ * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
+ * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
+ * Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1)
+ * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
+ * where X' = (X & ~0b11) >> 1 | (X & 0b1)
+ * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
+ * S = (Y & 0b10) | (X & 0b10) >> 1
+ *
+ * For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
+ * embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of
+ * the Y coordinate:
+ *
+ * encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
+ * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 | (X & 0b1)
+ * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
+ * decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
+ * where X' = (X & ~0b111) >> 2 | (X & 0b1)
+ * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
+ * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
+ *
+ * For X tiling, tile() combines together the low-order bits of the X and Y
+ * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
+ * bytes wide and 8 rows high:
+ *
+ * tile(x_tiled, X, Y, S) = A
+ * where A = tile_num << 12 | offset
+ * tile_num = (Y' >> 3) * tile_pitch + (X' >> 9)
+ * offset = (Y' & 0b111) << 9
+ * | (X & 0b111111111)
+ * X' = X * cpp
+ * Y' = Y + S * qpitch
+ * detile(x_tiled, A) = (X, Y, S)
+ * where X = X' / cpp
+ * Y = Y' % qpitch
+ * S = Y' / qpitch
+ * Y' = (tile_num / tile_pitch) << 3
+ * | (A & 0b111000000000) >> 9
+ * X' = (tile_num % tile_pitch) << 9
+ * | (A & 0b111111111)
+ *
+ * (In all tiling formulas, cpp is the number of bytes occupied by a single
+ * sample ("chars per pixel"), tile_pitch is the number of 4k tiles required
+ * to fill the width of the surface, and qpitch is the spacing (in rows)
+ * between array slices).
+ *
+ * For Y tiling, tile() combines together the low-order bits of the X and Y
+ * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128
+ * bytes wide and 32 rows high:
+ *
+ * tile(y_tiled, X, Y, S) = A
+ * where A = tile_num << 12 | offset
+ * tile_num = (Y' >> 5) * tile_pitch + (X' >> 7)
+ * offset = (X' & 0b1110000) << 5
+ * | (Y' & 0b11111) << 4
+ * | (X' & 0b1111)
+ * X' = X * cpp
+ * Y' = Y + S * qpitch
+ * detile(y_tiled, A) = (X, Y, S)
+ * where X = X' / cpp
+ * Y = Y' % qpitch
+ * S = Y' / qpitch
+ * Y' = (tile_num / tile_pitch) << 5
+ * | (A & 0b111110000) >> 4
+ * X' = (tile_num % tile_pitch) << 7
+ * | (A & 0b111000000000) >> 5
+ * | (A & 0b1111)
+ *
+ * For W tiling, tile() combines together the low-order bits of the X and Y
+ * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64
+ * bytes wide and 64 rows high (note that W tiling is only used for stencil
+ * buffers, which always have cpp = 1 and S=0):
+ *
+ * tile(w_tiled, X, Y, S) = A
+ * where A = tile_num << 12 | offset
+ * tile_num = (Y' >> 6) * tile_pitch + (X' >> 6)
+ * offset = (X' & 0b111000) << 6
+ * | (Y' & 0b111100) << 3
+ * | (X' & 0b100) << 2
+ * | (Y' & 0b10) << 2
+ * | (X' & 0b10) << 1
+ * | (Y' & 0b1) << 1
+ * | (X' & 0b1)
+ * X' = X * cpp = X
+ * Y' = Y + S * qpitch
+ * detile(w_tiled, A) = (X, Y, S)
+ * where X = X' / cpp = X'
+ * Y = Y' % qpitch = Y'
+ * S = Y / qpitch = 0
+ * Y' = (tile_num / tile_pitch) << 6
+ * | (A & 0b111100000) >> 3
+ * | (A & 0b1000) >> 2
+ * | (A & 0b10) >> 1
+ * X' = (tile_num % tile_pitch) << 6
+ * | (A & 0b111000000000) >> 6
+ * | (A & 0b10000) >> 2
+ * | (A & 0b100) >> 1
+ * | (A & 0b1)
+ *
+ * Finally, for a non-tiled surface, tile() simply combines together the X and
+ * Y coordinates in the natural way:
+ *
+ * tile(untiled, X, Y, S) = A
+ * where A = Y * pitch + X'
+ * X' = X * cpp
+ * Y' = Y + S * qpitch
+ * detile(untiled, A) = (X, Y, S)
+ * where X = X' / cpp
+ * Y = Y' % qpitch
+ * S = Y' / qpitch
+ * X' = A % pitch
+ * Y' = A / pitch
+ *
+ * (In these formulas, pitch is the number of bytes occupied by a single row
+ * of samples).
+ */
/* Build the fragment shader for a blit described by `key`.  See the large
 * comment above for the coordinate-transform formalism this implements.
 */
static nir_shader *
brw_blorp_build_nir_shader(struct blorp_context *blorp,
                           const struct brw_blorp_blit_prog_key *key)
{
   const struct brw_device_info *devinfo = blorp->isl_dev->info;
   nir_ssa_def *src_pos, *dst_pos, *color;

   /* Sanity checks */
   if (key->dst_tiled_w && key->rt_samples > 1) {
      /* If the destination image is W tiled and multisampled, then the thread
       * must be dispatched once per sample, not once per pixel.  This is
       * necessary because after conversion between W and Y tiling, there's no
       * guarantee that all samples corresponding to a single pixel will still
       * be together.
       */
      assert(key->persample_msaa_dispatch);
   }

   if (key->blend) {
      /* We are blending, which means we won't have an opportunity to
       * translate the tiling and sample count for the texture surface.  So
       * the surface state for the texture must be configured with the correct
       * tiling and sample count.
       */
      assert(!key->src_tiled_w);
      assert(key->tex_samples == key->src_samples);
      assert(key->tex_layout == key->src_layout);
      assert(key->tex_samples > 0);
   }

   if (key->persample_msaa_dispatch) {
      /* It only makes sense to do persample dispatch if the render target is
       * configured as multisampled.
       */
      assert(key->rt_samples > 0);
   }

   /* Make sure layout is consistent with sample count */
   assert((key->tex_layout == ISL_MSAA_LAYOUT_NONE) ==
          (key->tex_samples <= 1));
   assert((key->rt_layout == ISL_MSAA_LAYOUT_NONE) ==
          (key->rt_samples <= 1));
   assert((key->src_layout == ISL_MSAA_LAYOUT_NONE) ==
          (key->src_samples <= 1));
   assert((key->dst_layout == ISL_MSAA_LAYOUT_NONE) ==
          (key->dst_samples <= 1));

   nir_builder b;
   nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);

   struct brw_blorp_blit_vars v;
   brw_blorp_blit_vars_init(&b, &v, key);

   dst_pos = blorp_blit_get_frag_coords(&b, key, &v);

   /* Render target and texture hardware don't support W tiling until Gen8. */
   const bool rt_tiled_w = false;
   const bool tex_tiled_w = devinfo->gen >= 8 && key->src_tiled_w;

   /* The address that data will be written to is determined by the
    * coordinates supplied to the WM thread and the tiling and sample count of
    * the render target, according to the formula:
    *
    * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset))
    *
    * If the actual tiling and sample count of the destination surface are not
    * the same as the configuration of the render target, then these
    * coordinates are wrong and we have to adjust them to compensate for the
    * difference.
    */
   if (rt_tiled_w != key->dst_tiled_w ||
       key->rt_samples != key->dst_samples ||
       key->rt_layout != key->dst_layout) {
      dst_pos = blorp_nir_encode_msaa(&b, dst_pos, key->rt_samples,
                                      key->rt_layout);
      /* Now (X, Y, S) = detile(rt_tiling, offset) */
      if (rt_tiled_w != key->dst_tiled_w)
         dst_pos = blorp_nir_retile_y_to_w(&b, dst_pos);
      /* Now (X, Y, S) = detile(dst_tiling, offset) */
      dst_pos = blorp_nir_decode_msaa(&b, dst_pos, key->dst_samples,
                                      key->dst_layout);
   }

   /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).
    *
    * That is: X, Y and S now contain the true coordinates and sample index of
    * the data that the WM thread should output.
    *
    * If we need to kill pixels that are outside the destination rectangle,
    * now is the time to do it.
    */
   if (key->use_kill) {
      assert(!(key->blend && key->blit_scaled));
      blorp_nir_discard_if_outside_rect(&b, dst_pos, &v);
   }

   src_pos = blorp_blit_apply_transform(&b, nir_i2f(&b, dst_pos), &v);
   if (dst_pos->num_components == 3) {
      /* The sample coordinate is an integer that we want left alone but
       * blorp_blit_apply_transform() blindly applies the transform to all
       * three coordinates.  Grab the original sample index.
       */
      src_pos = nir_vec3(&b, nir_channel(&b, src_pos, 0),
                             nir_channel(&b, src_pos, 1),
                             nir_channel(&b, dst_pos, 2));
   }

   /* If the source image is not multisampled, then we want to fetch sample
    * number 0, because that's the only sample there is.
    */
   if (key->src_samples == 1)
      src_pos = nir_channels(&b, src_pos, 0x3);

   /* X, Y, and S are now the coordinates of the pixel in the source image
    * that we want to texture from.  Exception: if we are blending, then S is
    * irrelevant, because we are going to fetch all samples.
    */
   if (key->blend && !key->blit_scaled) {
      /* Resolves (effectively) use texelFetch, so we need integers and we
       * don't care about the sample index if we got one.
       */
      src_pos = nir_f2i(&b, nir_channels(&b, src_pos, 0x3));

      if (devinfo->gen == 6) {
         /* Because gen6 only supports 4x interleaved MSAA, we can do all the
          * blending we need with a single linear-interpolated texture lookup
          * at the center of the sample.  The texture coordinates need to be
          * odd integers so that they correspond to the center of a 2x2 block
          * representing the four samples that make up a pixel.  So we need
          * to multiply our X and Y coordinates each by 2 and then add 1.
          */
         src_pos = nir_ishl(&b, src_pos, nir_imm_int(&b, 1));
         src_pos = nir_iadd(&b, src_pos, nir_imm_int(&b, 1));
         src_pos = nir_i2f(&b, src_pos);
         color = blorp_nir_tex(&b, &v, src_pos, key->texture_data_type);
      } else {
         /* Gen7+ hardware doesn't automatically blend. */
         color = blorp_nir_manual_blend_average(&b, &v, src_pos, key->src_samples,
                                                key->tex_aux_usage,
                                                key->texture_data_type);
      }
   } else if (key->blend && key->blit_scaled) {
      assert(!key->use_kill);
      color = blorp_nir_manual_blend_bilinear(&b, src_pos, key->src_samples, key, &v);
   } else {
      if (key->bilinear_filter) {
         color = blorp_nir_tex(&b, &v, src_pos, key->texture_data_type);
      } else {
         /* We're going to use texelFetch, so we need integers */
         if (src_pos->num_components == 2) {
            src_pos = nir_f2i(&b, src_pos);
         } else {
            assert(src_pos->num_components == 3);
            /* Only convert X and Y; the sample index is already an int. */
            src_pos = nir_vec3(&b, nir_channel(&b, nir_f2i(&b, src_pos), 0),
                                   nir_channel(&b, nir_f2i(&b, src_pos), 1),
                                   nir_channel(&b, src_pos, 2));
         }

         /* We aren't blending, which means we just want to fetch a single
          * sample from the source surface.  The address that we want to fetch
          * from is related to the X, Y and S values according to the formula:
          *
          * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
          *
          * If the actual tiling and sample count of the source surface are
          * not the same as the configuration of the texture, then we need to
          * adjust the coordinates to compensate for the difference.
          */
         if (tex_tiled_w != key->src_tiled_w ||
             key->tex_samples != key->src_samples ||
             key->tex_layout != key->src_layout) {
            src_pos = blorp_nir_encode_msaa(&b, src_pos, key->src_samples,
                                            key->src_layout);
            /* Now (X, Y, S) = detile(src_tiling, offset) */
            if (tex_tiled_w != key->src_tiled_w)
               src_pos = blorp_nir_retile_w_to_y(&b, src_pos);
            /* Now (X, Y, S) = detile(tex_tiling, offset) */
            src_pos = blorp_nir_decode_msaa(&b, src_pos, key->tex_samples,
                                            key->tex_layout);
         }

         /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
          *
          * In other words: X, Y, and S now contain values which, when passed to
          * the texturing unit, will cause data to be read from the correct
          * memory location.  So we can fetch the texel now.
          */
         if (key->src_samples == 1) {
            color = blorp_nir_txf(&b, &v, src_pos, key->texture_data_type);
         } else {
            nir_ssa_def *mcs = NULL;
            if (key->tex_aux_usage == ISL_AUX_USAGE_MCS)
               mcs = blorp_nir_txf_ms_mcs(&b, &v, src_pos);

            color = blorp_nir_txf_ms(&b, &v, src_pos, mcs, key->texture_data_type);
         }
      }
   }

   nir_store_var(&b, v.color_out, color, 0xf);

   return b.shader;
}
+
+static void
+brw_blorp_get_blit_kernel(struct blorp_context *blorp,
+ struct blorp_params *params,
+ const struct brw_blorp_blit_prog_key *prog_key)
+{
+ if (blorp->lookup_shader(blorp, prog_key, sizeof(*prog_key),
+ &params->wm_prog_kernel, &params->wm_prog_data))
+ return;
+
+ const unsigned *program;
+ unsigned program_size;
+ struct brw_blorp_prog_data prog_data;
+
+ /* Try and compile with NIR first. If that fails, fall back to the old
+ * method of building shaders manually.
+ */
+ nir_shader *nir = brw_blorp_build_nir_shader(blorp, prog_key);
+ struct brw_wm_prog_key wm_key;
+ brw_blorp_init_wm_prog_key(&wm_key);
+ wm_key.tex.compressed_multisample_layout_mask =
+ prog_key->tex_aux_usage == ISL_AUX_USAGE_MCS;
+ wm_key.tex.msaa_16 = prog_key->tex_samples == 16;
+ wm_key.multisample_fbo = prog_key->rt_samples > 1;
+
+ program = brw_blorp_compile_nir_shader(blorp, nir, &wm_key, false,
+ &prog_data, &program_size);
+
+ blorp->upload_shader(blorp, prog_key, sizeof(*prog_key),
+ program, program_size,
+ &prog_data, sizeof(prog_data),
+ &params->wm_prog_kernel, &params->wm_prog_data);
+}
+
+static void
+brw_blorp_setup_coord_transform(struct brw_blorp_coord_transform *xform,
+ GLfloat src0, GLfloat src1,
+ GLfloat dst0, GLfloat dst1,
+ bool mirror)
+{
+ float scale = (src1 - src0) / (dst1 - dst0);
+ if (!mirror) {
+ /* When not mirroring a coordinate (say, X), we need:
+ * src_x - src_x0 = (dst_x - dst_x0 + 0.5) * scale
+ * Therefore:
+ * src_x = src_x0 + (dst_x - dst_x0 + 0.5) * scale
+ *
+ * blorp program uses "round toward zero" to convert the
+ * transformed floating point coordinates to integer coordinates,
+ * whereas the behaviour we actually want is "round to nearest",
+ * so 0.5 provides the necessary correction.
+ */
+ xform->multiplier = scale;
+ xform->offset = src0 + (-dst0 + 0.5f) * scale;
+ } else {
+ /* When mirroring X we need:
+ * src_x - src_x0 = dst_x1 - dst_x - 0.5
+ * Therefore:
+ * src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale
+ */
+ xform->multiplier = -scale;
+ xform->offset = src0 + (dst1 - 0.5f) * scale;
+ }
+}
+
+/**
+ * Convert an swizzle enumeration (i.e. SWIZZLE_X) to one of the Gen7.5+
+ * "Shader Channel Select" enumerations (i.e. HSW_SCS_RED). The mappings are
+ *
+ * SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_ZERO, SWIZZLE_ONE
+ * 0 1 2 3 4 5
+ * 4 5 6 7 0 1
+ * SCS_RED, SCS_GREEN, SCS_BLUE, SCS_ALPHA, SCS_ZERO, SCS_ONE
+ *
+ * which is simply adding 4 then modding by 8 (or anding with 7).
+ *
+ * We then may need to apply workarounds for textureGather hardware bugs.
+ */
+static enum isl_channel_select
+swizzle_to_scs(GLenum swizzle)
+{
+ return (enum isl_channel_select)((swizzle + 4) & 7);
+}
+
+static void
+surf_convert_to_single_slice(const struct isl_device *isl_dev,
+ struct brw_blorp_surface_info *info)
+{
+ /* This only makes sense for a single level and array slice */
+ assert(info->view.levels == 1 && info->view.array_len == 1);
+
+ /* Just bail if we have nothing to do. */
+ if (info->surf.dim == ISL_SURF_DIM_2D &&
+ info->view.base_level == 0 && info->view.base_array_layer == 0 &&
+ info->surf.levels == 0 && info->surf.logical_level0_px.array_len == 0)
+ return;
+
+ uint32_t x_offset_sa, y_offset_sa;
+ isl_surf_get_image_offset_sa(&info->surf, info->view.base_level,
+ info->view.base_array_layer, 0,
+ &x_offset_sa, &y_offset_sa);
+
+ uint32_t byte_offset;
+ isl_tiling_get_intratile_offset_sa(isl_dev, info->surf.tiling,
+ info->view.format, info->surf.row_pitch,
+ x_offset_sa, y_offset_sa,
+ &byte_offset,
+ &info->tile_x_sa, &info->tile_y_sa);
+ info->addr.offset += byte_offset;
+
+ /* TODO: Once this file gets converted to C, we shouls just use designated
+ * initializers.
+ */
+ struct isl_surf_init_info init_info = { 0, };
+
+ init_info.dim = ISL_SURF_DIM_2D;
+ init_info.format = ISL_FORMAT_R8_UINT;
+ init_info.width =
+ minify(info->surf.logical_level0_px.width, info->view.base_level);
+ init_info.height =
+ minify(info->surf.logical_level0_px.height, info->view.base_level);
+ init_info.depth = 1;
+ init_info.levels = 1;
+ init_info.array_len = 1;
+ init_info.samples = info->surf.samples;
+ init_info.min_pitch = info->surf.row_pitch;
+ init_info.usage = info->surf.usage;
+ init_info.tiling_flags = 1 << info->surf.tiling;
+
+ isl_surf_init_s(isl_dev, &info->surf, &init_info);
+ assert(info->surf.row_pitch == init_info.min_pitch);
+
+ /* The view is also different now. */
+ info->view.base_level = 0;
+ info->view.levels = 1;
+ info->view.base_array_layer = 0;
+ info->view.array_len = 1;
+}
+
+static void
+surf_fake_interleaved_msaa(const struct isl_device *isl_dev,
+ struct brw_blorp_surface_info *info)
+{
+ assert(info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED);
+
+ /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */
+ surf_convert_to_single_slice(isl_dev, info);
+
+ info->surf.logical_level0_px = info->surf.phys_level0_sa;
+ info->surf.samples = 1;
+ info->surf.msaa_layout = ISL_MSAA_LAYOUT_NONE;
+}
+
/* Rewrite a W-tiled surface description as a Y-tiled one so that hardware
 * lacking W-tiled render/texture support can access it.  A W tile is twice
 * as wide and half as tall as the corresponding Y-tile footprint, hence the
 * *2 / /2 adjustments at the end.
 */
static void
surf_retile_w_to_y(const struct isl_device *isl_dev,
                   struct brw_blorp_surface_info *info)
{
   assert(info->surf.tiling == ISL_TILING_W);

   /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */
   surf_convert_to_single_slice(isl_dev, info);

   /* On gen7+, we don't have interleaved multisampling for color render
    * targets so we have to fake it.
    *
    * TODO: Are we sure we don't also need to fake it on gen6?
    */
   if (isl_dev->info->gen > 6 &&
       info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) {
      info->surf.logical_level0_px = info->surf.phys_level0_sa;
      info->surf.samples = 1;
      info->surf.msaa_layout = ISL_MSAA_LAYOUT_NONE;
   }

   if (isl_dev->info->gen == 6) {
      /* Gen6 stencil buffers have a very large alignment coming in from the
       * miptree.  It's out-of-bounds for what the surface state can handle.
       * Since we have a single layer and level, it doesn't really matter as
       * long as we don't pass a bogus value into isl_surf_fill_state().
       */
      info->surf.image_alignment_el = isl_extent3d(4, 2, 1);
   }

   /* Now that we've converted everything to a simple 2-D surface with only
    * one miplevel, we can go about retiling it.
    *
    * NOTE(review): `samples != 0` is always true for a valid surface
    * (samples >= 1), so y_align is effectively always 8; possibly this was
    * meant to be `samples != 1` — confirm before changing.
    */
   const unsigned x_align = 8, y_align = info->surf.samples != 0 ? 8 : 4;
   info->surf.tiling = ISL_TILING_Y0;
   info->surf.logical_level0_px.width =
      ALIGN(info->surf.logical_level0_px.width, x_align) * 2;
   info->surf.logical_level0_px.height =
      ALIGN(info->surf.logical_level0_px.height, y_align) / 2;
   info->tile_x_sa *= 2;
   info->tile_y_sa /= 2;
}
+
+/* Blit a rectangle from one miplevel/layer of src_surf to one
+ * miplevel/layer of dst_surf.
+ *
+ * The source rectangle (src_x0, src_y0)-(src_x1, src_y1) is mapped onto the
+ * destination rectangle (dst_x0, dst_y0)-(dst_x1, dst_y1); the rectangles
+ * may differ in size (scaled blit) and may be mirrored on either axis.
+ * filter is GL_NEAREST or GL_LINEAR.  Format conversion and channel
+ * swizzling are applied per src_format/dst_format/src_swizzle.  MSAA
+ * resolves, interleaved-MSAA destinations, and W-tiled (stencil) surfaces
+ * are handled by rewriting the surfaces and the rectangle below.
+ */
+void
+blorp_blit(struct blorp_batch *batch,
+           const struct blorp_surf *src_surf,
+           unsigned src_level, unsigned src_layer,
+           enum isl_format src_format, int src_swizzle,
+           const struct blorp_surf *dst_surf,
+           unsigned dst_level, unsigned dst_layer,
+           enum isl_format dst_format,
+           float src_x0, float src_y0,
+           float src_x1, float src_y1,
+           float dst_x0, float dst_y0,
+           float dst_x1, float dst_y1,
+           GLenum filter, bool mirror_x, bool mirror_y)
+{
+   const struct brw_device_info *devinfo = batch->blorp->isl_dev->info;
+
+   struct blorp_params params;
+   blorp_params_init(&params);
+
+   brw_blorp_surface_info_init(batch->blorp, &params.src, src_surf, src_level,
+                               src_layer, src_format, false);
+   brw_blorp_surface_info_init(batch->blorp, &params.dst, dst_surf, dst_level,
+                               dst_layer, dst_format, true);
+
+   struct brw_blorp_blit_prog_key wm_prog_key;
+   memset(&wm_prog_key, 0, sizeof(wm_prog_key));
+
+   /* Sample the source as int/uint/float to match its format class. */
+   if (isl_format_has_sint_channel(params.src.view.format)) {
+      wm_prog_key.texture_data_type = nir_type_int;
+   } else if (isl_format_has_uint_channel(params.src.view.format)) {
+      wm_prog_key.texture_data_type = nir_type_uint;
+   } else {
+      wm_prog_key.texture_data_type = nir_type_float;
+   }
+
+   /* Scaled blitting or not. */
+   wm_prog_key.blit_scaled =
+      (dst_x1 - dst_x0) != (src_x1 - src_x0) ||
+      (dst_y1 - dst_y0) != (src_y1 - src_y0);
+
+   /* Scaling factors used for bilinear filtering in multisample scaled
+    * blits.
+    */
+   if (params.src.surf.samples == 16)
+      wm_prog_key.x_scale = 4.0f;
+   else
+      wm_prog_key.x_scale = 2.0f;
+   wm_prog_key.y_scale = params.src.surf.samples / wm_prog_key.x_scale;
+
+   /* Bilinear filtering is only meaningful for single-sampled blits. */
+   if (filter == GL_LINEAR &&
+       params.src.surf.samples <= 1 && params.dst.surf.samples <= 1)
+      wm_prog_key.bilinear_filter = true;
+
+   if ((params.src.surf.usage & ISL_SURF_USAGE_DEPTH_BIT) == 0 &&
+       (params.src.surf.usage & ISL_SURF_USAGE_STENCIL_BIT) == 0 &&
+       !isl_format_has_int_channel(params.src.surf.format) &&
+       params.src.surf.samples > 1 && params.dst.surf.samples <= 1) {
+      /* We are downsampling a non-integer color buffer, so blend.
+       *
+       * Regarding integer color buffers, the OpenGL ES 3.2 spec says:
+       *
+       *     "If the source formats are integer types or stencil values, a
+       *     single sample's value is selected for each pixel."
+       *
+       * This implies we should not blend in that case.
+       */
+      wm_prog_key.blend = true;
+   }
+
+   /* src_samples and dst_samples are the true sample counts */
+   wm_prog_key.src_samples = params.src.surf.samples;
+   wm_prog_key.dst_samples = params.dst.surf.samples;
+
+   wm_prog_key.tex_aux_usage = params.src.aux_usage;
+
+   /* src_layout and dst_layout indicate the true MSAA layout used by src and
+    * dst.
+    */
+   wm_prog_key.src_layout = params.src.surf.msaa_layout;
+   wm_prog_key.dst_layout = params.dst.surf.msaa_layout;
+
+   /* Round floating point values to nearest integer to avoid "off by one
+    * texel" kind of errors when blitting.
+    */
+   params.x0 = params.wm_inputs.discard_rect.x0 = roundf(dst_x0);
+   params.y0 = params.wm_inputs.discard_rect.y0 = roundf(dst_y0);
+   params.x1 = params.wm_inputs.discard_rect.x1 = roundf(dst_x1);
+   params.y1 = params.wm_inputs.discard_rect.y1 = roundf(dst_y1);
+
+   params.wm_inputs.rect_grid.x1 =
+      minify(params.src.surf.logical_level0_px.width, src_level) *
+      wm_prog_key.x_scale - 1.0f;
+   params.wm_inputs.rect_grid.y1 =
+      minify(params.src.surf.logical_level0_px.height, src_level) *
+      wm_prog_key.y_scale - 1.0f;
+
+   brw_blorp_setup_coord_transform(&params.wm_inputs.coord_transform[0],
+                                   src_x0, src_x1, dst_x0, dst_x1, mirror_x);
+   brw_blorp_setup_coord_transform(&params.wm_inputs.coord_transform[1],
+                                   src_y0, src_y1, dst_y0, dst_y1, mirror_y);
+
+   /* For some texture types, we need to pass the layer through the sampler. */
+   params.wm_inputs.src_z = params.src.z_offset;
+
+   if (devinfo->gen > 6 &&
+       params.dst.surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) {
+      assert(params.dst.surf.samples > 1);
+
+      /* We must expand the rectangle we send through the rendering pipeline,
+       * to account for the fact that we are mapping the destination region as
+       * single-sampled when it is in fact multisampled.  We must also align
+       * it to a multiple of the multisampling pattern, because the
+       * differences between multisampled and single-sampled surface formats
+       * will mean that pixels are scrambled within the multisampling pattern.
+       * TODO: what if this makes the coordinates too large?
+       *
+       * Note: this only works if the destination surface uses the IMS layout.
+       * If it's UMS, then we have no choice but to set up the rendering
+       * pipeline as multisampled.
+       */
+      switch (params.dst.surf.samples) {
+      case 2:
+         params.x0 = ROUND_DOWN_TO(params.x0 * 2, 4);
+         params.y0 = ROUND_DOWN_TO(params.y0, 4);
+         params.x1 = ALIGN(params.x1 * 2, 4);
+         params.y1 = ALIGN(params.y1, 4);
+         break;
+      case 4:
+         params.x0 = ROUND_DOWN_TO(params.x0 * 2, 4);
+         params.y0 = ROUND_DOWN_TO(params.y0 * 2, 4);
+         params.x1 = ALIGN(params.x1 * 2, 4);
+         params.y1 = ALIGN(params.y1 * 2, 4);
+         break;
+      case 8:
+         params.x0 = ROUND_DOWN_TO(params.x0 * 4, 8);
+         params.y0 = ROUND_DOWN_TO(params.y0 * 2, 4);
+         params.x1 = ALIGN(params.x1 * 4, 8);
+         params.y1 = ALIGN(params.y1 * 2, 4);
+         break;
+      case 16:
+         params.x0 = ROUND_DOWN_TO(params.x0 * 4, 8);
+         params.y0 = ROUND_DOWN_TO(params.y0 * 4, 8);
+         params.x1 = ALIGN(params.x1 * 4, 8);
+         params.y1 = ALIGN(params.y1 * 4, 8);
+         break;
+      default:
+         unreachable("Unrecognized sample count in blorp_blit");
+      }
+
+      surf_fake_interleaved_msaa(batch->blorp->isl_dev, &params.dst);
+
+      wm_prog_key.use_kill = true;
+   }
+
+   if (params.dst.surf.tiling == ISL_TILING_W) {
+      /* We must modify the rectangle we send through the rendering pipeline
+       * (and the size and x/y offset of the destination surface), to account
+       * for the fact that we are mapping it as Y-tiled when it is in fact
+       * W-tiled.
+       *
+       * Both Y tiling and W tiling can be understood as organizations of
+       * 32-byte sub-tiles; within each 32-byte sub-tile, the layout of pixels
+       * is different, but the layout of the 32-byte sub-tiles within the 4k
+       * tile is the same (8 sub-tiles across by 16 sub-tiles down, in
+       * column-major order).  In Y tiling, the sub-tiles are 16 bytes wide
+       * and 2 rows high; in W tiling, they are 8 bytes wide and 4 rows high.
+       *
+       * Therefore, to account for the layout differences within the 32-byte
+       * sub-tiles, we must expand the rectangle so the X coordinates of its
+       * edges are multiples of 8 (the W sub-tile width), and its Y
+       * coordinates of its edges are multiples of 4 (the W sub-tile height).
+       * Then we need to scale the X and Y coordinates of the rectangle to
+       * account for the differences in aspect ratio between the Y and W
+       * sub-tiles.  We need to modify the layer width and height similarly.
+       *
+       * A correction needs to be applied when MSAA is in use: since
+       * INTEL_MSAA_LAYOUT_IMS uses an interleaving pattern whose height is 4,
+       * we need to align the Y coordinates to multiples of 8, so that when
+       * they are divided by two they are still multiples of 4.
+       *
+       * Note: Since the x/y offset of the surface will be applied using the
+       * SURFACE_STATE command packet, it will be invisible to the swizzling
+       * code in the shader; therefore it needs to be in a multiple of the
+       * 32-byte sub-tile size.  Fortunately it is, since the sub-tile is 8
+       * pixels wide and 4 pixels high (when viewed as a W-tiled stencil
+       * buffer), and the miplevel alignment used for stencil buffers is 8
+       * pixels horizontally and either 4 or 8 pixels vertically (see
+       * intel_horizontal_texture_alignment_unit() and
+       * intel_vertical_texture_alignment_unit()).
+       *
+       * Note: Also, since the SURFACE_STATE command packet can only apply
+       * offsets that are multiples of 4 pixels horizontally and 2 pixels
+       * vertically, it is important that the offsets will be multiples of
+       * these sizes after they are converted into Y-tiled coordinates.
+       * Fortunately they will be, since we know from above that the offsets
+       * are a multiple of the 32-byte sub-tile size, and in Y-tiled
+       * coordinates the sub-tile is 16 pixels wide and 2 pixels high.
+       *
+       * TODO: what if this makes the coordinates (or the texture size) too
+       * large?
+       */
+      /* isl guarantees surf.samples >= 1, so test for > 1 (true
+       * multisampling) when picking the Y alignment.  The old "!= 0" test
+       * (a leftover from i965 where num_samples == 0 meant single-sampled)
+       * was always true and over-aligned single-sampled blits.
+       */
+      const unsigned x_align = 8, y_align = params.dst.surf.samples > 1 ? 8 : 4;
+      params.x0 = ROUND_DOWN_TO(params.x0, x_align) * 2;
+      params.y0 = ROUND_DOWN_TO(params.y0, y_align) / 2;
+      params.x1 = ALIGN(params.x1, x_align) * 2;
+      params.y1 = ALIGN(params.y1, y_align) / 2;
+
+      /* Retile the surface to Y-tiled */
+      surf_retile_w_to_y(batch->blorp->isl_dev, &params.dst);
+
+      wm_prog_key.dst_tiled_w = true;
+      wm_prog_key.use_kill = true;
+
+      if (params.dst.surf.samples > 1) {
+         /* If the destination surface is a W-tiled multisampled stencil
+          * buffer that we're mapping as Y tiled, then we need to arrange for
+          * the WM program to run once per sample rather than once per pixel,
+          * because the memory layout of related samples doesn't match between
+          * W and Y tiling.
+          */
+         wm_prog_key.persample_msaa_dispatch = true;
+      }
+   }
+
+   if (devinfo->gen < 8 && params.src.surf.tiling == ISL_TILING_W) {
+      /* On Haswell and earlier, we have to fake W-tiled sources as Y-tiled.
+       * Broadwell adds support for sampling from stencil.
+       *
+       * See the comments above concerning x/y offset alignment for the
+       * destination surface.
+       *
+       * TODO: what if this makes the texture size too large?
+       */
+      surf_retile_w_to_y(batch->blorp->isl_dev, &params.src);
+
+      wm_prog_key.src_tiled_w = true;
+   }
+
+   /* tex_samples and rt_samples are the sample counts that are set up in
+    * SURFACE_STATE.
+    */
+   wm_prog_key.tex_samples = params.src.surf.samples;
+   wm_prog_key.rt_samples  = params.dst.surf.samples;
+
+   /* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will
+    * use to access the source and destination surfaces.
+    */
+   wm_prog_key.tex_layout = params.src.surf.msaa_layout;
+   wm_prog_key.rt_layout = params.dst.surf.msaa_layout;
+
+   /* isl guarantees surf.samples >= 1, so require > 1 on both ends: only a
+    * true multisampled-to-multisampled blit must preserve per-sample data.
+    * (The old "src samples > 0" test was vacuously true.)
+    */
+   if (params.src.surf.samples > 1 && params.dst.surf.samples > 1) {
+      /* We are blitting from a multisample buffer to a multisample buffer, so
+       * we must preserve samples within a pixel.  This means we have to
+       * arrange for the WM program to run once per sample rather than once
+       * per pixel.
+       */
+      wm_prog_key.persample_msaa_dispatch = true;
+   }
+
+   brw_blorp_get_blit_kernel(batch->blorp, &params, &wm_prog_key);
+
+   /* Apply the caller's channel swizzle to the texture view. */
+   for (unsigned i = 0; i < 4; i++) {
+      params.src.view.channel_select[i] =
+         swizzle_to_scs(GET_SWZ(src_swizzle, i));
+   }
+
+   batch->blorp->exec(batch, &params);
+}
diff --git a/src/intel/blorp/blorp_clear.c b/src/intel/blorp/blorp_clear.c
new file mode 100644
index 00000000000..a371dfd31ef
--- /dev/null
+++ b/src/intel/blorp/blorp_clear.c
@@ -0,0 +1,344 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/ralloc.h"
+
+#include "blorp_priv.h"
+#include "brw_defines.h"
+
+#include "compiler/nir/nir_builder.h"
+
+#define FILE_DEBUG_FLAG DEBUG_BLORP
+
+/* Cache key for the constant-color clear shader.  The whole struct is
+ * hashed by lookup_shader()/upload_shader(), so it must contain no
+ * uninitialized bytes.
+ */
+struct brw_blorp_const_color_prog_key
+{
+   /* Compile the shader to use the SIMD16 replicated-data FB-write
+    * message (see blorp_params_get_clear_kernel()).
+    */
+   bool use_simd16_replicated_data;
+   /* Explicit padding so the key size is a multiple of 4 bytes and the
+    * padding is zeroed by the memset in the caller.
+    */
+   bool pad[3];
+};
+
+/* Look up (or compile and upload) the constant-color clear fragment shader
+ * and store it in params->wm_prog_kernel / params->wm_prog_data.
+ *
+ * The shader simply copies the flat "v_color" input to gl_FragColor.  When
+ * use_replicated_data is set it is compiled to use the SIMD16
+ * replicated-data FB-write message.
+ */
+static void
+blorp_params_get_clear_kernel(struct blorp_context *blorp,
+                              struct blorp_params *params,
+                              bool use_replicated_data)
+{
+   struct brw_blorp_const_color_prog_key blorp_key;
+   memset(&blorp_key, 0, sizeof(blorp_key));
+   blorp_key.use_simd16_replicated_data = use_replicated_data;
+
+   /* Fast path: the program is already in the driver's shader cache. */
+   if (blorp->lookup_shader(blorp, &blorp_key, sizeof(blorp_key),
+                            &params->wm_prog_kernel, &params->wm_prog_data))
+      return;
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   nir_builder b;
+   /* Parent the NIR shader to mem_ctx so the ralloc_free() below actually
+    * releases it; previously the shader was parented to NULL and leaked.
+    */
+   nir_builder_init_simple_shader(&b, mem_ctx, MESA_SHADER_FRAGMENT, NULL);
+   b.shader->info.name = ralloc_strdup(b.shader, "BLORP-clear");
+
+   /* The clear color arrives as a flat vec4 varying... */
+   nir_variable *v_color = nir_variable_create(b.shader, nir_var_shader_in,
+                                               glsl_vec4_type(), "v_color");
+   v_color->data.location = VARYING_SLOT_VAR0;
+   v_color->data.interpolation = INTERP_MODE_FLAT;
+
+   /* ...and is written unmodified to the render target. */
+   nir_variable *frag_color = nir_variable_create(b.shader, nir_var_shader_out,
+                                                  glsl_vec4_type(),
+                                                  "gl_FragColor");
+   frag_color->data.location = FRAG_RESULT_COLOR;
+
+   nir_copy_var(&b, frag_color, v_color);
+
+   struct brw_wm_prog_key wm_key;
+   brw_blorp_init_wm_prog_key(&wm_key);
+
+   struct brw_blorp_prog_data prog_data;
+   unsigned program_size;
+   const unsigned *program =
+      brw_blorp_compile_nir_shader(blorp, b.shader, &wm_key, use_replicated_data,
+                                   &prog_data, &program_size);
+
+   /* upload_shader copies the program into driver-owned storage, so it is
+    * safe to free our compile-time allocations afterwards.
+    */
+   blorp->upload_shader(blorp, &blorp_key, sizeof(blorp_key),
+                        program, program_size,
+                        &prog_data, sizeof(prog_data),
+                        &params->wm_prog_kernel, &params->wm_prog_data);
+
+   ralloc_free(mem_ctx);
+}
+
+/* The x0, y0, x1, and y1 parameters must already be populated with the render
+ * area of the framebuffer to be cleared.
+ *
+ * On return, the rectangle has been rounded out to the hardware's alignment
+ * requirements and converted into the scaled-down coordinate space that the
+ * MCS/CCS fast-clear hardware expects.
+ */
+static void
+get_fast_clear_rect(const struct isl_device *dev,
+                    const struct isl_surf *aux_surf,
+                    unsigned *x0, unsigned *y0,
+                    unsigned *x1, unsigned *y1)
+{
+   unsigned int x_align, y_align;
+   unsigned int x_scaledown, y_scaledown;
+
+   /* Only single sampled surfaces need to (and actually can) be resolved. */
+   if (aux_surf->usage == ISL_SURF_USAGE_CCS_BIT) {
+      /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
+       * Target(s)", beneath the "Fast Color Clear" bullet (p327):
+       *
+       *     Clear pass must have a clear rectangle that must follow
+       *     alignment rules in terms of pixels and lines as shown in the
+       *     table below.  Further, the clear-rectangle height and width
+       *     must be multiple of the following dimensions.  If the height
+       *     and width of the render target being cleared do not meet these
+       *     requirements, an MCS buffer can be created such that it
+       *     follows the requirement and covers the RT.
+       *
+       * The alignment size in the table that follows is related to the
+       * alignment size that is baked into the CCS surface format but with X
+       * alignment multiplied by 16 and Y alignment multiplied by 32.
+       */
+      x_align = isl_format_get_layout(aux_surf->format)->bw;
+      y_align = isl_format_get_layout(aux_surf->format)->bh;
+
+      x_align *= 16;
+
+      /* SKL+ line alignment requirement for Y-tiled are half those of the prior
+       * generations.
+       */
+      if (dev->info->gen >= 9)
+         y_align *= 16;
+      else
+         y_align *= 32;
+
+      /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
+       * Target(s)", beneath the "Fast Color Clear" bullet (p327):
+       *
+       *     In order to optimize the performance MCS buffer (when bound to
+       *     1X RT) clear similarly to MCS buffer clear for MSRT case,
+       *     clear rect is required to be scaled by the following factors
+       *     in the horizontal and vertical directions:
+       *
+       * The X and Y scale down factors in the table that follows are each
+       * equal to half the alignment value computed above.
+       */
+      x_scaledown = x_align / 2;
+      y_scaledown = y_align / 2;
+
+      /* From BSpec: 3D-Media-GPGPU Engine > 3D Pipeline > Pixel > Pixel
+       * Backend > MCS Buffer for Render Target(s) [DevIVB+] > Table "Color
+       * Clear of Non-MultiSampled Render Target Restrictions":
+       *
+       *     Clear rectangle must be aligned to two times the number of
+       *     pixels in the table shown below due to 16x16 hashing across the
+       *     slice.
+       */
+      x_align *= 2;
+      y_align *= 2;
+   } else {
+      assert(aux_surf->usage == ISL_SURF_USAGE_MCS_BIT);
+
+      /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
+       * Target(s)", beneath the "MSAA Compression" bullet (p326):
+       *
+       *     Clear pass for this case requires that scaled down primitive
+       *     is sent down with upper left co-ordinate to coincide with
+       *     actual rectangle being cleared.  For MSAA, clear rectangle’s
+       *     height and width need to as show in the following table in
+       *     terms of (width,height) of the RT.
+       *
+       *     MSAA  Width of Clear Rect  Height of Clear Rect
+       *      2X     Ceil(1/8*width)      Ceil(1/2*height)
+       *      4X     Ceil(1/8*width)      Ceil(1/2*height)
+       *      8X     Ceil(1/2*width)      Ceil(1/2*height)
+       *     16X         width            Ceil(1/2*height)
+       *
+       * The text "with upper left co-ordinate to coincide with actual
+       * rectangle being cleared" is a little confusing--it seems to imply
+       * that to clear a rectangle from (x,y) to (x+w,y+h), one needs to
+       * feed the pipeline using the rectangle (x,y) to
+       * (x+Ceil(w/N),y+Ceil(h/2)), where N is either 2 or 8 depending on
+       * the number of samples.  Experiments indicate that this is not
+       * quite correct; actually, what the hardware appears to do is to
+       * align whatever rectangle is sent down the pipeline to the nearest
+       * multiple of 2x2 blocks, and then scale it up by a factor of N
+       * horizontally and 2 vertically.  So the resulting alignment is 4
+       * vertically and either 4 or 16 horizontally, and the scaledown
+       * factor is 2 vertically and either 2 or 8 horizontally.
+       */
+      switch (aux_surf->format) {
+      case ISL_FORMAT_MCS_2X:
+      case ISL_FORMAT_MCS_4X:
+         x_scaledown = 8;
+         break;
+      case ISL_FORMAT_MCS_8X:
+         x_scaledown = 2;
+         break;
+      case ISL_FORMAT_MCS_16X:
+         x_scaledown = 1;
+         break;
+      default:
+         unreachable("Unexpected MCS format for fast clear");
+      }
+      y_scaledown = 2;
+      /* Per the experiment described above, the alignment is twice the
+       * scaledown factor on each axis.
+       */
+      x_align = x_scaledown * 2;
+      y_align = y_scaledown * 2;
+   }
+
+   /* Round the rectangle out to the alignment, then convert it into the
+    * scaled-down space the clear hardware consumes.
+    */
+   *x0 = ROUND_DOWN_TO(*x0, x_align) / x_scaledown;
+   *y0 = ROUND_DOWN_TO(*y0, y_align) / y_scaledown;
+   *x1 = ALIGN(*x1, x_align) / x_scaledown;
+   *y1 = ALIGN(*y1, y_align) / y_scaledown;
+}
+
+/* Fast-clear one miplevel/layer of an MCS- or CCS-backed surface.  The
+ * rectangle (x0, y0)-(x1, y1) is the render area to clear, in pixels.
+ */
+void
+blorp_fast_clear(struct blorp_batch *batch,
+                 const struct blorp_surf *surf,
+                 uint32_t level, uint32_t layer,
+                 uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1)
+{
+   struct blorp_params params;
+   blorp_params_init(&params);
+
+   /* The color written by a fast clear is irrelevant (the clear value lives
+    * in surface state), so just saturate the push constants.
+    */
+   memset(&params.wm_inputs, 0xff, 4*sizeof(float));
+   params.fast_clear_op = BLORP_FAST_CLEAR_OP_CLEAR;
+
+   /* Start from the requested render area and convert it into the aligned,
+    * scaled-down rectangle the fast-clear hardware requires.
+    */
+   params.x0 = x0;
+   params.y0 = y0;
+   params.x1 = x1;
+   params.y1 = y1;
+   get_fast_clear_rect(batch->blorp->isl_dev, surf->aux_surf,
+                       &params.x0, &params.y0, &params.x1, &params.y1);
+
+   /* Fast clears always use the replicated-data clear shader. */
+   blorp_params_get_clear_kernel(batch->blorp, &params, true);
+
+   brw_blorp_surface_info_init(batch->blorp, &params.dst, surf, level, layer,
+                               surf->surf->format, true);
+
+   batch->blorp->exec(batch, &params);
+}
+
+
+/* Slow (shader-based) clear of one miplevel/layer of a surface to
+ * clear_color, restricted to the rectangle (x0, y0)-(x1, y1) and honoring
+ * per-channel write disables.
+ */
+void
+blorp_clear(struct blorp_batch *batch,
+            const struct blorp_surf *surf,
+            uint32_t level, uint32_t layer,
+            uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1,
+            enum isl_format format, union isl_color_value clear_color,
+            bool color_write_disable[4])
+{
+   struct blorp_params params;
+   blorp_params_init(&params);
+
+   params.x0 = x0;
+   params.y0 = y0;
+   params.x1 = x1;
+   params.y1 = y1;
+
+   /* The clear color reaches the shader as flat vertex inputs. */
+   memcpy(&params.wm_inputs, clear_color.f32, sizeof(float) * 4);
+
+   /* Decide whether the SIMD16 replicated-data message may be used.
+    *
+    * From the SNB PRM (Vol4_Part1):
+    *
+    *     "Replicated data (Message Type = 111) is only supported when
+    *      accessing tiled memory.  Using this Message Type to access
+    *      linear (untiled) memory is UNDEFINED."
+    */
+   bool use_simd16_replicated_data =
+      surf->surf->tiling != ISL_TILING_LINEAR;
+
+   /* Constant color writes ignore everything in blend and color calculator
+    * state.  This is not documented.  So any channel mask also forces the
+    * non-replicated path.
+    */
+   for (unsigned chan = 0; chan < 4; chan++) {
+      params.color_write_disable[chan] = color_write_disable[chan];
+      if (color_write_disable[chan])
+         use_simd16_replicated_data = false;
+   }
+
+   blorp_params_get_clear_kernel(batch->blorp, &params,
+                                 use_simd16_replicated_data);
+
+   brw_blorp_surface_info_init(batch->blorp, &params.dst, surf, level, layer,
+                               format, true);
+
+   batch->blorp->exec(batch, &params);
+}
+
+/* Perform a render-target (CCS) resolve of the whole of level 0 / layer 0
+ * of surf, viewed with the given format.
+ */
+void
+blorp_ccs_resolve(struct blorp_batch *batch,
+                  struct blorp_surf *surf, enum isl_format format)
+{
+   struct blorp_params params;
+   blorp_params_init(&params);
+
+   brw_blorp_surface_info_init(batch->blorp, &params.dst, surf,
+                               0 /* level */, 0 /* layer */, format, true);
+
+   /* From the Ivy Bridge PRM, Vol2 Part1 11.9 "Render Target Resolve":
+    *
+    *     A rectangle primitive must be scaled down by the following factors
+    *     with respect to render target being resolved.
+    *
+    * The scaledown factors in the table that follows are related to the
+    * block size of the CCS format.  For IVB and HSW, we divide by two, for
+    * BDW we multiply by 8 and 16.  On Sky Lake, we multiply by 8.
+    */
+   const struct isl_format_layout *aux_fmtl =
+      isl_format_get_layout(params.dst.aux_surf.format);
+   assert(aux_fmtl->txc == ISL_TXC_CCS);
+
+   unsigned x_scaledown, y_scaledown;
+   if (ISL_DEV_GEN(batch->blorp->isl_dev) >= 9) {
+      x_scaledown = aux_fmtl->bw * 8;
+      y_scaledown = aux_fmtl->bh * 8;
+   } else if (ISL_DEV_GEN(batch->blorp->isl_dev) >= 8) {
+      x_scaledown = aux_fmtl->bw * 8;
+      y_scaledown = aux_fmtl->bh * 16;
+   } else {
+      x_scaledown = aux_fmtl->bw / 2;
+      y_scaledown = aux_fmtl->bh / 2;
+   }
+
+   /* Cover the whole surface with a single scaled-down rectangle. */
+   params.x0 = params.y0 = 0;
+   params.x1 = ALIGN(params.dst.aux_surf.logical_level0_px.width,
+                     x_scaledown) / x_scaledown;
+   params.y1 = ALIGN(params.dst.aux_surf.logical_level0_px.height,
+                     y_scaledown) / y_scaledown;
+
+   if (batch->blorp->isl_dev->info->gen >= 9) {
+      /* Gen9 adds partial resolves; a full resolve is only required for
+       * CCS_E surfaces.
+       */
+      if (params.dst.aux_usage == ISL_AUX_USAGE_CCS_E)
+         params.fast_clear_op = BLORP_FAST_CLEAR_OP_RESOLVE_FULL;
+      else
+         params.fast_clear_op = BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL;
+   } else {
+      /* Broadwell and earlier do not have a partial resolve */
+      params.fast_clear_op = BLORP_FAST_CLEAR_OP_RESOLVE_FULL;
+   }
+
+   /* Note: there is no need to initialize push constants because it doesn't
+    * matter what data gets dispatched to the render target.  However, we must
+    * ensure that the fragment shader delivers the data using the
+    * "replicated color" message.
+    */
+   blorp_params_get_clear_kernel(batch->blorp, &params, true);
+
+   batch->blorp->exec(batch, &params);
+}
diff --git a/src/intel/blorp/blorp_genX_exec.h b/src/intel/blorp/blorp_genX_exec.h
new file mode 100644
index 00000000000..f44076e129f
--- /dev/null
+++ b/src/intel/blorp/blorp_genX_exec.h
@@ -0,0 +1,1176 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "blorp_priv.h"
+#include "brw_device_info.h"
+#include "intel_aub.h"
+
+/**
+ * This file provides the blorp pipeline setup and execution functionality.
+ * It defines the following function:
+ *
+ * static void
+ * blorp_exec(struct blorp_context *blorp, void *batch_data,
+ * const struct blorp_params *params);
+ *
+ * It is the job of whoever includes this header to wrap this in something
+ * to get an externally visible symbol.
+ *
+ * In order for the blorp_exec function to work, the driver must provide
+ * implementations of the following static helper functions.
+ */
+
+/* Reserve n dwords of batch space and return a pointer to write them. */
+static void *
+blorp_emit_dwords(struct blorp_batch *batch, unsigned n);
+
+/* Record a relocation for `address` at `location` in the batch and return
+ * the value to be written there.
+ */
+static uint64_t
+blorp_emit_reloc(struct blorp_batch *batch,
+                 void *location, struct blorp_address address, uint32_t delta);
+
+/* Allocate `size` bytes of dynamic (indirect) state with the given
+ * alignment; *offset receives its offset from dynamic state base.
+ */
+static void *
+blorp_alloc_dynamic_state(struct blorp_batch *batch,
+                          enum aub_state_struct_type type,
+                          uint32_t size,
+                          uint32_t alignment,
+                          uint32_t *offset);
+
+/* Allocate a vertex buffer of `size` bytes; *addr receives its address. */
+static void *
+blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
+                          struct blorp_address *addr);
+
+/* Allocate a binding table with num_entries surface states; returns the
+ * table offset, a map of the table itself, and maps of each surface state.
+ */
+static void
+blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
+                          unsigned state_size, unsigned state_alignment,
+                          uint32_t *bt_offset, uint32_t **bt_map,
+                          void **surface_maps);
+
+/* Record a relocation for `address` inside a surface state at ss_offset. */
+static void
+blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
+                    struct blorp_address address, uint32_t delta);
+
+/* Emit the gen-specific 3DSTATE_URB packets with the given VS entry size. */
+static void
+blorp_emit_urb_config(struct blorp_batch *batch, unsigned vs_entry_size);
+
+/* Emit the gen-specific 3DSTATE_MULTISAMPLE setup for `samples`. */
+static void
+blorp_emit_3dstate_multisample(struct blorp_batch *batch, unsigned samples);
+
+/***** BEGIN blorp_exec implementation ******/
+
+#include "genxml/gen_macros.h"
+
+#define __gen_address_type struct blorp_address
+#define __gen_user_data struct blorp_batch
+
+/* Combine a blorp_address with a delta into the value genxml packs into the
+ * batch.  (The name is dictated by the genxml pack helpers.)  Addresses
+ * without a backing buffer are absolute; anything else goes through the
+ * driver's relocation hook.
+ */
+static uint64_t
+__gen_combine_address(struct blorp_batch *batch, void *location,
+                      struct blorp_address address, uint32_t delta)
+{
+   if (address.buffer != NULL)
+      return blorp_emit_reloc(batch, location, address, delta);
+
+   return address.offset + delta;
+}
+
+#include "genxml/genX_pack.h"
+
+/* Helpers mapping a genxml struct name to its pack metadata. */
+#define _blorp_cmd_length(cmd) cmd ## _length
+#define _blorp_cmd_length_bias(cmd) cmd ## _length_bias
+#define _blorp_cmd_header(cmd) cmd ## _header
+#define _blorp_cmd_pack(cmd) cmd ## _pack
+
+/* Emit a fixed-length command.  Declares `name` as a struct the caller's
+ * body can fill in; the for-loop trick packs the struct into the batch
+ * after the body runs (exactly once, since _dst is then set to NULL).
+ */
+#define blorp_emit(batch, cmd, name)                              \
+   for (struct cmd name = { _blorp_cmd_header(cmd) },             \
+        *_dst = blorp_emit_dwords(batch, _blorp_cmd_length(cmd)); \
+        __builtin_expect(_dst != NULL, 1);                        \
+        _blorp_cmd_pack(cmd)(batch, (void *)_dst, &name),         \
+        _dst = NULL)
+
+/* Emit a variable-length command of n dwords with a default header and the
+ * correct DWordLength; evaluates to a pointer to the payload dwords
+ * (GCC/Clang statement expression).
+ */
+#define blorp_emitn(batch, cmd, n) ({                    \
+      uint32_t *_dw = blorp_emit_dwords(batch, n);       \
+      struct cmd template = {                            \
+         _blorp_cmd_header(cmd),                         \
+         .DWordLength = n - _blorp_cmd_length_bias(cmd), \
+      };                                                 \
+      _blorp_cmd_pack(cmd)(batch, _dw, &template);       \
+      _dw + 1; /* Array starts at dw[1] */               \
+   })
+
+/* Once vertex fetcher has written full VUE entries with complete
+ * header the space requirement is as follows per vertex (in bytes):
+ *
+ * Header Position Program constants
+ * +--------+------------+-------------------+
+ * | 16 | 16 | n x 16 |
+ * +--------+------------+-------------------+
+ *
+ * where 'n' stands for number of varying inputs expressed as vec4s.
+ *
+ * The URB size is in turn expressed in 64 bytes (512 bits).
+ */
+/* Compute the VS URB entry size, in 64-byte units: a 16-byte header, a
+ * 16-byte position, and one 16-byte vec4 per varying input of the WM
+ * program (see the VUE layout comment above).
+ */
+static inline unsigned
+gen7_blorp_get_vs_entry_size(const struct blorp_params *params)
+{
+   unsigned vec4s = 0;
+   if (params->wm_prog_data)
+      vec4s = params->wm_prog_data->num_varying_inputs;
+
+   return DIV_ROUND_UP((2 + vec4s) * 16, 64);
+}
+
+/* 3DSTATE_URB
+ * 3DSTATE_URB_VS
+ * 3DSTATE_URB_HS
+ * 3DSTATE_URB_DS
+ * 3DSTATE_URB_GS
+ *
+ * Assign the entire URB to the VS.  Even though the VS is disabled, URB
+ * space is still needed because the clipper loads the VUE's from the URB.
+ * From the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE,
+ * Dword 1.15:0 "VS Number of URB Entries":
+ *     This field is always used (even if VS Function Enable is DISABLED).
+ *
+ * The warning below appears in the PRM (Section 3DSTATE_URB), but we can
+ * safely ignore it because this batch contains only one draw call.
+ *     Because of URB corruption caused by allocating a previous GS unit
+ *     URB entry to the VS unit, software is required to send a “GS NULL
+ *     Fence” (Send URB fence with VS URB size == 1 and GS URB size == 0)
+ *     plus a dummy DRAW call before any case where VS will be taking over
+ *     GS URB space.
+ *
+ * If the 3DSTATE_URB_VS is emitted, then the others must be also.
+ * From the Ivybridge PRM, Volume 2 Part 1, section 1.7.1 3DSTATE_URB_VS:
+ *
+ *     3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be
+ *     programmed in order for the programming of this state to be
+ *     valid.
+ */
+static void
+emit_urb_config(struct blorp_batch *batch,
+                const struct blorp_params *params)
+{
+   /* Delegate to the driver hook with the required VS entry size. */
+   blorp_emit_urb_config(batch, gen7_blorp_get_vs_entry_size(params));
+}
+
+/* Upload the three RECTLIST vertex positions (two floats each) for the
+ * params rectangle into a fresh vertex buffer; *addr/*size describe it on
+ * return.  Vertex order is v0 = lower-left, v1 = lower-right, v2 = upper-
+ * left (the fourth corner is implied by the rectangle primitive).
+ */
+static void
+blorp_emit_vertex_data(struct blorp_batch *batch,
+                       const struct blorp_params *params,
+                       struct blorp_address *addr,
+                       uint32_t *size)
+{
+   const float rect[] = {
+      /* v0 */ (float)params->x0, (float)params->y1,
+      /* v1 */ (float)params->x1, (float)params->y1,
+      /* v2 */ (float)params->x0, (float)params->y0,
+   };
+
+   *size = sizeof(rect);
+   void *map = blorp_alloc_vertex_buffer(batch, sizeof(rect), addr);
+   memcpy(map, rect, sizeof(rect));
+}
+
+/* Upload the flat WM inputs (blorp's push-constant substitute) into a
+ * vertex buffer, packing one vec4 per varying the WM program actually
+ * reads; *addr/*size describe the buffer on return.
+ */
+static void
+blorp_emit_input_varying_data(struct blorp_batch *batch,
+                              const struct blorp_params *params,
+                              struct blorp_address *addr,
+                              uint32_t *size)
+{
+   const unsigned vec4_size_in_bytes = 4 * sizeof(float);
+   const unsigned max_num_varyings =
+      DIV_ROUND_UP(sizeof(params->wm_inputs), vec4_size_in_bytes);
+   const unsigned num_varyings = params->wm_prog_data->num_varying_inputs;
+
+   /* The buffer holds exactly the vec4s that are read, densely packed. */
+   *size = num_varyings * vec4_size_in_bytes;
+
+   const float *const inputs_src = (const float *)&params->wm_inputs;
+   float *inputs = blorp_alloc_vertex_buffer(batch, *size, addr);
+
+   /* Walk over the attribute slots, determine if the attribute is used by
+    * the program and when necessary copy the values from the input storage to
+    * the vertex data buffer.
+    */
+   for (unsigned i = 0; i < max_num_varyings; i++) {
+      /* inputs_read is indexed by full varying slot, so offset by VAR0. */
+      const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;
+
+      if (!(params->wm_prog_data->inputs_read & (1ull << attr)))
+         continue;
+
+      /* Copy slot i of wm_inputs to the next packed vec4 in the buffer. */
+      memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);
+
+      inputs += 4;
+   }
+}
+
+/* Emit 3DSTATE_VERTEX_BUFFERS: buffer 0 holds the rectangle's vertex
+ * positions, and buffer 1 (only when the WM program reads varyings) holds
+ * the flat WM inputs.
+ */
+static void
+blorp_emit_vertex_buffers(struct blorp_batch *batch,
+                          const struct blorp_params *params)
+{
+   struct GENX(VERTEX_BUFFER_STATE) vb[2];
+   memset(vb, 0, sizeof(vb));
+
+   unsigned num_buffers = 1;
+
+   /* Buffer 0: per-vertex positions, two floats per vertex. */
+   uint32_t size;
+   blorp_emit_vertex_data(batch, params, &vb[0].BufferStartingAddress, &size);
+   vb[0].VertexBufferIndex = 0;
+   vb[0].BufferPitch = 2 * sizeof(float);
+   vb[0].VertexBufferMOCS = batch->blorp->mocs.vb;
+#if GEN_GEN >= 7
+   vb[0].AddressModifyEnable = true;
+#endif
+#if GEN_GEN >= 8
+   vb[0].BufferSize = size;
+#else
+   vb[0].BufferAccessType = VERTEXDATA;
+   vb[0].EndAddress = vb[0].BufferStartingAddress;
+   vb[0].EndAddress.offset += size - 1;
+#endif
+
+   if (params->wm_prog_data && params->wm_prog_data->num_varying_inputs) {
+      /* Buffer 1: flat inputs.  Pitch 0 (and INSTANCEDATA access pre-gen8)
+       * so every vertex fetches the same constant values.
+       */
+      blorp_emit_input_varying_data(batch, params,
+                                    &vb[1].BufferStartingAddress, &size);
+      vb[1].VertexBufferIndex = 1;
+      vb[1].BufferPitch = 0;
+      vb[1].VertexBufferMOCS = batch->blorp->mocs.vb;
+#if GEN_GEN >= 7
+      vb[1].AddressModifyEnable = true;
+#endif
+#if GEN_GEN >= 8
+      vb[1].BufferSize = size;
+#else
+      vb[1].BufferAccessType = INSTANCEDATA;
+      vb[1].EndAddress = vb[1].BufferStartingAddress;
+      vb[1].EndAddress.offset += size - 1;
+#endif
+      num_buffers++;
+   }
+
+   const unsigned num_dwords =
+      1 + GENX(VERTEX_BUFFER_STATE_length) * num_buffers;
+   uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
+
+   /* Pack each buffer state into the command's payload dwords. */
+   for (unsigned i = 0; i < num_buffers; i++) {
+      GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]);
+      dw += GENX(VERTEX_BUFFER_STATE_length);
+   }
+}
+
+static void
+blorp_emit_vertex_elements(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+ const unsigned num_varyings =
+ params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0;
+ const unsigned num_elements = 2 + num_varyings;
+
+ struct GENX(VERTEX_ELEMENT_STATE) ve[num_elements];
+ memset(ve, 0, num_elements * sizeof(*ve));
+
+ /* Setup VBO for the rectangle primitive..
+ *
+ * A rectangle primitive (3DPRIM_RECTLIST) consists of only three
+ * vertices. The vertices reside in screen space with DirectX
+ * coordinates (that is, (0, 0) is the upper left corner).
+ *
+ * v2 ------ implied
+ * | |
+ * | |
+ * v0 ----- v1
+ *
+ * Since the VS is disabled, the clipper loads each VUE directly from
+ * the URB. This is controlled by the 3DSTATE_VERTEX_BUFFERS and
+ * 3DSTATE_VERTEX_ELEMENTS packets below. The VUE contents are as follows:
+ * dw0: Reserved, MBZ.
+ * dw1: Render Target Array Index. The HiZ op does not use indexed
+ * vertices, so set the dword to 0.
+ * dw2: Viewport Index. The HiZ op disables viewport mapping and
+ * scissoring, so set the dword to 0.
+ * dw3: Point Width: The HiZ op does not emit the POINTLIST primitive,
+ * so set the dword to 0.
+ * dw4: Vertex Position X.
+ * dw5: Vertex Position Y.
+ * dw6: Vertex Position Z.
+ * dw7: Vertex Position W.
+ *
+ * dw8: Flat vertex input 0
+ * dw9: Flat vertex input 1
+ * ...
+ * dwn: Flat vertex input n - 8
+ *
+ * For details, see the Sandybridge PRM, Volume 2, Part 1, Section 1.5.1
+ * "Vertex URB Entry (VUE) Formats".
+ *
+ * Only vertex position X and Y are going to be variable, Z is fixed to
+ * zero and W to one. Header words dw0-3 are all zero. There is no need to
+ * include the fixed values in the vertex buffer. Vertex fetcher can be
+ * instructed to fill vertex elements with constant values of one and zero
+ * instead of reading them from the buffer.
+ * Flat inputs are program constants that are not interpolated. Moreover
+ * their values will be the same between vertices.
+ *
+ * See the vertex element setup below.
+ */
+ ve[0].VertexBufferIndex = 0;
+ ve[0].Valid = true;
+ ve[0].SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
+ ve[0].SourceElementOffset = 0;
+ ve[0].Component0Control = VFCOMP_STORE_0;
+ ve[0].Component1Control = VFCOMP_STORE_0;
+ ve[0].Component2Control = VFCOMP_STORE_0;
+ ve[0].Component3Control = VFCOMP_STORE_0;
+
+ ve[1].VertexBufferIndex = 0;
+ ve[1].Valid = true;
+ ve[1].SourceElementFormat = ISL_FORMAT_R32G32_FLOAT;
+ ve[1].SourceElementOffset = 0;
+ ve[1].Component0Control = VFCOMP_STORE_SRC;
+ ve[1].Component1Control = VFCOMP_STORE_SRC;
+ ve[1].Component2Control = VFCOMP_STORE_0;
+ ve[1].Component3Control = VFCOMP_STORE_1_FP;
+
+ for (unsigned i = 0; i < num_varyings; ++i) {
+ ve[i + 2].VertexBufferIndex = 1;
+ ve[i + 2].Valid = true;
+ ve[i + 2].SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
+ ve[i + 2].SourceElementOffset = i * 4 * sizeof(float);
+ ve[i + 2].Component0Control = VFCOMP_STORE_SRC;
+ ve[i + 2].Component1Control = VFCOMP_STORE_SRC;
+ ve[i + 2].Component2Control = VFCOMP_STORE_SRC;
+ ve[i + 2].Component3Control = VFCOMP_STORE_SRC;
+ }
+
+ const unsigned num_dwords =
+ 1 + GENX(VERTEX_ELEMENT_STATE_length) * num_elements;
+ uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_ELEMENTS), num_dwords);
+
+ for (unsigned i = 0; i < num_elements; i++) {
+ GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw, &ve[i]);
+ dw += GENX(VERTEX_ELEMENT_STATE_length);
+ }
+
+#if GEN_GEN >= 8
+ blorp_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs);
+
+ for (unsigned i = 0; i < num_elements; i++) {
+ blorp_emit(batch, GENX(3DSTATE_VF_INSTANCING), vf) {
+ vf.VertexElementIndex = i;
+ vf.InstancingEnable = false;
+ }
+ }
+
+ blorp_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
+ topo.PrimitiveTopologyType = _3DPRIM_RECTLIST;
+ }
+#endif
+}
+
+static void
+blorp_emit_sf_config(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+ const struct brw_blorp_prog_data *prog_data = params->wm_prog_data;
+
+ /* 3DSTATE_SF
+ *
+ * Disable ViewportTransformEnable (dw2.1)
+ *
+ * From the SandyBridge PRM, Volume 2, Part 1, Section 1.3, "3D
+ * Primitives Overview":
+ * RECTLIST: Viewport Mapping must be DISABLED (as is typical with the
+ * use of screen-space coordinates).
+ *
+ * A solid rectangle must be rendered, so set FrontFaceFillMode (dw2.4:3)
+ * and BackFaceFillMode (dw2.5:6) to SOLID(0).
+ *
+ * From the Sandy Bridge PRM, Volume 2, Part 1, Section
+ * 6.4.1.1 3DSTATE_SF, Field FrontFaceFillMode:
+ * SOLID: Any triangle or rectangle object found to be front-facing
+ * is rendered as a solid object. This setting is required when
+ * rendering rectangle (RECTLIST) objects.
+ */
+
+#if GEN_GEN >= 8
+
+ blorp_emit(batch, GENX(3DSTATE_SF), sf);
+
+ blorp_emit(batch, GENX(3DSTATE_RASTER), raster) {
+ raster.CullMode = CULLMODE_NONE;
+ }
+
+ blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
+ sbe.VertexURBEntryReadOffset = 1;
+ sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
+ sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
+ sbe.ForceVertexURBEntryReadLength = true;
+ sbe.ForceVertexURBEntryReadOffset = true;
+ sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
+
+#if GEN_GEN >= 9
+ for (unsigned i = 0; i < 32; i++)
+ sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
+#endif
+ }
+
+#elif GEN_GEN >= 7
+
+ blorp_emit(batch, GENX(3DSTATE_SF), sf) {
+ sf.FrontFaceFillMode = FILL_MODE_SOLID;
+ sf.BackFaceFillMode = FILL_MODE_SOLID;
+
+ sf.MultisampleRasterizationMode = params->dst.surf.samples > 1 ?
+ MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
+
+#if GEN_GEN == 7
+ sf.DepthBufferSurfaceFormat = params->depth_format;
+#endif
+ }
+
+ blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
+ sbe.VertexURBEntryReadOffset = 1;
+ if (prog_data) {
+ sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
+ sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
+ sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
+ } else {
+ sbe.NumberofSFOutputAttributes = 0;
+ sbe.VertexURBEntryReadLength = 1;
+ }
+ }
+
+#else /* GEN_GEN <= 6 */
+
+ blorp_emit(batch, GENX(3DSTATE_SF), sf) {
+ sf.FrontFaceFillMode = FILL_MODE_SOLID;
+ sf.BackFaceFillMode = FILL_MODE_SOLID;
+
+ sf.MultisampleRasterizationMode = params->dst.surf.samples > 1 ?
+ MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
+
+ sf.VertexURBEntryReadOffset = 1;
+ if (prog_data) {
+ sf.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
+ sf.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
+ sf.ConstantInterpolationEnable = prog_data->flat_inputs;
+ } else {
+ sf.NumberofSFOutputAttributes = 0;
+ sf.VertexURBEntryReadLength = 1;
+ }
+ }
+
+#endif /* GEN_GEN */
+}
+
+static void
+blorp_emit_ps_config(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+ const struct brw_blorp_prog_data *prog_data = params->wm_prog_data;
+
+ /* Even when thread dispatch is disabled, max threads (dw5.25:31) must be
+ * nonzero to prevent the GPU from hanging. While the documentation doesn't
+ * mention this explicitly, it notes that the valid range for the field is
+ * [1,39] = [2,40] threads, which excludes zero.
+ *
+ * To be safe (and to minimize extraneous code) we go ahead and fully
+ * configure the WM state whether or not there is a WM program.
+ */
+
+#if GEN_GEN >= 8
+
+ blorp_emit(batch, GENX(3DSTATE_WM), wm);
+
+ blorp_emit(batch, GENX(3DSTATE_PS), ps) {
+ if (params->src.addr.buffer) {
+ ps.SamplerCount = 1; /* Up to 4 samplers */
+ ps.BindingTableEntryCount = 2;
+ } else {
+ ps.BindingTableEntryCount = 1;
+ }
+
+ ps.DispatchGRFStartRegisterForConstantSetupData0 =
+ prog_data->first_curbe_grf_0;
+ ps.DispatchGRFStartRegisterForConstantSetupData2 =
+ prog_data->first_curbe_grf_2;
+
+ ps._8PixelDispatchEnable = prog_data->dispatch_8;
+ ps._16PixelDispatchEnable = prog_data->dispatch_16;
+
+ ps.KernelStartPointer0 = params->wm_prog_kernel;
+ ps.KernelStartPointer2 =
+ params->wm_prog_kernel + prog_data->ksp_offset_2;
+
+ /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
+ * it implicitly scales for different GT levels (which have some # of
+ * PSDs).
+ *
+ * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1.
+ */
+ if (GEN_GEN >= 9)
+ ps.MaximumNumberofThreadsPerPSD = 64 - 1;
+ else
+ ps.MaximumNumberofThreadsPerPSD = 64 - 2;
+
+ switch (params->fast_clear_op) {
+ case BLORP_FAST_CLEAR_OP_NONE:
+ break;
+#if GEN_GEN >= 9
+ case BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL:
+ ps.RenderTargetResolveType = RESOLVE_PARTIAL;
+ break;
+ case BLORP_FAST_CLEAR_OP_RESOLVE_FULL:
+ ps.RenderTargetResolveType = RESOLVE_FULL;
+ break;
+#else
+ case BLORP_FAST_CLEAR_OP_RESOLVE_FULL:
+ ps.RenderTargetResolveEnable = true;
+ break;
+#endif
+ case BLORP_FAST_CLEAR_OP_CLEAR:
+ ps.RenderTargetFastClearEnable = true;
+ break;
+ default:
+ unreachable("Invalid fast clear op");
+ }
+ }
+
+ blorp_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) {
+ psx.PixelShaderValid = true;
+
+ if (params->src.addr.buffer)
+ psx.PixelShaderKillsPixel = true;
+
+ psx.AttributeEnable = prog_data->num_varying_inputs > 0;
+
+ if (prog_data && prog_data->persample_msaa_dispatch)
+ psx.PixelShaderIsPerSample = true;
+ }
+
+#elif GEN_GEN >= 7
+
+ blorp_emit(batch, GENX(3DSTATE_WM), wm) {
+ switch (params->hiz_op) {
+ case BLORP_HIZ_OP_DEPTH_CLEAR:
+ wm.DepthBufferClear = true;
+ break;
+ case BLORP_HIZ_OP_DEPTH_RESOLVE:
+ wm.DepthBufferResolveEnable = true;
+ break;
+ case BLORP_HIZ_OP_HIZ_RESOLVE:
+ wm.HierarchicalDepthBufferResolveEnable = true;
+ break;
+ case BLORP_HIZ_OP_NONE:
+ break;
+ default:
+ unreachable("not reached");
+ }
+
+ if (prog_data)
+ wm.ThreadDispatchEnable = true;
+
+ if (params->src.addr.buffer)
+ wm.PixelShaderKillPixel = true;
+
+ if (params->dst.surf.samples > 1) {
+ wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
+ wm.MultisampleDispatchMode =
+ (prog_data && prog_data->persample_msaa_dispatch) ?
+ MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
+ } else {
+ wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
+ wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
+ }
+ }
+
+ blorp_emit(batch, GENX(3DSTATE_PS), ps) {
+ ps.MaximumNumberofThreads =
+ batch->blorp->isl_dev->info->max_wm_threads - 1;
+
+#if GEN_IS_HASWELL
+ ps.SampleMask = 1;
+#endif
+
+ if (prog_data) {
+ ps.DispatchGRFStartRegisterforConstantSetupData0 =
+ prog_data->first_curbe_grf_0;
+ ps.DispatchGRFStartRegisterforConstantSetupData2 =
+ prog_data->first_curbe_grf_2;
+
+ ps.KernelStartPointer0 = params->wm_prog_kernel;
+ ps.KernelStartPointer2 =
+ params->wm_prog_kernel + prog_data->ksp_offset_2;
+
+ ps._8PixelDispatchEnable = prog_data->dispatch_8;
+ ps._16PixelDispatchEnable = prog_data->dispatch_16;
+
+ ps.AttributeEnable = prog_data->num_varying_inputs > 0;
+ } else {
+ /* Gen7 hardware gets angry if we don't enable at least one dispatch
+ * mode, so just enable 16-pixel dispatch if we don't have a program.
+ */
+ ps._16PixelDispatchEnable = true;
+ }
+
+ if (params->src.addr.buffer)
+ ps.SamplerCount = 1; /* Up to 4 samplers */
+
+ switch (params->fast_clear_op) {
+ case BLORP_FAST_CLEAR_OP_NONE:
+ break;
+ case BLORP_FAST_CLEAR_OP_RESOLVE_FULL:
+ ps.RenderTargetResolveEnable = true;
+ break;
+ case BLORP_FAST_CLEAR_OP_CLEAR:
+ ps.RenderTargetFastClearEnable = true;
+ break;
+ default:
+ unreachable("Invalid fast clear op");
+ }
+ }
+
+#else /* GEN_GEN <= 6 */
+
+ blorp_emit(batch, GENX(3DSTATE_WM), wm) {
+ wm.MaximumNumberofThreads =
+ batch->blorp->isl_dev->info->max_wm_threads - 1;
+
+ switch (params->hiz_op) {
+ case BLORP_HIZ_OP_DEPTH_CLEAR:
+ wm.DepthBufferClear = true;
+ break;
+ case BLORP_HIZ_OP_DEPTH_RESOLVE:
+ wm.DepthBufferResolveEnable = true;
+ break;
+ case BLORP_HIZ_OP_HIZ_RESOLVE:
+ wm.HierarchicalDepthBufferResolveEnable = true;
+ break;
+ case BLORP_HIZ_OP_NONE:
+ break;
+ default:
+ unreachable("not reached");
+ }
+
+ if (prog_data) {
+ wm.ThreadDispatchEnable = true;
+
+ wm.DispatchGRFStartRegisterforConstantSetupData0 =
+ prog_data->first_curbe_grf_0;
+ wm.DispatchGRFStartRegisterforConstantSetupData2 =
+ prog_data->first_curbe_grf_2;
+
+ wm.KernelStartPointer0 = params->wm_prog_kernel;
+ wm.KernelStartPointer2 =
+ params->wm_prog_kernel + prog_data->ksp_offset_2;
+
+ wm._8PixelDispatchEnable = prog_data->dispatch_8;
+ wm._16PixelDispatchEnable = prog_data->dispatch_16;
+
+ wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
+ }
+
+ if (params->src.addr.buffer) {
+ wm.SamplerCount = 1; /* Up to 4 samplers */
+ wm.PixelShaderKillPixel = true; /* TODO: temporarily smash on */
+ }
+
+ if (params->dst.surf.samples > 1) {
+ wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
+ wm.MultisampleDispatchMode =
+ (prog_data && prog_data->persample_msaa_dispatch) ?
+ MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
+ } else {
+ wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
+ wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
+ }
+ }
+
+#endif /* GEN_GEN */
+}
+
+
+static void
+blorp_emit_depth_stencil_config(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+#if GEN_GEN >= 7
+ const uint32_t mocs = 1; /* GEN7_MOCS_L3 */
+#else
+ const uint32_t mocs = 0;
+#endif
+
+ blorp_emit(batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
+ switch (params->depth.surf.dim) {
+ case ISL_SURF_DIM_1D:
+ db.SurfaceType = SURFTYPE_1D;
+ break;
+ case ISL_SURF_DIM_2D:
+ db.SurfaceType = SURFTYPE_2D;
+ break;
+ case ISL_SURF_DIM_3D:
+ db.SurfaceType = SURFTYPE_3D;
+ break;
+ }
+
+ db.SurfaceFormat = params->depth_format;
+
+#if GEN_GEN >= 7
+ db.DepthWriteEnable = true;
+#endif
+
+#if GEN_GEN <= 6
+ db.TiledSurface = true;
+ db.TileWalk = TILEWALK_YMAJOR;
+ db.MIPMapLayoutMode = MIPLAYOUT_BELOW;
+ db.SeparateStencilBufferEnable = true;
+#endif
+
+ db.HierarchicalDepthBufferEnable = true;
+
+ db.Width = params->depth.surf.logical_level0_px.width - 1;
+ db.Height = params->depth.surf.logical_level0_px.height - 1;
+ db.RenderTargetViewExtent = db.Depth =
+ MAX2(params->depth.surf.logical_level0_px.depth,
+ params->depth.surf.logical_level0_px.array_len) - 1;
+
+ db.LOD = params->depth.view.base_level;
+ db.MinimumArrayElement = params->depth.view.base_array_layer;
+
+ db.SurfacePitch = params->depth.surf.row_pitch - 1;
+ db.SurfaceBaseAddress = params->depth.addr;
+ db.DepthBufferMOCS = mocs;
+ }
+
+ blorp_emit(batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hiz) {
+ hiz.SurfacePitch = params->depth.aux_surf.row_pitch - 1;
+ hiz.SurfaceBaseAddress = params->depth.aux_addr;
+ hiz.HierarchicalDepthBufferMOCS = mocs;
+ }
+
+ blorp_emit(batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
+}
+
+static uint32_t
+blorp_emit_blend_state(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+ struct GENX(BLEND_STATE) blend;
+ memset(&blend, 0, sizeof(blend));
+
+ for (unsigned i = 0; i < params->num_draw_buffers; ++i) {
+ blend.Entry[i].PreBlendColorClampEnable = true;
+ blend.Entry[i].PostBlendColorClampEnable = true;
+ blend.Entry[i].ColorClampRange = COLORCLAMP_RTFORMAT;
+
+ blend.Entry[i].WriteDisableRed = params->color_write_disable[0];
+ blend.Entry[i].WriteDisableGreen = params->color_write_disable[1];
+ blend.Entry[i].WriteDisableBlue = params->color_write_disable[2];
+ blend.Entry[i].WriteDisableAlpha = params->color_write_disable[3];
+ }
+
+ uint32_t offset;
+ void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_BLEND_STATE,
+ GENX(BLEND_STATE_length) * 4,
+ 64, &offset);
+ GENX(BLEND_STATE_pack)(NULL, state, &blend);
+
+#if GEN_GEN >= 7
+ blorp_emit(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), sp) {
+ sp.BlendStatePointer = offset;
+#if GEN_GEN >= 8
+ sp.BlendStatePointerValid = true;
+#endif
+ }
+#endif
+
+#if GEN_GEN >= 8
+ blorp_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) {
+ ps_blend.HasWriteableRT = true;
+ }
+#endif
+
+ return offset;
+}
+
+static uint32_t
+blorp_emit_color_calc_state(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+ uint32_t offset;
+ void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_CC_STATE,
+ GENX(COLOR_CALC_STATE_length) * 4,
+ 64, &offset);
+ memset(state, 0, GENX(COLOR_CALC_STATE_length) * 4);
+
+#if GEN_GEN >= 7
+ blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), sp) {
+ sp.ColorCalcStatePointer = offset;
+#if GEN_GEN >= 8
+ sp.ColorCalcStatePointerValid = true;
+#endif
+ }
+#endif
+
+ return offset;
+}
+
+static uint32_t
+blorp_emit_depth_stencil_state(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+#if GEN_GEN >= 8
+
+ /* On gen8+, DEPTH_STENCIL state is simply an instruction */
+ blorp_emit(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds);
+ return 0;
+
+#else /* GEN_GEN <= 7 */
+
+ /* See the following sections of the Sandy Bridge PRM, Volume 1, Part2:
+ * - 7.5.3.1 Depth Buffer Clear
+ * - 7.5.3.2 Depth Buffer Resolve
+ * - 7.5.3.3 Hierarchical Depth Buffer Resolve
+ */
+ struct GENX(DEPTH_STENCIL_STATE) ds = {
+ .DepthBufferWriteEnable = true,
+ };
+
+ if (params->hiz_op == BLORP_HIZ_OP_DEPTH_RESOLVE) {
+ ds.DepthTestEnable = true;
+ ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
+ }
+
+ uint32_t offset;
+ void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_DEPTH_STENCIL_STATE,
+ GENX(DEPTH_STENCIL_STATE_length) * 4,
+ 64, &offset);
+ GENX(DEPTH_STENCIL_STATE_pack)(NULL, state, &ds);
+
+#if GEN_GEN >= 7
+ blorp_emit(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), sp) {
+ sp.PointertoDEPTH_STENCIL_STATE = offset;
+ }
+#endif
+
+ return offset;
+
+#endif /* GEN_GEN */
+}
+
+struct surface_state_info {
+ unsigned num_dwords;
+ unsigned ss_align; /* Required alignment of RENDER_SURFACE_STATE in bytes */
+ unsigned reloc_dw;
+ unsigned aux_reloc_dw;
+};
+
+static const struct surface_state_info surface_state_infos[] = {
+ [6] = {6, 32, 1, 0},
+ [7] = {8, 32, 1, 6},
+ [8] = {13, 64, 8, 10},
+ [9] = {16, 64, 8, 10},
+};
+
+static void
+blorp_emit_surface_state(struct blorp_batch *batch,
+ const struct brw_blorp_surface_info *surface,
+ uint32_t *state, uint32_t state_offset,
+ bool is_render_target)
+{
+ const struct surface_state_info ss_info = surface_state_infos[GEN_GEN];
+
+ struct isl_surf surf = surface->surf;
+
+ if (surf.dim == ISL_SURF_DIM_1D &&
+ surf.dim_layout == ISL_DIM_LAYOUT_GEN4_2D) {
+ assert(surf.logical_level0_px.height == 1);
+ surf.dim = ISL_SURF_DIM_2D;
+ }
+
+ /* Blorp doesn't support HiZ in any of the blit or slow-clear paths */
+ enum isl_aux_usage aux_usage = surface->aux_usage;
+ if (aux_usage == ISL_AUX_USAGE_HIZ)
+ aux_usage = ISL_AUX_USAGE_NONE;
+
+ const uint32_t mocs =
+ is_render_target ? batch->blorp->mocs.rb : batch->blorp->mocs.tex;
+
+ isl_surf_fill_state(batch->blorp->isl_dev, state,
+ .surf = &surf, .view = &surface->view,
+ .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
+ .mocs = mocs, .clear_color = surface->clear_color,
+ .x_offset_sa = surface->tile_x_sa,
+ .y_offset_sa = surface->tile_y_sa);
+
+ blorp_surface_reloc(batch, state_offset + ss_info.reloc_dw * 4,
+ surface->addr, 0);
+
+ if (aux_usage != ISL_AUX_USAGE_NONE) {
+ /* On gen7 and prior, the bottom 12 bits of the MCS base address are
+ * used to store other information. This should be ok, however, because
+ * surface buffer addresses are always 4K page aligned.
+ */
+ assert((surface->aux_addr.offset & 0xfff) == 0);
+ blorp_surface_reloc(batch, state_offset + ss_info.aux_reloc_dw * 4,
+ surface->aux_addr, state[ss_info.aux_reloc_dw]);
+ }
+}
+
+static void
+blorp_emit_surface_states(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+ uint32_t bind_offset, *bind_map;
+ void *surface_maps[2];
+
+ const unsigned ss_size = GENX(RENDER_SURFACE_STATE_length) * 4;
+ const unsigned ss_align = GENX(RENDER_SURFACE_STATE_length) > 8 ? 64 : 32;
+
+ unsigned num_surfaces = 1 + (params->src.addr.buffer != NULL);
+ blorp_alloc_binding_table(batch, num_surfaces, ss_size, ss_align,
+ &bind_offset, &bind_map, surface_maps);
+
+ blorp_emit_surface_state(batch, &params->dst,
+ surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
+ bind_map[BLORP_RENDERBUFFER_BT_INDEX], true);
+ if (params->src.addr.buffer) {
+ blorp_emit_surface_state(batch, &params->src,
+ surface_maps[BLORP_TEXTURE_BT_INDEX],
+ bind_map[BLORP_TEXTURE_BT_INDEX], false);
+ }
+
+#if GEN_GEN >= 7
+ blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), bt) {
+ bt.PointertoPSBindingTable = bind_offset;
+ }
+#else
+ blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), bt) {
+ bt.PSBindingTableChange = true;
+ bt.PointertoPSBindingTable = bind_offset;
+ }
+#endif
+}
+
+static void
+blorp_emit_sampler_state(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+ struct GENX(SAMPLER_STATE) sampler = {
+ .MipModeFilter = MIPFILTER_NONE,
+ .MagModeFilter = MAPFILTER_LINEAR,
+ .MinModeFilter = MAPFILTER_LINEAR,
+ .MinLOD = 0,
+ .MaxLOD = 0,
+ .TCXAddressControlMode = TCM_CLAMP,
+ .TCYAddressControlMode = TCM_CLAMP,
+ .TCZAddressControlMode = TCM_CLAMP,
+ .MaximumAnisotropy = RATIO21,
+ .RAddressMinFilterRoundingEnable = true,
+ .RAddressMagFilterRoundingEnable = true,
+ .VAddressMinFilterRoundingEnable = true,
+ .VAddressMagFilterRoundingEnable = true,
+ .UAddressMinFilterRoundingEnable = true,
+ .UAddressMagFilterRoundingEnable = true,
+ .NonnormalizedCoordinateEnable = true,
+ };
+
+ uint32_t offset;
+ void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_SAMPLER_STATE,
+ GENX(SAMPLER_STATE_length) * 4,
+ 32, &offset);
+ GENX(SAMPLER_STATE_pack)(NULL, state, &sampler);
+
+#if GEN_GEN >= 7
+ blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_PS), ssp) {
+ ssp.PointertoPSSamplerState = offset;
+ }
+#else
+ blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ssp) {
+ ssp.VSSamplerStateChange = true;
+ ssp.GSSamplerStateChange = true;
+ ssp.PSSamplerStateChange = true;
+ ssp.PointertoPSSamplerState = offset;
+ }
+#endif
+}
+
+/* 3DSTATE_VIEWPORT_STATE_POINTERS */
+static void
+blorp_emit_viewport_state(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+ uint32_t cc_vp_offset;
+
+ void *state = blorp_alloc_dynamic_state(batch, AUB_TRACE_CC_VP_STATE,
+ GENX(CC_VIEWPORT_length) * 4, 32,
+ &cc_vp_offset);
+
+ GENX(CC_VIEWPORT_pack)(batch, state,
+ &(struct GENX(CC_VIEWPORT)) {
+ .MinimumDepth = 0.0,
+ .MaximumDepth = 1.0,
+ });
+
+#if GEN_GEN >= 7
+ blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), vsp) {
+ vsp.CCViewportPointer = cc_vp_offset;
+ }
+#else
+ blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vsp) {
+ vsp.CCViewportStateChange = true;
+ vsp.PointertoCC_VIEWPORT = cc_vp_offset;
+ }
+#endif
+}
+
+
+/**
+ * \brief Execute a blit or render pass operation.
+ *
+ * To execute the operation, this function manually constructs and emits a
+ * batch to draw a rectangle primitive. The batchbuffer is flushed before
+ * constructing and after emitting the batch.
+ *
+ * This function alters no GL state.
+ */
+static void
+blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
+{
+ uint32_t blend_state_offset = 0;
+ uint32_t color_calc_state_offset = 0;
+ uint32_t depth_stencil_state_offset;
+
+ blorp_emit_vertex_buffers(batch, params);
+ blorp_emit_vertex_elements(batch, params);
+
+ emit_urb_config(batch, params);
+
+ if (params->wm_prog_data) {
+ blend_state_offset = blorp_emit_blend_state(batch, params);
+ color_calc_state_offset = blorp_emit_color_calc_state(batch, params);
+ }
+ depth_stencil_state_offset = blorp_emit_depth_stencil_state(batch, params);
+
+#if GEN_GEN <= 6
+ /* 3DSTATE_CC_STATE_POINTERS
+ *
+ * The pointer offsets are relative to
+ * CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
+ *
+ * The HiZ op doesn't use BLEND_STATE or COLOR_CALC_STATE.
+ *
+ * The dynamic state emit helpers emit their own STATE_POINTERS packets on
+ * gen7+. However, on gen6 and earlier, they're all lumped together in
+ * one CC_STATE_POINTERS packet so we have to emit that here.
+ */
+ blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), cc) {
+ cc.BLEND_STATEChange = true;
+ cc.COLOR_CALC_STATEChange = true;
+ cc.DEPTH_STENCIL_STATEChange = true;
+ cc.PointertoBLEND_STATE = blend_state_offset;
+ cc.PointertoCOLOR_CALC_STATE = color_calc_state_offset;
+ cc.PointertoDEPTH_STENCIL_STATE = depth_stencil_state_offset;
+ }
+#else
+ (void)blend_state_offset;
+ (void)color_calc_state_offset;
+ (void)depth_stencil_state_offset;
+#endif
+
+ blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), vs);
+#if GEN_GEN >= 7
+ blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), hs);
+ blorp_emit(batch, GENX(3DSTATE_CONSTANT_DS), DS);
+#endif
+ blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), gs);
+ blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), ps);
+
+ if (params->wm_prog_data)
+ blorp_emit_surface_states(batch, params);
+
+ if (params->src.addr.buffer)
+ blorp_emit_sampler_state(batch, params);
+
+ blorp_emit_3dstate_multisample(batch, params->dst.surf.samples);
+
+ blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
+ mask.SampleMask = (1 << params->dst.surf.samples) - 1;
+ }
+
+ /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
+ * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
+ *
+ * [DevSNB] A pipeline flush must be programmed prior to a
+ * 3DSTATE_VS command that causes the VS Function Enable to
+ * toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
+ * command with CS stall bit set and a post sync operation.
+ *
+ * We've already done one at the start of the BLORP operation.
+ */
+ blorp_emit(batch, GENX(3DSTATE_VS), vs);
+#if GEN_GEN >= 7
+ blorp_emit(batch, GENX(3DSTATE_HS), hs);
+ blorp_emit(batch, GENX(3DSTATE_TE), te);
+ blorp_emit(batch, GENX(3DSTATE_DS), DS);
+ blorp_emit(batch, GENX(3DSTATE_STREAMOUT), so);
+#endif
+ blorp_emit(batch, GENX(3DSTATE_GS), gs);
+
+ blorp_emit(batch, GENX(3DSTATE_CLIP), clip) {
+ clip.PerspectiveDivideDisable = true;
+ }
+
+ blorp_emit_sf_config(batch, params);
+ blorp_emit_ps_config(batch, params);
+
+ blorp_emit_viewport_state(batch, params);
+
+ if (params->depth.addr.buffer) {
+ blorp_emit_depth_stencil_config(batch, params);
+ } else {
+ blorp_emit(batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
+ db.SurfaceType = SURFTYPE_NULL;
+ db.SurfaceFormat = D32_FLOAT;
+ }
+ blorp_emit(batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hiz);
+ blorp_emit(batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
+ }
+
+ /* 3DSTATE_CLEAR_PARAMS
+ *
+ * From the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE_CLEAR_PARAMS:
+ * [DevSNB] 3DSTATE_CLEAR_PARAMS packet must follow the DEPTH_BUFFER_STATE
+ * packet when HiZ is enabled and the DEPTH_BUFFER_STATE changes.
+ */
+ blorp_emit(batch, GENX(3DSTATE_CLEAR_PARAMS), clear) {
+ clear.DepthClearValueValid = true;
+ clear.DepthClearValue = params->depth.clear_color.u32[0];
+ }
+
+ blorp_emit(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
+ rect.ClippedDrawingRectangleXMax = MAX2(params->x1, params->x0) - 1;
+ rect.ClippedDrawingRectangleYMax = MAX2(params->y1, params->y0) - 1;
+ }
+
+ blorp_emit(batch, GENX(3DPRIMITIVE), prim) {
+ prim.VertexAccessType = SEQUENTIAL;
+ prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
+ prim.VertexCountPerInstance = 3;
+ prim.InstanceCount = params->num_layers;
+ }
+}
diff --git a/src/intel/blorp/blorp_priv.h b/src/intel/blorp/blorp_priv.h
new file mode 100644
index 00000000000..33f197b523d
--- /dev/null
+++ b/src/intel/blorp/blorp_priv.h
@@ -0,0 +1,291 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "compiler/nir/nir.h"
+#include "brw_compiler.h"
+
+#include "blorp.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Binding table indices used by BLORP.
+ */
+enum {
+ BLORP_RENDERBUFFER_BT_INDEX,
+ BLORP_TEXTURE_BT_INDEX,
+ BLORP_NUM_BT_ENTRIES
+};
+
+enum blorp_fast_clear_op {
+ BLORP_FAST_CLEAR_OP_NONE = 0,
+ BLORP_FAST_CLEAR_OP_CLEAR,
+ BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL,
+ BLORP_FAST_CLEAR_OP_RESOLVE_FULL,
+};
+
+struct brw_blorp_surface_info
+{
+ struct isl_surf surf;
+ struct blorp_address addr;
+
+ struct isl_surf aux_surf;
+ struct blorp_address aux_addr;
+ enum isl_aux_usage aux_usage;
+
+ union isl_color_value clear_color;
+
+ struct isl_view view;
+
+ /* Z offset into a 3-D texture or slice of a 2-D array texture. */
+ uint32_t z_offset;
+
+ uint32_t tile_x_sa, tile_y_sa;
+};
+
+void
+brw_blorp_surface_info_init(struct blorp_context *blorp,
+ struct brw_blorp_surface_info *info,
+ const struct blorp_surf *surf,
+ unsigned int level, unsigned int layer,
+ enum isl_format format, bool is_render_target);
+
+
+struct brw_blorp_coord_transform
+{
+ float multiplier;
+ float offset;
+};
+
+/**
+ * Bounding rectangle telling pixel discard which pixels are not to be
+ * touched. This is needed when surfaces are configured as something other
+ * than what they really are:
+ *
+ * - writing W-tiled stencil as Y-tiled
+ * - writing interleaved multisampled as single sampled.
+ *
+ * See blorp_nir_discard_if_outside_rect().
+ */
+struct brw_blorp_discard_rect
+{
+ uint32_t x0;
+ uint32_t x1;
+ uint32_t y0;
+ uint32_t y1;
+};
+
+/**
+ * Grid needed for blended and scaled blits of integer formats, see
+ * blorp_nir_manual_blend_bilinear().
+ */
+struct brw_blorp_rect_grid
+{
+ float x1;
+ float y1;
+ float pad[2];
+};
+
+struct brw_blorp_wm_inputs
+{
+ struct brw_blorp_discard_rect discard_rect;
+ struct brw_blorp_rect_grid rect_grid;
+ struct brw_blorp_coord_transform coord_transform[2];
+
+ /* Minimum layer setting works for all the textures types but texture_3d
+ * for which the setting has no effect. Use the z-coordinate instead.
+ */
+ uint32_t src_z;
+
+ /* Pad out to an integral number of registers */
+ uint32_t pad[3];
+};
+
+struct brw_blorp_prog_data
+{
+ bool dispatch_8;
+ bool dispatch_16;
+
+ uint8_t first_curbe_grf_0;
+ uint8_t first_curbe_grf_2;
+
+ uint32_t ksp_offset_2;
+
+ /**
+ * True if the WM program should be run in MSDISPMODE_PERSAMPLE with more
+ * than one sample per pixel.
+ */
+ bool persample_msaa_dispatch;
+
+ /**
+ * Mask of which FS inputs are marked flat by the shader source. This is
+ * needed for setting up 3DSTATE_SF/SBE.
+ */
+ uint32_t flat_inputs;
+ unsigned num_varying_inputs;
+ uint64_t inputs_read;
+};
+
+static inline unsigned
+brw_blorp_get_urb_length(const struct brw_blorp_prog_data *prog_data)
+{
+ if (prog_data == NULL)
+ return 1;
+
+ /* From the BSpec: 3D Pipeline - Strips and Fans - 3DSTATE_SBE
+ *
+ * read_length = ceiling((max_source_attr+1)/2)
+ */
+ return MAX2((prog_data->num_varying_inputs + 1) / 2, 1);
+}
+
+struct blorp_params
+{
+ uint32_t x0;
+ uint32_t y0;
+ uint32_t x1;
+ uint32_t y1;
+ struct brw_blorp_surface_info depth;
+ uint32_t depth_format;
+ struct brw_blorp_surface_info src;
+ struct brw_blorp_surface_info dst;
+ enum blorp_hiz_op hiz_op;
+ enum blorp_fast_clear_op fast_clear_op;
+ bool color_write_disable[4];
+ struct brw_blorp_wm_inputs wm_inputs;
+ unsigned num_draw_buffers;
+ unsigned num_layers;
+ uint32_t wm_prog_kernel;
+ struct brw_blorp_prog_data *wm_prog_data;
+};
+
+void blorp_params_init(struct blorp_params *params);
+
+struct brw_blorp_blit_prog_key
+{
+ /* Number of samples per pixel that have been configured in the surface
+ * state for texturing from.
+ */
+ unsigned tex_samples;
+
+ /* MSAA layout that has been configured in the surface state for texturing
+ * from.
+ */
+ enum isl_msaa_layout tex_layout;
+
+ enum isl_aux_usage tex_aux_usage;
+
+ /* Actual number of samples per pixel in the source image. */
+ unsigned src_samples;
+
+ /* Actual MSAA layout used by the source image. */
+ enum isl_msaa_layout src_layout;
+
+ /* Number of samples per pixel that have been configured in the render
+ * target.
+ */
+ unsigned rt_samples;
+
+ /* MSAA layout that has been configured in the render target. */
+ enum isl_msaa_layout rt_layout;
+
+ /* Actual number of samples per pixel in the destination image. */
+ unsigned dst_samples;
+
+ /* Actual MSAA layout used by the destination image. */
+ enum isl_msaa_layout dst_layout;
+
+ /* Type of the data to be read from the texture (one of
+ * nir_type_(int|uint|float)).
+ */
+ nir_alu_type texture_data_type;
+
+ /* True if the source image is W tiled. If true, the surface state for the
+ * source image must be configured as Y tiled, and tex_samples must be 0.
+ */
+ bool src_tiled_w;
+
+ /* True if the destination image is W tiled. If true, the surface state
+ * for the render target must be configured as Y tiled, and rt_samples must
+ * be 0.
+ */
+ bool dst_tiled_w;
+
+ /* True if all source samples should be blended together to produce each
+ * destination pixel. If true, src_tiled_w must be false, tex_samples must
+ * equal src_samples, and tex_samples must be nonzero.
+ */
+ bool blend;
+
+ /* True if the rectangle being sent through the rendering pipeline might be
+ * larger than the destination rectangle, so the WM program should kill any
+ * pixels that are outside the destination rectangle.
+ */
+ bool use_kill;
+
+ /**
+ * True if the WM program should be run in MSDISPMODE_PERSAMPLE with more
+ * than one sample per pixel.
+ */
+ bool persample_msaa_dispatch;
+
+ /* True for scaled blitting. */
+ bool blit_scaled;
+
+ /* Scale factors between the pixel grid and the grid of samples. We're
+ * using a grid of samples for bilinear filtering in multisample scaled blits.
+ */
+ float x_scale;
+ float y_scale;
+
+ /* True for blits with filter = GL_LINEAR. */
+ bool bilinear_filter;
+};
+
+/**
+ * \name BLORP internals
+ * \{
+ *
+ * Used internally by gen6_blorp_exec() and gen7_blorp_exec().
+ */
+
+void brw_blorp_init_wm_prog_key(struct brw_wm_prog_key *wm_key);
+
+const unsigned *
+brw_blorp_compile_nir_shader(struct blorp_context *blorp, struct nir_shader *nir,
+ const struct brw_wm_prog_key *wm_key,
+ bool use_repclear,
+ struct brw_blorp_prog_data *prog_data,
+ unsigned *program_size);
+
+/** \} */
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif /* __cplusplus */