summaryrefslogtreecommitdiffstats
path: root/src/intel/compiler
diff options
context:
space:
mode:
authorJason Ekstrand <[email protected]>2017-02-28 09:10:43 -0800
committerEmil Velikov <[email protected]>2017-03-13 11:16:34 +0000
commit700bebb958e93f4d472c383de62ced9db8e64bec (patch)
tree0075c098c56c338f38ba0db80b9dba3e7e268a17 /src/intel/compiler
parentd0d4a5f43b4dd79bd7bfff7c7deaade10bfebf7c (diff)
i965: Move the back-end compiler to src/intel/compiler
Mostly a dummy git mv with a couple of noticeable parts: - With the earlier header cleanups, nothing in src/intel depends on files from src/mesa/drivers/dri/i965/ - Both Autoconf and Android builds are addressed. Thanks to Mauro and Tapani for the fixups in the latter - brw_util.[ch] is not really compiler specific, so it's moved to i965. v2: - move brw_eu_defines.h instead of brw_defines.h - remove no-longer applicable includes - add missing vulkan/ prefix in the Android build (thanks Tapani) v3: - don't list brw_defines.h in src/intel/Makefile.sources (Jason) - rebase on top of the oa patches [Emil Velikov: commit message, various small fixes throughout] Signed-off-by: Emil Velikov <[email protected]> Reviewed-by: Jason Ekstrand <[email protected]>
Diffstat (limited to 'src/intel/compiler')
-rw-r--r--src/intel/compiler/.gitignore10
-rw-r--r--src/intel/compiler/brw_cfg.cpp531
-rw-r--r--src/intel/compiler/brw_cfg.h358
-rw-r--r--src/intel/compiler/brw_compiler.c160
-rw-r--r--src/intel/compiler/brw_compiler.h1057
-rw-r--r--src/intel/compiler/brw_dead_control_flow.cpp119
-rw-r--r--src/intel/compiler/brw_dead_control_flow.h26
-rw-r--r--src/intel/compiler/brw_disasm.c1646
-rw-r--r--src/intel/compiler/brw_eu.c719
-rw-r--r--src/intel/compiler/brw_eu.h612
-rw-r--r--src/intel/compiler/brw_eu_compact.c1579
-rw-r--r--src/intel/compiler/brw_eu_defines.h1246
-rw-r--r--src/intel/compiler/brw_eu_emit.c3675
-rw-r--r--src/intel/compiler/brw_eu_util.c123
-rw-r--r--src/intel/compiler/brw_eu_validate.c1051
-rw-r--r--src/intel/compiler/brw_fs.cpp6805
-rw-r--r--src/intel/compiler/brw_fs.h500
-rw-r--r--src/intel/compiler/brw_fs_builder.h662
-rw-r--r--src/intel/compiler/brw_fs_cmod_propagation.cpp183
-rw-r--r--src/intel/compiler/brw_fs_combine_constants.cpp329
-rw-r--r--src/intel/compiler/brw_fs_copy_propagation.cpp869
-rw-r--r--src/intel/compiler/brw_fs_cse.cpp380
-rw-r--r--src/intel/compiler/brw_fs_dead_code_eliminate.cpp148
-rw-r--r--src/intel/compiler/brw_fs_generator.cpp2126
-rw-r--r--src/intel/compiler/brw_fs_live_variables.cpp334
-rw-r--r--src/intel/compiler/brw_fs_live_variables.h115
-rw-r--r--src/intel/compiler/brw_fs_lower_d2x.cpp78
-rw-r--r--src/intel/compiler/brw_fs_lower_pack.cpp55
-rw-r--r--src/intel/compiler/brw_fs_nir.cpp4679
-rw-r--r--src/intel/compiler/brw_fs_reg_allocate.cpp992
-rw-r--r--src/intel/compiler/brw_fs_register_coalesce.cpp295
-rw-r--r--src/intel/compiler/brw_fs_saturate_propagation.cpp156
-rw-r--r--src/intel/compiler/brw_fs_sel_peephole.cpp220
-rw-r--r--src/intel/compiler/brw_fs_surface_builder.cpp1194
-rw-r--r--src/intel/compiler/brw_fs_surface_builder.h88
-rw-r--r--src/intel/compiler/brw_fs_validate.cpp57
-rw-r--r--src/intel/compiler/brw_fs_visitor.cpp953
-rw-r--r--src/intel/compiler/brw_inst.h866
-rw-r--r--src/intel/compiler/brw_interpolation_map.c109
-rw-r--r--src/intel/compiler/brw_ir_allocator.h87
-rw-r--r--src/intel/compiler/brw_ir_fs.h451
-rw-r--r--src/intel/compiler/brw_ir_vec4.h409
-rw-r--r--src/intel/compiler/brw_nir.c764
-rw-r--r--src/intel/compiler/brw_nir.h154
-rw-r--r--src/intel/compiler/brw_nir_analyze_boolean_resolves.c269
-rw-r--r--src/intel/compiler/brw_nir_attribute_workarounds.c176
-rw-r--r--src/intel/compiler/brw_nir_intrinsics.c186
-rw-r--r--src/intel/compiler/brw_nir_opt_peephole_ffma.c297
-rw-r--r--src/intel/compiler/brw_nir_tcs_workarounds.c152
-rw-r--r--src/intel/compiler/brw_nir_trig_workarounds.py43
-rw-r--r--src/intel/compiler/brw_packed_float.c75
-rw-r--r--src/intel/compiler/brw_predicated_break.cpp148
-rw-r--r--src/intel/compiler/brw_reg.h1135
-rw-r--r--src/intel/compiler/brw_schedule_instructions.cpp1753
-rw-r--r--src/intel/compiler/brw_shader.cpp1273
-rw-r--r--src/intel/compiler/brw_shader.h295
-rw-r--r--src/intel/compiler/brw_vec4.cpp2851
-rw-r--r--src/intel/compiler/brw_vec4.h399
-rw-r--r--src/intel/compiler/brw_vec4_builder.h634
-rw-r--r--src/intel/compiler/brw_vec4_cmod_propagation.cpp172
-rw-r--r--src/intel/compiler/brw_vec4_copy_propagation.cpp558
-rw-r--r--src/intel/compiler/brw_vec4_cse.cpp296
-rw-r--r--src/intel/compiler/brw_vec4_dead_code_eliminate.cpp160
-rw-r--r--src/intel/compiler/brw_vec4_generator.cpp2217
-rw-r--r--src/intel/compiler/brw_vec4_gs_nir.cpp145
-rw-r--r--src/intel/compiler/brw_vec4_gs_visitor.cpp933
-rw-r--r--src/intel/compiler/brw_vec4_gs_visitor.h81
-rw-r--r--src/intel/compiler/brw_vec4_live_variables.cpp343
-rw-r--r--src/intel/compiler/brw_vec4_live_variables.h112
-rw-r--r--src/intel/compiler/brw_vec4_nir.cpp2407
-rw-r--r--src/intel/compiler/brw_vec4_reg_allocate.cpp558
-rw-r--r--src/intel/compiler/brw_vec4_surface_builder.cpp332
-rw-r--r--src/intel/compiler/brw_vec4_surface_builder.h69
-rw-r--r--src/intel/compiler/brw_vec4_tcs.cpp516
-rw-r--r--src/intel/compiler/brw_vec4_tcs.h88
-rw-r--r--src/intel/compiler/brw_vec4_tes.cpp296
-rw-r--r--src/intel/compiler/brw_vec4_tes.h68
-rw-r--r--src/intel/compiler/brw_vec4_visitor.cpp1917
-rw-r--r--src/intel/compiler/brw_vec4_vs.h68
-rw-r--r--src/intel/compiler/brw_vec4_vs_visitor.cpp221
-rw-r--r--src/intel/compiler/brw_vue_map.c307
-rw-r--r--src/intel/compiler/brw_wm_iz.cpp169
-rw-r--r--src/intel/compiler/gen6_gs_visitor.cpp753
-rw-r--r--src/intel/compiler/gen6_gs_visitor.h91
-rw-r--r--src/intel/compiler/intel_asm_annotation.c198
-rw-r--r--src/intel/compiler/intel_asm_annotation.h80
-rw-r--r--src/intel/compiler/test_eu_compact.c300
-rw-r--r--src/intel/compiler/test_eu_validate.cpp847
-rw-r--r--src/intel/compiler/test_fs_cmod_propagation.cpp556
-rw-r--r--src/intel/compiler/test_fs_copy_propagation.cpp213
-rw-r--r--src/intel/compiler/test_fs_saturate_propagation.cpp600
-rw-r--r--src/intel/compiler/test_vec4_cmod_propagation.cpp823
-rw-r--r--src/intel/compiler/test_vec4_copy_propagation.cpp181
-rw-r--r--src/intel/compiler/test_vec4_register_coalesce.cpp242
-rw-r--r--src/intel/compiler/test_vf_float_conversions.cpp110
95 files changed, 63683 insertions, 0 deletions
diff --git a/src/intel/compiler/.gitignore b/src/intel/compiler/.gitignore
new file mode 100644
index 00000000000..e844421b336
--- /dev/null
+++ b/src/intel/compiler/.gitignore
@@ -0,0 +1,10 @@
+brw_nir_trig_workarounds.c
+test_eu_compact
+test_eu_validate
+test_fs_cmod_propagation
+test_fs_copy_propagation
+test_fs_saturate_propagation
+test_vec4_cmod_propagation
+test_vec4_copy_propagation
+test_vec4_register_coalesce
+test_vf_float_conversions
diff --git a/src/intel/compiler/brw_cfg.cpp b/src/intel/compiler/brw_cfg.cpp
new file mode 100644
index 00000000000..fad12eec588
--- /dev/null
+++ b/src/intel/compiler/brw_cfg.cpp
@@ -0,0 +1,531 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Eric Anholt <[email protected]>
+ *
+ */
+
+#include "brw_cfg.h"
+
+/** @file brw_cfg.cpp
+ *
+ * Walks the shader instructions generated and creates a set of basic
+ * blocks with successor/predecessor edges connecting them.
+ */
+
+/* Detach the tail entry of list and return the block it refers to (which
+ * may be NULL, since NULL blocks are pushed for the outermost scope).
+ */
+static bblock_t *
+pop_stack(exec_list *list)
+{
+   /* Entries are pushed at the tail, so the tail is the top of the stack. */
+   bblock_link *const top = (bblock_link *)list->get_tail();
+   bblock_t *const popped = top->block;
+
+   top->link.remove();
+   return popped;
+}
+
+/* Allocate a bblock_link wrapping block out of mem_ctx and hand back the
+ * list node embedded in it, ready to be pushed onto an exec_list.
+ */
+static exec_node *
+link(void *mem_ctx, bblock_t *block)
+{
+   return &(new(mem_ctx) bblock_link(block))->link;
+}
+
+/* Construct an empty basic block belonging to cfg: no instructions, no
+ * incoming or outgoing edges, and every numeric field zeroed.
+ */
+bblock_t::bblock_t(cfg_t *cfg)
+   : cfg(cfg),
+     idom(NULL),
+     start_ip(0),
+     end_ip(0),
+     num(0),
+     cycle_count(0)
+{
+   children.make_empty();
+   parents.make_empty();
+   instructions.make_empty();
+}
+
+/* Record the CFG edge this -> successor.  A link to this block is appended
+ * to successor's parent list, and a link to successor is appended to this
+ * block's child list; both links are allocated from mem_ctx.
+ */
+void
+bblock_t::add_successor(void *mem_ctx, bblock_t *successor)
+{
+   exec_node *const parent_node = ::link(mem_ctx, this);
+   successor->parents.push_tail(parent_node);
+
+   exec_node *const child_node = ::link(mem_ctx, successor);
+   children.push_tail(child_node);
+}
+
+/* Return true when this block appears in block's list of parents. */
+bool
+bblock_t::is_predecessor_of(const bblock_t *block) const
+{
+   foreach_list_typed_safe (bblock_link, entry, link, &block->parents) {
+      if (entry->block != this)
+         continue;
+
+      return true;
+   }
+
+   return false;
+}
+
+/* Return true when this block appears in block's list of children. */
+bool
+bblock_t::is_successor_of(const bblock_t *block) const
+{
+   foreach_list_typed_safe (bblock_link, entry, link, &block->children) {
+      if (entry->block != this)
+         continue;
+
+      return true;
+   }
+
+   return false;
+}
+
+/* True for the control-flow opcodes that must be the last instruction of a
+ * basic block: IF, ELSE, CONTINUE, BREAK and WHILE.
+ */
+static bool
+ends_block(const backend_instruction *inst)
+{
+   switch (inst->opcode) {
+   case BRW_OPCODE_IF:
+   case BRW_OPCODE_ELSE:
+   case BRW_OPCODE_CONTINUE:
+   case BRW_OPCODE_BREAK:
+   case BRW_OPCODE_WHILE:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/* True for the control-flow opcodes that must be the first instruction of a
+ * basic block: DO and ENDIF.
+ */
+static bool
+starts_block(const backend_instruction *inst)
+{
+   switch (inst->opcode) {
+   case BRW_OPCODE_DO:
+   case BRW_OPCODE_ENDIF:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/* Two blocks can be merged when that immediately follows this block in the
+ * block list, this block does not end in a block-terminating opcode, and
+ * that does not begin with a block-starting opcode.
+ */
+bool
+bblock_t::can_combine_with(const bblock_t *that) const
+{
+   const bool adjacent = (const bblock_t *)this->link.next == that;
+
+   return adjacent &&
+          !ends_block(this->end()) &&
+          !starts_block(that->start());
+}
+
+/* Merge that into this block: this block takes over that's instructions and
+ * end_ip, and that is then removed from the CFG.  The caller must ensure
+ * can_combine_with(that) holds.
+ */
+void
+bblock_t::combine_with(bblock_t *that)
+{
+   assert(this->can_combine_with(that));
+
+   /* Sanity checks: every edge out of this block leads to that, and every
+    * edge into that comes from this block.
+    */
+   foreach_list_typed (bblock_link, child, link, &this->children) {
+      assert(child->block == that);
+   }
+   foreach_list_typed (bblock_link, parent, link, &that->parents) {
+      assert(parent->block == this);
+   }
+
+   this->instructions.append_list(&that->instructions);
+   this->end_ip = that->end_ip;
+
+   this->cfg->remove_block(that);
+}
+
+/* Print every instruction of this block to stderr, each one prefixed with
+ * its ip (counting up from start_ip), using s to format the instruction.
+ */
+void
+bblock_t::dump(backend_shader *s) const
+{
+   int ip = this->start_ip;
+
+   foreach_inst_in_block(backend_instruction, inst, this) {
+      fprintf(stderr, "%5d: ", ip++);
+      s->dump_instruction(inst);
+   }
+}
+
+/* Build the control flow graph for the given instruction list.
+ *
+ * Every instruction is removed from `instructions` and appended to the basic
+ * block it belongs to, so the input list is consumed.  Blocks are split at
+ * IF/ELSE/ENDIF/DO/CONTINUE/BREAK/WHILE and connected with successor and
+ * predecessor edges; nesting is tracked with explicit stacks.
+ */
+cfg_t::cfg_t(exec_list *instructions)
+{
+ mem_ctx = ralloc_context(NULL);
+ block_list.make_empty();
+ blocks = NULL;
+ num_blocks = 0;
+ idom_dirty = true;
+ cycle_count = 0;
+
+ bblock_t *cur = NULL;
+ int ip = 0;
+
+ bblock_t *entry = new_block();
+ bblock_t *cur_if = NULL; /**< BB ending with IF. */
+ bblock_t *cur_else = NULL; /**< BB ending with ELSE. */
+ bblock_t *cur_endif = NULL; /**< BB starting with ENDIF. */
+ bblock_t *cur_do = NULL; /**< BB starting with DO. */
+ bblock_t *cur_while = NULL; /**< BB immediately following WHILE. */
+ exec_list if_stack, else_stack, do_stack, while_stack;
+ bblock_t *next;
+
+ set_next_block(&cur, entry, ip);
+
+ foreach_in_list_safe(backend_instruction, inst, instructions) {
+ /* set_next_block wants the post-incremented ip */
+ ip++;
+
+ /* Take the instruction off the input list; it is re-homed in a block. */
+ inst->exec_node::remove();
+
+ switch (inst->opcode) {
+ case BRW_OPCODE_IF:
+ cur->instructions.push_tail(inst);
+
+ /* Push our information onto a stack so we can recover from
+ * nested ifs.
+ */
+ if_stack.push_tail(link(mem_ctx, cur_if));
+ else_stack.push_tail(link(mem_ctx, cur_else));
+
+ cur_if = cur;
+ cur_else = NULL;
+ cur_endif = NULL;
+
+ /* Set up our immediately following block, full of "then"
+ * instructions.
+ */
+ next = new_block();
+ cur_if->add_successor(mem_ctx, next);
+
+ set_next_block(&cur, next, ip);
+ break;
+
+ case BRW_OPCODE_ELSE:
+ cur->instructions.push_tail(inst);
+
+ cur_else = cur;
+
+ /* The block following the ELSE is the second successor of the IF. */
+ next = new_block();
+ assert(cur_if != NULL);
+ cur_if->add_successor(mem_ctx, next);
+
+ set_next_block(&cur, next, ip);
+ break;
+
+ case BRW_OPCODE_ENDIF: {
+ if (cur->instructions.is_empty()) {
+ /* New block was just created; use it. */
+ cur_endif = cur;
+ } else {
+ cur_endif = new_block();
+
+ cur->add_successor(mem_ctx, cur_endif);
+
+ /* ip was already incremented for the ENDIF, which is the first
+ * instruction of the new block, hence ip - 1.
+ */
+ set_next_block(&cur, cur_endif, ip - 1);
+ }
+
+ cur->instructions.push_tail(inst);
+
+ /* With an ELSE, the block ending in ELSE jumps to the ENDIF block;
+ * without one, the block ending in IF does.
+ */
+ if (cur_else) {
+ cur_else->add_successor(mem_ctx, cur_endif);
+ } else {
+ assert(cur_if != NULL);
+ cur_if->add_successor(mem_ctx, cur_endif);
+ }
+
+ assert(cur_if->end()->opcode == BRW_OPCODE_IF);
+ assert(!cur_else || cur_else->end()->opcode == BRW_OPCODE_ELSE);
+
+ /* Pop the stack so we're in the previous if/else/endif */
+ cur_if = pop_stack(&if_stack);
+ cur_else = pop_stack(&else_stack);
+ break;
+ }
+ case BRW_OPCODE_DO:
+ /* Push our information onto a stack so we can recover from
+ * nested loops.
+ */
+ do_stack.push_tail(link(mem_ctx, cur_do));
+ while_stack.push_tail(link(mem_ctx, cur_while));
+
+ /* Set up the block just after the while. Don't know when exactly
+ * it will start, yet.
+ */
+ cur_while = new_block();
+
+ if (cur->instructions.is_empty()) {
+ /* New block was just created; use it. */
+ cur_do = cur;
+ } else {
+ cur_do = new_block();
+
+ cur->add_successor(mem_ctx, cur_do);
+
+ /* The DO itself is the first instruction of the new block. */
+ set_next_block(&cur, cur_do, ip - 1);
+ }
+
+ cur->instructions.push_tail(inst);
+ break;
+
+ case BRW_OPCODE_CONTINUE:
+ cur->instructions.push_tail(inst);
+
+ assert(cur_do != NULL);
+ cur->add_successor(mem_ctx, cur_do);
+
+ /* A new block always follows, but it only gets an edge from this one
+ * when the CONTINUE is predicated.
+ */
+ next = new_block();
+ if (inst->predicate)
+ cur->add_successor(mem_ctx, next);
+
+ set_next_block(&cur, next, ip);
+ break;
+
+ case BRW_OPCODE_BREAK:
+ cur->instructions.push_tail(inst);
+
+ assert(cur_while != NULL);
+ cur->add_successor(mem_ctx, cur_while);
+
+ /* As for CONTINUE: the fall-through edge exists only when the BREAK
+ * is predicated.
+ */
+ next = new_block();
+ if (inst->predicate)
+ cur->add_successor(mem_ctx, next);
+
+ set_next_block(&cur, next, ip);
+ break;
+
+ case BRW_OPCODE_WHILE:
+ cur->instructions.push_tail(inst);
+
+ /* Edge back to the block that starts with the matching DO. */
+ assert(cur_do != NULL && cur_while != NULL);
+ cur->add_successor(mem_ctx, cur_do);
+
+ /* The loop-exit edge is only added for a predicated WHILE. */
+ if (inst->predicate)
+ cur->add_successor(mem_ctx, cur_while);
+
+ set_next_block(&cur, cur_while, ip);
+
+ /* Pop the stack so we're in the previous loop */
+ cur_do = pop_stack(&do_stack);
+ cur_while = pop_stack(&while_stack);
+ break;
+
+ default:
+ cur->instructions.push_tail(inst);
+ break;
+ }
+ }
+
+ /* Close the final block; set_next_block() only closes the previous one. */
+ cur->end_ip = ip - 1;
+
+ make_block_array();
+}
+
+/* Release the ralloc context created by the constructor. */
+cfg_t::~cfg_t()
+{
+ ralloc_free(mem_ctx);
+}
+
+/* Remove block from the CFG.
+ *
+ * Each predecessor of block is connected directly to each successor of block
+ * (without creating duplicate edges), the links that pointed at block are
+ * freed, block is unlinked from block_list, and the blocks[] array is
+ * compacted and renumbered.  The dominance information is marked dirty.
+ */
+void
+cfg_t::remove_block(bblock_t *block)
+{
+ foreach_list_typed_safe (bblock_link, predecessor, link, &block->parents) {
+ /* Remove block from all of its predecessors' successor lists. */
+ foreach_list_typed_safe (bblock_link, successor, link,
+ &predecessor->block->children) {
+ if (block == successor->block) {
+ successor->link.remove();
+ ralloc_free(successor);
+ }
+ }
+
+ /* Add removed-block's successors to its predecessors' successor lists. */
+ foreach_list_typed (bblock_link, successor, link, &block->children) {
+ if (!successor->block->is_successor_of(predecessor->block)) {
+ predecessor->block->children.push_tail(link(mem_ctx,
+ successor->block));
+ }
+ }
+ }
+
+ foreach_list_typed_safe (bblock_link, successor, link, &block->children) {
+ /* Remove block from all of its children's parents lists. */
+ foreach_list_typed_safe (bblock_link, predecessor, link,
+ &successor->block->parents) {
+ if (block == predecessor->block) {
+ predecessor->link.remove();
+ ralloc_free(predecessor);
+ }
+ }
+
+ /* Add removed-block's predecessors to its successors' predecessor lists. */
+ foreach_list_typed (bblock_link, predecessor, link, &block->parents) {
+ if (!predecessor->block->is_predecessor_of(successor->block)) {
+ successor->block->parents.push_tail(link(mem_ctx,
+ predecessor->block));
+ }
+ }
+ }
+
+ block->link.remove();
+
+ /* Close the gap in blocks[] and renumber every block that moved down. */
+ for (int b = block->num; b < this->num_blocks - 1; b++) {
+ this->blocks[b] = this->blocks[b + 1];
+ this->blocks[b]->num = b;
+ }
+
+ /* NOTE(review): after the shift the last slot either aliases the new last
+ * block (already renumbered above) or, when block was the last entry, is
+ * the removed block itself.  This write looks redundant -- confirm before
+ * changing it.
+ */
+ this->blocks[this->num_blocks - 1]->num = this->num_blocks - 2;
+ this->num_blocks--;
+ idom_dirty = true;
+}
+
+/* Allocate a fresh, empty basic block owned by this CFG out of mem_ctx.  The
+ * block is not numbered or added to block_list here; set_next_block() does
+ * that.
+ */
+bblock_t *
+cfg_t::new_block()
+{
+   return new(mem_ctx) bblock_t(this);
+}
+
+/* Make block the current block.
+ *
+ * The previously current block (if any) is closed with end_ip = ip - 1.
+ * block then starts at ip, receives the next block number, is appended to
+ * block_list, and *cur is updated to point at it.
+ */
+void
+cfg_t::set_next_block(bblock_t **cur, bblock_t *block, int ip)
+{
+   bblock_t *const finished = *cur;
+   if (finished != NULL)
+      finished->end_ip = ip - 1;
+
+   block->start_ip = ip;
+   block->num = num_blocks;
+   num_blocks++;
+   block_list.push_tail(&block->link);
+
+   *cur = block;
+}
+
+/* Allocate blocks[] from mem_ctx and fill it with the blocks of block_list
+ * in list order, so a block can be looked up by position.
+ */
+void
+cfg_t::make_block_array()
+{
+   blocks = ralloc_array(mem_ctx, bblock_t *, num_blocks);
+
+   int filled = 0;
+   foreach_block (block, this) {
+      blocks[filled] = block;
+      filled++;
+   }
+
+   assert(filled == num_blocks);
+}
+
+/* Print the CFG to stderr.  For each block this prints a START line with its
+ * immediate dominator and incoming edges, the block's instructions (only
+ * when s is non-NULL), and an END line with its outgoing edges.  Dominance
+ * information is recomputed first if it is stale.
+ */
+void
+cfg_t::dump(backend_shader *s)
+{
+   if (idom_dirty)
+      calculate_idom();
+
+   foreach_block (block, this) {
+      if (block->idom == NULL)
+         fprintf(stderr, "START B%d IDOM(none)", block->num);
+      else
+         fprintf(stderr, "START B%d IDOM(B%d)", block->num, block->idom->num);
+
+      foreach_list_typed(bblock_link, parent, link, &block->parents) {
+         fprintf(stderr, " <-B%d", parent->block->num);
+      }
+      fprintf(stderr, "\n");
+
+      if (s != NULL)
+         block->dump(s);
+
+      fprintf(stderr, "END B%d", block->num);
+      foreach_list_typed(bblock_link, child, link, &block->children) {
+         fprintf(stderr, " ->B%d", child->block->num);
+      }
+      fprintf(stderr, "\n");
+   }
+}
+
+/* Computes the immediate dominator of every block using the iterative
+ * algorithm from "A Simple, Fast Dominance Algorithm" by Keith D. Cooper,
+ * Timothy J. Harvey, and Ken Kennedy.
+ *
+ * The authors claim that for control flow graphs of the sizes normally
+ * encountered (fewer than 1000 nodes) this algorithm is significantly faster
+ * than alternatives such as Lengauer-Tarjan.
+ *
+ * Block 0 is its own dominator.  Every other block repeatedly takes the
+ * intersection of its already-processed parents until nothing changes.
+ */
+void
+cfg_t::calculate_idom()
+{
+   foreach_block(block, this) {
+      block->idom = NULL;
+   }
+   blocks[0]->idom = blocks[0];
+
+   bool changed;
+   do {
+      changed = false;
+
+      foreach_block(block, this) {
+         if (block->num == 0)
+            continue;
+
+         bblock_t *candidate = NULL;
+         foreach_list_typed(bblock_link, parent, link, &block->parents) {
+            bblock_t *const pred = parent->block;
+
+            /* Parents without a dominator yet carry no information. */
+            if (pred->idom == NULL)
+               continue;
+
+            if (candidate == NULL)
+               candidate = pred;
+            else
+               candidate = intersect(pred, candidate);
+         }
+
+         if (block->idom == candidate)
+            continue;
+
+         block->idom = candidate;
+         changed = true;
+      }
+   } while (changed);
+
+   idom_dirty = false;
+}
+
+/* Walk b1 and b2 up the dominator tree until they meet and return the block
+ * where they meet.
+ *
+ * Both blocks, and every block reached through their idom chains, must
+ * already have an immediate dominator assigned.  The assertions check each
+ * pointer before it is dereferenced, so a broken chain is reported by the
+ * assert instead of by a NULL dereference.
+ */
+bblock_t *
+cfg_t::intersect(bblock_t *b1, bblock_t *b2)
+{
+   assert(b1 != NULL && b2 != NULL);
+
+   /* Note, the comparisons here are the opposite of what the paper says
+    * because we index blocks from beginning -> end (i.e. reverse post-order)
+    * instead of post-order like they assume.
+    */
+   while (b1->num != b2->num) {
+      while (b1->num > b2->num) {
+         b1 = b1->idom;
+         assert(b1 != NULL);
+      }
+      while (b2->num > b1->num) {
+         b2 = b2->idom;
+         assert(b2 != NULL);
+      }
+   }
+
+   return b1;
+}
+
+/* Print the CFG edges to stdout as a Graphviz "digraph", one
+ * "parent -> child" line per successor edge, using block numbers as node
+ * names.
+ */
+void
+cfg_t::dump_cfg()
+{
+   printf("digraph CFG {\n");
+
+   for (int idx = 0; idx < num_blocks; idx++) {
+      bblock_t *const src = this->blocks[idx];
+
+      foreach_list_typed_safe (bblock_link, edge, link, &src->children) {
+         printf("\t%d -> %d\n", idx, edge->block->num);
+      }
+   }
+
+   printf("}\n");
+}
+
+/* Print the dominance tree to stdout as a Graphviz "digraph", with one
+ * "idom -> block" line for every block that has an immediate dominator.
+ * The currently stored idom pointers are used as-is; nothing is recomputed.
+ */
+void
+cfg_t::dump_domtree()
+{
+   printf("digraph DominanceTree {\n");
+
+   foreach_block(block, this) {
+      if (block->idom == NULL)
+         continue;
+
+      printf("\t%d -> %d\n", block->idom->num, block->num);
+   }
+
+   printf("}\n");
+}
diff --git a/src/intel/compiler/brw_cfg.h b/src/intel/compiler/brw_cfg.h
new file mode 100644
index 00000000000..b8af40f725f
--- /dev/null
+++ b/src/intel/compiler/brw_cfg.h
@@ -0,0 +1,358 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Eric Anholt <[email protected]>
+ *
+ */
+
+#pragma once
+#ifndef BRW_CFG_H
+#define BRW_CFG_H
+
+#include "brw_shader.h"
+
+struct bblock_t;
+
+/* List entry that refers to a basic block.  Used for the parent and child
+ * edge lists of a block and for the nesting stacks built while constructing
+ * the CFG.
+ */
+struct bblock_link {
+#ifdef __cplusplus
+ DECLARE_RALLOC_CXX_OPERATORS(bblock_link)
+
+ bblock_link(bblock_t *block)
+ : block(block)
+ {
+ }
+#endif
+
+ /* Must stay the first member: brw_cfg.cpp casts list nodes directly to
+ * bblock_link pointers.
+ */
+ struct exec_node link;
+ /* The block this entry refers to; may be NULL for nesting-stack entries. */
+ struct bblock_t *block;
+};
+
+struct backend_instruction;
+
+/* A basic block: a run of instructions plus its CFG edges. */
+struct bblock_t {
+#ifdef __cplusplus
+ DECLARE_RALLOC_CXX_OPERATORS(bblock_t)
+
+ explicit bblock_t(cfg_t *cfg);
+
+ /* Edge management and queries. */
+ void add_successor(void *mem_ctx, bblock_t *successor);
+ bool is_predecessor_of(const bblock_t *block) const;
+ bool is_successor_of(const bblock_t *block) const;
+ bool can_combine_with(const bblock_t *that) const;
+ void combine_with(bblock_t *that);
+ void dump(backend_shader *s) const;
+
+ /* First and last instruction of the block. */
+ backend_instruction *start();
+ const backend_instruction *start() const;
+ backend_instruction *end();
+ const backend_instruction *end() const;
+
+ /* Neighbouring blocks in the CFG's block list, or NULL at either end. */
+ bblock_t *next();
+ const bblock_t *next() const;
+ bblock_t *prev();
+ const bblock_t *prev() const;
+
+ bool starts_with_control_flow() const;
+ bool ends_with_control_flow() const;
+
+ backend_instruction *first_non_control_flow_inst();
+ backend_instruction *last_non_control_flow_inst();
+#endif
+
+ /* Node in cfg_t::block_list.  Must stay the first member: list nodes are
+ * cast directly to bblock_t pointers.
+ */
+ struct exec_node link;
+ /* The CFG that owns this block. */
+ struct cfg_t *cfg;
+ /* Immediate dominator, as computed by cfg_t::calculate_idom(). */
+ struct bblock_t *idom;
+
+ /* IP of the first and of the last instruction in the block. */
+ int start_ip;
+ int end_ip;
+
+ struct exec_list instructions;
+ /* Lists of bblock_link: incoming and outgoing edges. */
+ struct exec_list parents;
+ struct exec_list children;
+ /* Index of this block in cfg_t::blocks. */
+ int num;
+
+ /* NOTE(review): not written in this file apart from being zeroed by the
+ * constructor -- presumably filled in by the instruction scheduler; confirm.
+ */
+ unsigned cycle_count;
+};
+
+/* Return the head of block's instruction list. */
+static inline struct backend_instruction *
+bblock_start(struct bblock_t *block)
+{
+   struct exec_list *insts = &block->instructions;
+
+   return (struct backend_instruction *)exec_list_get_head(insts);
+}
+
+/* Const variant of bblock_start(). */
+static inline const struct backend_instruction *
+bblock_start_const(const struct bblock_t *block)
+{
+   const struct exec_list *insts = &block->instructions;
+
+   return (const struct backend_instruction *)exec_list_get_head_const(insts);
+}
+
+/* Return the tail of block's instruction list. */
+static inline struct backend_instruction *
+bblock_end(struct bblock_t *block)
+{
+   struct exec_list *insts = &block->instructions;
+
+   return (struct backend_instruction *)exec_list_get_tail(insts);
+}
+
+/* Const variant of bblock_end(). */
+static inline const struct backend_instruction *
+bblock_end_const(const struct bblock_t *block)
+{
+   const struct exec_list *insts = &block->instructions;
+
+   return (const struct backend_instruction *)exec_list_get_tail_const(insts);
+}
+
+/* Return the block following block in the block list, or NULL when block is
+ * the last one.
+ */
+static inline struct bblock_t *
+bblock_next(struct bblock_t *block)
+{
+   if (!exec_node_is_tail_sentinel(block->link.next))
+      return (struct bblock_t *)block->link.next;
+
+   return NULL;
+}
+
+/* Const variant of bblock_next(). */
+static inline const struct bblock_t *
+bblock_next_const(const struct bblock_t *block)
+{
+   if (!exec_node_is_tail_sentinel(block->link.next))
+      return (const struct bblock_t *)block->link.next;
+
+   return NULL;
+}
+
+/* Return the block preceding block in the block list, or NULL when block is
+ * the first one.
+ */
+static inline struct bblock_t *
+bblock_prev(struct bblock_t *block)
+{
+   if (!exec_node_is_head_sentinel(block->link.prev))
+      return (struct bblock_t *)block->link.prev;
+
+   return NULL;
+}
+
+/* Const variant of bblock_prev(). */
+static inline const struct bblock_t *
+bblock_prev_const(const struct bblock_t *block)
+{
+   if (!exec_node_is_head_sentinel(block->link.prev))
+      return (const struct bblock_t *)block->link.prev;
+
+   return NULL;
+}
+
+/* True when the first instruction of block is a DO or an ENDIF. */
+static inline bool
+bblock_starts_with_control_flow(const struct bblock_t *block)
+{
+   switch (bblock_start_const(block)->opcode) {
+   case BRW_OPCODE_DO:
+   case BRW_OPCODE_ENDIF:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/* True when the last instruction of block is an IF, ELSE, WHILE, BREAK or
+ * CONTINUE.
+ */
+static inline bool
+bblock_ends_with_control_flow(const struct bblock_t *block)
+{
+   switch (bblock_end_const(block)->opcode) {
+   case BRW_OPCODE_IF:
+   case BRW_OPCODE_ELSE:
+   case BRW_OPCODE_WHILE:
+   case BRW_OPCODE_BREAK:
+   case BRW_OPCODE_CONTINUE:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/* Return the first instruction of block, stepping over a leading DO/ENDIF
+ * when there is one.
+ */
+static inline struct backend_instruction *
+bblock_first_non_control_flow_inst(struct bblock_t *block)
+{
+   struct backend_instruction *first = bblock_start(block);
+
+   if (!bblock_starts_with_control_flow(block))
+      return first;
+
+   /* The list node is reached differently from C++ and from C. */
+#ifdef __cplusplus
+   return (struct backend_instruction *)first->next;
+#else
+   return (struct backend_instruction *)first->link.next;
+#endif
+}
+
+/* Return the last instruction of block, stepping back over a trailing
+ * IF/ELSE/WHILE/BREAK/CONTINUE when there is one.
+ */
+static inline struct backend_instruction *
+bblock_last_non_control_flow_inst(struct bblock_t *block)
+{
+   struct backend_instruction *last = bblock_end(block);
+
+   if (!bblock_ends_with_control_flow(block))
+      return last;
+
+   /* The list node is reached differently from C++ and from C. */
+#ifdef __cplusplus
+   return (struct backend_instruction *)last->prev;
+#else
+   return (struct backend_instruction *)last->link.prev;
+#endif
+}
+
+#ifdef __cplusplus
+/* C++ member-function wrappers.  Each one simply forwards to the C helper of
+ * the corresponding name (bblock_start(), bblock_next_const(), ...).
+ */
+inline backend_instruction *
+bblock_t::start()
+{
+ return bblock_start(this);
+}
+
+inline const backend_instruction *
+bblock_t::start() const
+{
+ return bblock_start_const(this);
+}
+
+inline backend_instruction *
+bblock_t::end()
+{
+ return bblock_end(this);
+}
+
+inline const backend_instruction *
+bblock_t::end() const
+{
+ return bblock_end_const(this);
+}
+
+inline bblock_t *
+bblock_t::next()
+{
+ return bblock_next(this);
+}
+
+inline const bblock_t *
+bblock_t::next() const
+{
+ return bblock_next_const(this);
+}
+
+inline bblock_t *
+bblock_t::prev()
+{
+ return bblock_prev(this);
+}
+
+inline const bblock_t *
+bblock_t::prev() const
+{
+ return bblock_prev_const(this);
+}
+
+inline bool
+bblock_t::starts_with_control_flow() const
+{
+ return bblock_starts_with_control_flow(this);
+}
+
+inline bool
+bblock_t::ends_with_control_flow() const
+{
+ return bblock_ends_with_control_flow(this);
+}
+
+inline backend_instruction *
+bblock_t::first_non_control_flow_inst()
+{
+ return bblock_first_non_control_flow_inst(this);
+}
+
+inline backend_instruction *
+bblock_t::last_non_control_flow_inst()
+{
+ return bblock_last_non_control_flow_inst(this);
+}
+#endif
+
+/* Control flow graph of a shader: the basic blocks and the edges between
+ * them.
+ */
+struct cfg_t {
+#ifdef __cplusplus
+ DECLARE_RALLOC_CXX_OPERATORS(cfg_t)
+
+ /* Builds the CFG; the instructions are moved out of the given list. */
+ cfg_t(exec_list *instructions);
+ ~cfg_t();
+
+ void remove_block(bblock_t *block);
+
+ /* Construction helpers. */
+ bblock_t *new_block();
+ void set_next_block(bblock_t **cur, bblock_t *block, int ip);
+ void make_block_array();
+ /* Dominance computation. */
+ void calculate_idom();
+ static bblock_t *intersect(bblock_t *b1, bblock_t *b2);
+
+ /* Debug output. */
+ void dump(backend_shader *s);
+ void dump_cfg();
+ void dump_domtree();
+#endif
+ /* ralloc context that blocks, links and the blocks array are allocated
+ * from; freed by the destructor.
+ */
+ void *mem_ctx;
+
+ /** Ordered list (by ip) of basic blocks */
+ struct exec_list block_list;
+ /* Array view of block_list; blocks[i]->num == i. */
+ struct bblock_t **blocks;
+ int num_blocks;
+
+ /* True when the idom pointers need to be recomputed by calculate_idom(). */
+ bool idom_dirty;
+
+ /* NOTE(review): only zeroed in brw_cfg.cpp -- presumably written by the
+ * instruction scheduler; confirm.
+ */
+ unsigned cycle_count;
+};
+
+/* Iterate over every instruction of every block in a CFG.
+ *
+ * Note that this is implemented with a double for loop -- break will
+ * break from the inner loop only!
+ */
+#define foreach_block_and_inst(__block, __type, __inst, __cfg) \
+   foreach_block (__block, __cfg)                              \
+      foreach_inst_in_block (__type, __inst, __block)
+
+/* Same as foreach_block_and_inst, but the current block or instruction may
+ * be removed while iterating.
+ *
+ * Note that this is implemented with a double for loop -- break will
+ * break from the inner loop only!
+ */
+#define foreach_block_and_inst_safe(__block, __type, __inst, __cfg) \
+   foreach_block_safe (__block, __cfg)                              \
+      foreach_inst_in_block_safe (__type, __inst, __block)
+
+/* Iterate over the blocks of a CFG in block_list order. */
+#define foreach_block(__block, __cfg) \
+   foreach_list_typed (bblock_t, __block, link, &(__cfg)->block_list)
+
+#define foreach_block_reverse(__block, __cfg) \
+   foreach_list_typed_reverse (bblock_t, __block, link, &(__cfg)->block_list)
+
+#define foreach_block_safe(__block, __cfg) \
+   foreach_list_typed_safe (bblock_t, __block, link, &(__cfg)->block_list)
+
+#define foreach_block_reverse_safe(__block, __cfg) \
+   foreach_list_typed_reverse_safe (bblock_t, __block, link, &(__cfg)->block_list)
+
+/* Iterate over the instructions of a single block. */
+#define foreach_inst_in_block(__type, __inst, __block) \
+   foreach_in_list(__type, __inst, &(__block)->instructions)
+
+/* Like foreach_inst_in_block, but the successor is fetched before the loop
+ * body runs.  __block is parenthesized so any pointer expression can be
+ * passed as the argument.
+ */
+#define foreach_inst_in_block_safe(__type, __inst, __block)                   \
+   for (__type *__inst = (__type *)(__block)->instructions.head_sentinel.next, \
+               *__next = (__type *)__inst->next;                               \
+        __next != NULL;                                                       \
+        __inst = __next,                                                      \
+        __next = (__type *)__next->next)
+
+#define foreach_inst_in_block_reverse(__type, __inst, __block) \
+   foreach_in_list_reverse(__type, __inst, &(__block)->instructions)
+
+#define foreach_inst_in_block_reverse_safe(__type, __inst, __block) \
+   foreach_in_list_reverse_safe(__type, __inst, &(__block)->instructions)
+
+/* Iterate forward over the instructions that follow __inst (exclusive) up to
+ * the end of its list.  __inst is parenthesized so any pointer expression
+ * can be passed as the argument.
+ */
+#define foreach_inst_in_block_starting_from(__type, __scan_inst, __inst) \
+   for (__type *__scan_inst = (__type *)(__inst)->next;                  \
+        !__scan_inst->is_tail_sentinel();                                \
+        __scan_inst = (__type *)__scan_inst->next)
+
+/* Iterate backward over the instructions that precede __inst (exclusive)
+ * down to the start of its list.
+ */
+#define foreach_inst_in_block_reverse_starting_from(__type, __scan_inst, __inst) \
+   for (__type *__scan_inst = (__type *)(__inst)->prev;                          \
+        !__scan_inst->is_head_sentinel();                                        \
+        __scan_inst = (__type *)__scan_inst->prev)
+
+#endif /* BRW_CFG_H */
+
+#endif /* BRW_CFG_H */
diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
new file mode 100644
index 00000000000..cd9473f9a3b
--- /dev/null
+++ b/src/intel/compiler/brw_compiler.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright © 2015-2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_compiler.h"
+#include "brw_shader.h"
+#include "brw_eu.h"
+#include "common/gen_debug.h"
+#include "compiler/nir/nir.h"
+#include "main/errors.h"
+#include "util/debug.h"
+
+/* NIR compiler options shared by the scalar and vec4 option tables defined
+ * in this file.  Expands to a comma-separated list of designated
+ * initializers without a trailing comma, so it must be used as the first
+ * entry of a struct nir_shader_compiler_options initializer and be followed
+ * by a comma.  Each field is listed exactly once.
+ */
+#define COMMON_OPTIONS \
+   .lower_sub = true, \
+   .lower_fdiv = true, \
+   .lower_scmp = true, \
+   .lower_fmod32 = true, \
+   .lower_fmod64 = false, \
+   .lower_bitfield_extract = true, \
+   .lower_bitfield_insert = true, \
+   .lower_uadd_carry = true, \
+   .lower_usub_borrow = true, \
+   .lower_flrp64 = true, \
+   .native_integers = true, \
+   .use_interpolated_input_intrinsics = true, \
+   .vertex_id_zero_based = true
+
+static const struct nir_shader_compiler_options scalar_nir_options = {
+ COMMON_OPTIONS,
+ .lower_pack_half_2x16 = true,
+ .lower_pack_snorm_2x16 = true,
+ .lower_pack_snorm_4x8 = true,
+ .lower_pack_unorm_2x16 = true,
+ .lower_pack_unorm_4x8 = true,
+ .lower_unpack_half_2x16 = true,
+ .lower_unpack_snorm_2x16 = true,
+ .lower_unpack_snorm_4x8 = true,
+ .lower_unpack_unorm_2x16 = true,
+ .lower_unpack_unorm_4x8 = true,
+ .max_unroll_iterations = 32,
+};
+
+static const struct nir_shader_compiler_options vector_nir_options = {
+ COMMON_OPTIONS,
+
+ /* NIR options for stages that do not use the scalar back-end on hardware
+ * older than Gen6; brw_compiler_create() selects this table when
+ * devinfo->gen < 6.
+ *
+ * In the vec4 backend, our dpN instruction replicates its result to all the
+ * components of a vec4. We would like NIR to give us replicated fdot
+ * instructions because it can optimize better for us.
+ */
+ .fdot_replicates = true,
+
+ /* Prior to Gen6, there are no three source operations for SIMD4x2, so
+ * 32-bit flrp is lowered here. This is the only setting that differs from
+ * vector_nir_options_gen6.
+ */
+ .lower_flrp32 = true,
+
+ .lower_pack_snorm_2x16 = true,
+ .lower_pack_unorm_2x16 = true,
+ .lower_unpack_snorm_2x16 = true,
+ .lower_unpack_unorm_2x16 = true,
+ .lower_extract_byte = true,
+ .lower_extract_word = true,
+ .max_unroll_iterations = 32,
+};
+
+static const struct nir_shader_compiler_options vector_nir_options_gen6 = {
+ COMMON_OPTIONS,
+
+ /* NIR options for stages that do not use the scalar back-end on Gen6 and
+ * later; brw_compiler_create() selects this table when devinfo->gen >= 6.
+ * Unlike vector_nir_options, lower_flrp32 is not set here.
+ *
+ * In the vec4 backend, our dpN instruction replicates its result to all the
+ * components of a vec4. We would like NIR to give us replicated fdot
+ * instructions because it can optimize better for us.
+ */
+ .fdot_replicates = true,
+
+ .lower_pack_snorm_2x16 = true,
+ .lower_pack_unorm_2x16 = true,
+ .lower_unpack_snorm_2x16 = true,
+ .lower_unpack_unorm_2x16 = true,
+ .lower_extract_byte = true,
+ .lower_extract_word = true,
+ .max_unroll_iterations = 32,
+};
+
+/**
+ * Create a brw_compiler for \p devinfo, allocated with rzalloc() out of
+ * \p mem_ctx.
+ *
+ * After calling the register-set and compaction-table initializers, this
+ * decides which shader stages use the scalar back-end and fills in the GLSL
+ * compiler options of every stage accordingly.
+ */
+struct brw_compiler *
+brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
+{
+   struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);
+   const bool gen8_plus = devinfo->gen >= 8;
+   const bool pre_gen6 = devinfo->gen < 6;
+
+   /* NIR options used by every stage that is not compiled as scalar. */
+   const struct nir_shader_compiler_options *vec4_nir_options =
+      pre_gen6 ? &vector_nir_options : &vector_nir_options_gen6;
+
+   compiler->devinfo = devinfo;
+
+   brw_fs_alloc_reg_sets(compiler);
+   brw_vec4_alloc_reg_set(compiler);
+   brw_init_compaction_tables(devinfo);
+
+   compiler->precise_trig = env_var_as_boolean("INTEL_PRECISE_TRIG", false);
+
+   /* Fragment and compute shaders are always scalar.  The other stages are
+    * scalar only on Gen8+, and additionally depend on the DEBUG_VEC4VS debug
+    * flag (VS) or the INTEL_SCALAR_* environment variables (TCS/TES/GS).
+    */
+   compiler->scalar_stage[MESA_SHADER_VERTEX] =
+      gen8_plus && !(INTEL_DEBUG & DEBUG_VEC4VS);
+   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] =
+      gen8_plus && env_var_as_boolean("INTEL_SCALAR_TCS", true);
+   compiler->scalar_stage[MESA_SHADER_TESS_EVAL] =
+      gen8_plus && env_var_as_boolean("INTEL_SCALAR_TES", true);
+   compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
+      gen8_plus && env_var_as_boolean("INTEL_SCALAR_GS", true);
+   compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true;
+   compiler->scalar_stage[MESA_SHADER_COMPUTE] = true;
+
+   /* Per-stage GLSL compiler options; stage-specific overrides follow the
+    * loop.
+    */
+   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
+      struct gl_shader_compiler_options *opts =
+         &compiler->glsl_compiler_options[i];
+      const bool is_scalar = compiler->scalar_stage[i];
+
+      opts->MaxUnrollIterations = 0;
+      opts->MaxIfDepth = pre_gen6 ? 16 : UINT_MAX;
+
+      opts->EmitNoIndirectInput = true;
+      opts->EmitNoIndirectUniform = false;
+
+      opts->EmitNoIndirectOutput = is_scalar;
+      opts->EmitNoIndirectTemp = is_scalar;
+      opts->OptimizeForAOS = !is_scalar;
+
+      opts->NirOptions = is_scalar ? &scalar_nir_options : vec4_nir_options;
+
+      opts->LowerBufferInterfaceBlocks = true;
+      opts->ClampBlockIndicesToArrayBounds = true;
+   }
+
+   /* Tessellation stages keep indirect input access, and the TCS also keeps
+    * indirect output access.
+    */
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false;
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false;
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectOutput = false;
+
+   /* A scalar geometry shader keeps indirect input access as well. */
+   if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
+      compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
+
+   return compiler;
+}
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
new file mode 100644
index 00000000000..85257d494af
--- /dev/null
+++ b/src/intel/compiler/brw_compiler.h
@@ -0,0 +1,1057 @@
+/*
+ * Copyright © 2010 - 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include "common/gen_device_info.h"
+#include "main/mtypes.h"
+#include "main/macros.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ra_regs;
+struct nir_shader;
+struct brw_program;
+union gl_constant_value;
+
+struct brw_compiler {
+ const struct gen_device_info *devinfo;
+
+ struct {
+ struct ra_regs *regs;
+
+ /**
+ * Array of the ra classes for the unaligned contiguous register
+ * block sizes used.
+ */
+ int *classes;
+
+ /**
+ * Mapping for register-allocated objects in *regs to the first
+ * GRF for that object.
+ */
+ uint8_t *ra_reg_to_grf;
+ } vec4_reg_set;
+
+ struct {
+ struct ra_regs *regs;
+
+ /**
+ * Array of the ra classes for the unaligned contiguous register
+ * block sizes used, indexed by register size.
+ */
+ int classes[16];
+
+ /**
+ * Mapping from classes to ra_reg ranges. Each of the per-size
+ * classes corresponds to a range of ra_reg nodes. This array stores
+ * those ranges in the form of first ra_reg in each class and the
+ * total number of ra_reg elements in the last array element. This
+ * way the range of the i'th class is given by:
+ * [ class_to_ra_reg_range[i], class_to_ra_reg_range[i+1] )
+ */
+ int class_to_ra_reg_range[17];
+
+ /**
+ * Mapping for register-allocated objects in *regs to the first
+ * GRF for that object.
+ */
+ uint8_t *ra_reg_to_grf;
+
+ /**
+ * ra class for the aligned pairs we use for PLN, which doesn't
+ * appear in *classes.
+ */
+ int aligned_pairs_class;
+ } fs_reg_sets[3];
+
+ void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
+ void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
+
+ bool scalar_stage[MESA_SHADER_STAGES];
+ struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
+
+ /**
+ * Apply workarounds for SIN and COS output range problems.
+ * This can negatively impact performance.
+ */
+ bool precise_trig;
+};
+
+
+/**
+ * Program key structures.
+ *
+ * When drawing, we look for the currently bound shaders in the program
+ * cache. This is essentially a hash table lookup, and these are the keys.
+ *
+ * Sometimes OpenGL features specified as state need to be simulated via
+ * shader code, due to a mismatch between the API and the hardware. This
+ * is often referred to as "non-orthogonal state" or "NOS". We store NOS
+ * in the program key so it's considered when searching for a program. If
+ * we haven't seen a particular combination before, we have to recompile a
+ * new specialized version.
+ *
+ * Shader compilation should not look up state in gl_context directly, but
+ * instead use the copy in the program key. This guarantees recompiles will
+ * happen correctly.
+ *
+ * @{
+ */
+
+enum PACKED gen6_gather_sampler_wa {
+ WA_SIGN = 1, /* whether we need to sign extend */
+ WA_8BIT = 2, /* if we have an 8bit format needing wa */
+ WA_16BIT = 4, /* if we have a 16bit format needing wa */
+};
+
+/**
+ * Sampler information needed by all stages' program cache keys.
+ */
+struct brw_sampler_prog_key_data {
+ /**
+ * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles.
+ */
+ uint16_t swizzles[MAX_SAMPLERS];
+
+ uint32_t gl_clamp_mask[3];
+
+ /**
+ * For RG32F, gather4's channel select is broken.
+ */
+ uint32_t gather_channel_quirk_mask;
+
+ /**
+ * Whether this sampler uses the compressed multisample surface layout.
+ */
+ uint32_t compressed_multisample_layout_mask;
+
+ /**
+ * Whether this sampler is using 16x multisampling. If so fetching from
+ * this sampler will be handled with a different instruction, ld2dms_w
+ * instead of ld2dms.
+ */
+ uint32_t msaa_16;
+
+ /**
+ * For Sandybridge, which shader w/a we need for gather quirks.
+ */
+ enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS];
+
+ /**
+ * Texture units that have a YUV image bound.
+ */
+ uint32_t y_u_v_image_mask;
+ uint32_t y_uv_image_mask;
+ uint32_t yx_xuxv_image_mask;
+};
+
+/**
+ * The VF can't natively handle certain types of attributes, such as GL_FIXED
+ * or most 10_10_10_2 types. These flags enable various VS workarounds to
+ * "fix" attributes at the beginning of shaders.
+ */
+#define BRW_ATTRIB_WA_COMPONENT_MASK 7 /* mask for GL_FIXED scale channel count */
+#define BRW_ATTRIB_WA_NORMALIZE 8 /* normalize in shader */
+#define BRW_ATTRIB_WA_BGRA 16 /* swap r/b channels in shader */
+#define BRW_ATTRIB_WA_SIGN 32 /* interpret as signed in shader */
+#define BRW_ATTRIB_WA_SCALE 64 /* interpret as scaled in shader */
+
+/** The program key for Vertex Shaders. */
+struct brw_vs_prog_key {
+ unsigned program_string_id;
+
+ /**
+ * Per-attribute workaround flags
+ *
+ * For each attribute, a combination of BRW_ATTRIB_WA_*.
+ */
+ uint8_t gl_attrib_wa_flags[VERT_ATTRIB_MAX];
+
+ bool copy_edgeflag:1;
+
+ bool clamp_vertex_color:1;
+
+ /**
+ * How many user clipping planes are being uploaded to the vertex shader as
+ * push constants.
+ *
+ * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
+ * clip distances.
+ */
+ unsigned nr_userclip_plane_consts:4;
+
+ /**
+ * For pre-Gen6 hardware, a bitfield indicating which texture coordinates
+ * are going to be replaced with point coordinates (as a consequence of a
+ * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)). Because
+ * our SF thread requires exact matching between VS outputs and FS inputs,
+ * these texture coordinates will need to be unconditionally included in
+ * the VUE, even if they aren't written by the vertex shader.
+ */
+ uint8_t point_coord_replace;
+
+ struct brw_sampler_prog_key_data tex;
+};
+
+/** The program key for Tessellation Control Shaders. */
+struct brw_tcs_prog_key
+{
+ unsigned program_string_id;
+
+ GLenum tes_primitive_mode;
+
+ unsigned input_vertices;
+
+ /** A bitfield of per-patch outputs written. */
+ uint32_t patch_outputs_written;
+
+ /** A bitfield of per-vertex outputs written. */
+ uint64_t outputs_written;
+
+ bool quads_workaround;
+
+ struct brw_sampler_prog_key_data tex;
+};
+
+/** The program key for Tessellation Evaluation Shaders. */
+struct brw_tes_prog_key
+{
+ unsigned program_string_id;
+
+ /** A bitfield of per-patch inputs read. */
+ uint32_t patch_inputs_read;
+
+ /** A bitfield of per-vertex inputs read. */
+ uint64_t inputs_read;
+
+ struct brw_sampler_prog_key_data tex;
+};
+
+/** The program key for Geometry Shaders. */
+struct brw_gs_prog_key
+{
+ unsigned program_string_id;
+
+ struct brw_sampler_prog_key_data tex;
+};
+
+/* A big lookup table is used to figure out which and how many
+ * additional regs will be inserted before the main payload in the WM
+ * program execution. These mainly relate to depth and stencil
+ * processing and the early-depth-test optimization.
+ */
+enum brw_wm_iz_bits {
+ BRW_WM_IZ_PS_KILL_ALPHATEST_BIT = 0x1,
+ BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT = 0x2,
+ BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT = 0x4,
+ BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT = 0x8,
+ BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT = 0x10,
+ BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT = 0x20,
+ BRW_WM_IZ_BIT_MAX = 0x40
+};
+
+enum brw_wm_aa_enable {
+ BRW_WM_AA_NEVER,
+ BRW_WM_AA_SOMETIMES,
+ BRW_WM_AA_ALWAYS
+};
+
+/** The program key for Fragment/Pixel Shaders. */
+struct brw_wm_prog_key {
+ /* Some collection of BRW_WM_IZ_* */
+ uint8_t iz_lookup;
+ bool stats_wm:1;
+ bool flat_shade:1;
+ unsigned nr_color_regions:5;
+ bool replicate_alpha:1;
+ bool clamp_fragment_color:1;
+ bool persample_interp:1;
+ bool multisample_fbo:1;
+ enum brw_wm_aa_enable line_aa:2;
+ bool high_quality_derivatives:1;
+ bool force_dual_color_blend:1;
+ bool coherent_fb_fetch:1;
+
+ uint16_t drawable_height;
+ uint64_t input_slots_valid;
+ unsigned program_string_id;
+ GLenum alpha_test_func; /* < For Gen4/5 MRT alpha test */
+ float alpha_test_ref;
+
+ struct brw_sampler_prog_key_data tex;
+};
+
+struct brw_cs_prog_key {
+ uint32_t program_string_id;
+ struct brw_sampler_prog_key_data tex;
+};
+
+/*
+ * Image metadata structure as laid out in the shader parameter
+ * buffer. Entries have to be 16B-aligned for the vec4 back-end to be
+ * able to use them. That's okay because the padding and any unused
+ * entries [most of them except when we're doing untyped surface
+ * access] will be removed by the uniform packing pass.
+ */
+#define BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET 0
+#define BRW_IMAGE_PARAM_OFFSET_OFFSET 4
+#define BRW_IMAGE_PARAM_SIZE_OFFSET 8
+#define BRW_IMAGE_PARAM_STRIDE_OFFSET 12
+#define BRW_IMAGE_PARAM_TILING_OFFSET 16
+#define BRW_IMAGE_PARAM_SWIZZLING_OFFSET 20
+#define BRW_IMAGE_PARAM_SIZE 24
+
+struct brw_image_param {
+ /** Surface binding table index. */
+ uint32_t surface_idx;
+
+ /** Offset applied to the X and Y surface coordinates. */
+ uint32_t offset[2];
+
+ /** Surface X, Y and Z dimensions. */
+ uint32_t size[3];
+
+ /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in
+ * pixels, vertical slice stride in pixels.
+ */
+ uint32_t stride[4];
+
+ /** Log2 of the tiling modulus in the X, Y and Z dimension. */
+ uint32_t tiling[3];
+
+ /**
+ * Right shift to apply for bit 6 address swizzling. Two different
+ * swizzles can be specified and will be applied one after the other. The
+ * resulting address will be:
+ *
+ * addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^
+ * (addr >> swizzling[1])))
+ *
+ * Use \c 0xff if any of the swizzles is not required.
+ */
+ uint32_t swizzling[2];
+};
+
+/** Max number of render targets in a shader */
+#define BRW_MAX_DRAW_BUFFERS 8
+
+/**
+ * Max number of binding table entries used for stream output.
+ *
+ * From the OpenGL 3.0 spec, table 6.44 (Transform Feedback State), the
+ * minimum value of MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS is 64.
+ *
+ * On Gen6, the size of transform feedback data is limited not by the number
+ * of components but by the number of binding table entries we set aside. We
+ * use one binding table entry for a float, one entry for a vector, and one
+ * entry per matrix column. Since the only way we can communicate our
+ * transform feedback capabilities to the client is via
+ * MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS, we need to plan for the
+ * worst case, in which all the varyings are floats, so we use up one binding
+ * table entry per component. Therefore we need to set aside at least 64
+ * binding table entries for use by transform feedback.
+ *
+ * Note: since we don't currently pack varyings, it is currently impossible
+ * for the client to actually use up all of these binding table entries--if
+ * all of their varyings were floats, they would run out of varying slots and
+ * fail to link. But that's a bug, so it seems prudent to go ahead and
+ * allocate the number of binding table entries we will need once the bug is
+ * fixed.
+ */
+#define BRW_MAX_SOL_BINDINGS 64
+
+/**
+ * Binding table index for the first gen6 SOL binding.
+ */
+#define BRW_GEN6_SOL_BINDING_START 0
+
+/**
+ * Stride in bytes between shader_time entries.
+ *
+ * We separate entries by a cacheline to reduce traffic between EUs writing to
+ * different entries.
+ */
+#define BRW_SHADER_TIME_STRIDE 64
+
+struct brw_stage_prog_data {
+ struct {
+ /** size of our binding table. */
+ uint32_t size_bytes;
+
+ /** @{
+ * surface indices for the various groups of surfaces
+ */
+ uint32_t pull_constants_start;
+ uint32_t texture_start;
+ uint32_t gather_texture_start;
+ uint32_t ubo_start;
+ uint32_t ssbo_start;
+ uint32_t abo_start;
+ uint32_t image_start;
+ uint32_t shader_time_start;
+ uint32_t plane_start[3];
+ /** @} */
+ } binding_table;
+
+ GLuint nr_params; /**< number of float params/constants */
+ GLuint nr_pull_params;
+ unsigned nr_image_params;
+
+ unsigned curb_read_length;
+ unsigned total_scratch;
+ unsigned total_shared;
+
+ /**
+ * Register where the thread expects to find input data from the URB
+ * (typically uniforms, followed by vertex or fragment attributes).
+ */
+ unsigned dispatch_grf_start_reg;
+
+ bool use_alt_mode; /**< Use ALT floating point mode? Otherwise, IEEE. */
+
+ /* Pointers to tracked values (only valid once
+ * _mesa_load_state_parameters has been called at runtime).
+ */
+ const union gl_constant_value **param;
+ const union gl_constant_value **pull_param;
+
+ /** Image metadata passed to the shader as uniforms. */
+ struct brw_image_param *image_param;
+};
+
+/**
+ * Record that binding table entry \p surf_index is in use by growing
+ * prog_data->binding_table.size_bytes to cover it (4 bytes per entry).  The
+ * recorded size never shrinks.
+ */
+static inline void
+brw_mark_surface_used(struct brw_stage_prog_data *prog_data,
+                      unsigned surf_index)
+{
+   const uint32_t needed_bytes = (surf_index + 1) * 4;
+
+   /* Binding table indices are 8 bits wide and the top 3 values are reserved
+    * for special things (stateless and SLM), which leaves 252 as the largest
+    * index accepted here.
+    */
+   assert(surf_index <= 252);
+
+   if (prog_data->binding_table.size_bytes < needed_bytes)
+      prog_data->binding_table.size_bytes = needed_bytes;
+}
+
+/* Data about a particular attempt to compile a program. Note that
+ * there can be many of these, each in a different GL state
+ * corresponding to a different brw_wm_prog_key struct, with different
+ * compiled programs.
+ */
+struct brw_wm_prog_data {
+ struct brw_stage_prog_data base;
+
+ GLuint num_varying_inputs;
+
+ uint8_t reg_blocks_0;
+ uint8_t reg_blocks_2;
+
+ uint8_t dispatch_grf_start_reg_2;
+ uint32_t prog_offset_2;
+
+ struct {
+ /** @{
+ * surface indices for the WM-specific surfaces
+ */
+ uint32_t render_target_start;
+ uint32_t render_target_read_start;
+ /** @} */
+ } binding_table;
+
+ uint8_t computed_depth_mode;
+ bool computed_stencil;
+
+ bool early_fragment_tests;
+ bool post_depth_coverage;
+ bool inner_coverage;
+ bool dispatch_8;
+ bool dispatch_16;
+ bool dual_src_blend;
+ bool persample_dispatch;
+ bool uses_pos_offset;
+ bool uses_omask;
+ bool uses_kill;
+ bool uses_src_depth;
+ bool uses_src_w;
+ bool uses_sample_mask;
+ bool has_side_effects;
+ bool pulls_bary;
+
+ bool contains_flat_varying;
+ bool contains_noperspective_varying;
+
+ /**
+ * Mask of which interpolation modes are required by the fragment shader.
+ * Used in hardware setup on gen6+.
+ */
+ uint32_t barycentric_interp_modes;
+
+ /**
+ * Mask of which FS inputs are marked flat by the shader source. This is
+ * needed for setting up 3DSTATE_SF/SBE.
+ */
+ uint32_t flat_inputs;
+
+ /* Mapping of VUE slots to interpolation modes.
+ * Used by the Gen4-5 clip/sf/wm stages.
+ */
+ unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */
+
+ /**
+ * Map from gl_varying_slot to the position within the FS setup data
+ * payload where the varying's attribute vertex deltas should be delivered.
+ * For varying slots that are not used by the FS, the value is -1.
+ */
+ int urb_setup[VARYING_SLOT_MAX];
+};
+
+struct brw_push_const_block {
+ unsigned dwords; /* Dword count, not reg aligned */
+ unsigned regs;
+ unsigned size; /* Bytes, register aligned */
+};
+
+struct brw_cs_prog_data {
+ struct brw_stage_prog_data base;
+
+ GLuint dispatch_grf_start_reg_16;
+ unsigned local_size[3];
+ unsigned simd_size;
+ unsigned threads;
+ bool uses_barrier;
+ bool uses_num_work_groups;
+ int thread_local_id_index;
+
+ struct {
+ struct brw_push_const_block cross_thread;
+ struct brw_push_const_block per_thread;
+ struct brw_push_const_block total;
+ } push;
+
+ struct {
+ /** @{
+ * surface indices for the CS-specific surfaces
+ */
+ uint32_t work_groups_start;
+ /** @} */
+ } binding_table;
+};
+
+/**
+ * Enum representing the i965-specific vertex results that don't correspond
+ * exactly to any element of gl_varying_slot. The values of this enum are
+ * assigned such that they don't conflict with gl_varying_slot.
+ */
+typedef enum
+{
+ BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX,
+ BRW_VARYING_SLOT_PAD,
+ /**
+ * Technically this is not a varying but just a placeholder that
+ * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord
+ * builtin variable to be compiled correctly. see compile_sf_prog() for
+ * more info.
+ */
+ BRW_VARYING_SLOT_PNTC,
+ BRW_VARYING_SLOT_COUNT
+} brw_varying_slot;
+
+/**
+ * We always program SF to start reading at an offset of 1 (2 varying slots)
+ * from the start of the vertex URB entry. This causes it to skip:
+ * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gen4-5
+ * - VARYING_SLOT_PSIZ and VARYING_SLOT_POS on gen6+
+ */
+#define BRW_SF_URB_ENTRY_READ_OFFSET 1
+
+/**
+ * Bitmask indicating which fragment shader inputs represent varyings (and
+ * hence have to be delivered to the fragment shader by the SF/SBE stage).
+ */
+#define BRW_FS_VARYING_INPUT_MASK \
+ (BITFIELD64_RANGE(0, VARYING_SLOT_MAX) & \
+ ~VARYING_BIT_POS & ~VARYING_BIT_FACE)
+
+/**
+ * Data structure recording the relationship between the gl_varying_slot enum
+ * and "slots" within the vertex URB entry (VUE). A "slot" is defined as a
+ * single octaword within the VUE (128 bits).
+ *
+ * Note that each BRW register contains 256 bits (2 octawords), so when
+ * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two
+ * consecutive VUE slots. When accessing the VUE in URB_INTERLEAVED mode (as
+ * in a vertex shader), each register corresponds to a single VUE slot, since
+ * it contains data for two separate vertices.
+ */
+struct brw_vue_map {
+ /**
+ * Bitfield representing all varying slots that are (a) stored in this VUE
+ * map, and (b) actually written by the shader. Does not include any of
+ * the additional varying slots defined in brw_varying_slot.
+ */
+ uint64_t slots_valid;
+
+ /**
+ * Is this VUE map for a separate shader pipeline?
+ *
+ * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
+ * without the linker having a chance to dead code eliminate unused varyings.
+ *
+ * This means that we have to use a fixed slot layout, based on the output's
+ * location field, rather than assigning slots in a compact contiguous block.
+ */
+ bool separate;
+
+ /**
+ * Map from gl_varying_slot value to VUE slot. For gl_varying_slots that are
+ * not stored in a slot (because they are not written, or because
+ * additional processing is applied before storing them in the VUE), the
+ * value is -1.
+ */
+ signed char varying_to_slot[VARYING_SLOT_TESS_MAX];
+
+ /**
+ * Map from VUE slot to gl_varying_slot value. For slots that do not
+ * directly correspond to a gl_varying_slot, the value comes from
+ * brw_varying_slot.
+ *
+ * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD.
+ */
+ signed char slot_to_varying[VARYING_SLOT_TESS_MAX];
+
+ /**
+ * Total number of VUE slots in use
+ */
+ int num_slots;
+
+ /**
+ * Number of per-patch VUE slots. Only valid for tessellation control
+ * shader outputs and tessellation evaluation shader inputs.
+ */
+ int num_per_patch_slots;
+
+ /**
+ * Number of per-vertex VUE slots. Only valid for tessellation control
+ * shader outputs and tessellation evaluation shader inputs.
+ */
+ int num_per_vertex_slots;
+};
+
+void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map);
+
+/**
+ * Byte offset of VUE slot number \p slot from the start of the VUE; every
+ * slot accounts for 16 bytes.
+ */
+static inline GLuint brw_vue_slot_to_offset(GLuint slot)
+{
+   const GLuint bytes_per_slot = 16;
+
+   return slot * bytes_per_slot;
+}
+
+/**
+ * Look up the VUE slot assigned to vertex output \p varying (a
+ * brw_varying_slot) in \p vue_map and return that slot's byte offset within
+ * the VUE.
+ *
+ * NOTE(review): varying_to_slot[] holds -1 for varyings that are not stored
+ * in a slot; no check is made here, so callers presumably only pass varyings
+ * that are present in the map -- confirm against callers.
+ */
+static inline
+GLuint brw_varying_to_offset(const struct brw_vue_map *vue_map, GLuint varying)
+{
+   const GLuint slot = vue_map->varying_to_slot[varying];
+
+   return brw_vue_slot_to_offset(slot);
+}
+
+void brw_compute_vue_map(const struct gen_device_info *devinfo,
+ struct brw_vue_map *vue_map,
+ uint64_t slots_valid,
+ bool separate_shader);
+
+void brw_compute_tess_vue_map(struct brw_vue_map *const vue_map,
+ uint64_t slots_valid,
+ uint32_t is_patch);
+
+/* brw_interpolation_map.c */
+void brw_setup_vue_interpolation(struct brw_vue_map *vue_map,
+ struct nir_shader *nir,
+ struct brw_wm_prog_data *prog_data,
+ const struct gen_device_info *devinfo);
+
+enum shader_dispatch_mode {
+ DISPATCH_MODE_4X1_SINGLE = 0,
+ DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
+ DISPATCH_MODE_4X2_DUAL_OBJECT = 2,
+ DISPATCH_MODE_SIMD8 = 3,
+};
+
+/**
+ * @defgroup Tessellator parameter enumerations.
+ *
+ * These correspond to the hardware values in 3DSTATE_TE, and are provided
+ * as part of the tessellation evaluation shader.
+ *
+ * @{
+ */
+enum brw_tess_partitioning {
+ BRW_TESS_PARTITIONING_INTEGER = 0,
+ BRW_TESS_PARTITIONING_ODD_FRACTIONAL = 1,
+ BRW_TESS_PARTITIONING_EVEN_FRACTIONAL = 2,
+};
+
+enum brw_tess_output_topology {
+ BRW_TESS_OUTPUT_TOPOLOGY_POINT = 0,
+ BRW_TESS_OUTPUT_TOPOLOGY_LINE = 1,
+ BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW = 2,
+ BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW = 3,
+};
+
+enum brw_tess_domain {
+ BRW_TESS_DOMAIN_QUAD = 0,
+ BRW_TESS_DOMAIN_TRI = 1,
+ BRW_TESS_DOMAIN_ISOLINE = 2,
+};
+/** @} */
+
+struct brw_vue_prog_data {
+ struct brw_stage_prog_data base;
+ struct brw_vue_map vue_map;
+
+ /** Should the hardware deliver input VUE handles for URB pull loads? */
+ bool include_vue_handles;
+
+ GLuint urb_read_length;
+ GLuint total_grf;
+
+ uint32_t clip_distance_mask;
+ uint32_t cull_distance_mask;
+
+ /* Used for calculating urb partitions. In the VS, this is the size of the
+ * URB entry used for both input and output to the thread. In the GS, this
+ * is the size of the URB entry used for output.
+ */
+ GLuint urb_entry_size;
+
+ enum shader_dispatch_mode dispatch_mode;
+};
+
+struct brw_vs_prog_data {
+ struct brw_vue_prog_data base;
+
+ GLbitfield64 inputs_read;
+ GLbitfield64 double_inputs_read;
+
+ unsigned nr_attributes;
+ unsigned nr_attribute_slots;
+
+ bool uses_vertexid;
+ bool uses_instanceid;
+ bool uses_basevertex;
+ bool uses_baseinstance;
+ bool uses_drawid;
+};
+
+struct brw_tcs_prog_data
+{
+ struct brw_vue_prog_data base;
+
+ /** Number of vertices in the output patch */
+ int instances;
+};
+
+
+struct brw_tes_prog_data
+{
+ struct brw_vue_prog_data base;
+
+ enum brw_tess_partitioning partitioning;
+ enum brw_tess_output_topology output_topology;
+ enum brw_tess_domain domain;
+};
+
+struct brw_gs_prog_data
+{
+ struct brw_vue_prog_data base;
+
+ unsigned vertices_in;
+
+ /**
+ * Size of an output vertex, measured in HWORDS (32 bytes).
+ */
+ unsigned output_vertex_size_hwords;
+
+ unsigned output_topology;
+
+ /**
+ * Size of the control data (cut bits or StreamID bits), in hwords (32
+ * bytes). 0 if there is no control data.
+ */
+ unsigned control_data_header_size_hwords;
+
+ /**
+ * Format of the control data (either GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID
+ * if the control data is StreamID bits, or
+ * GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits).
+ * Ignored if control_data_header_size_hwords is 0.
+ */
+ unsigned control_data_format;
+
+ bool include_primitive_id;
+
+ /**
+ * The number of vertices emitted, if constant - otherwise -1.
+ */
+ int static_vertex_count;
+
+ int invocations;
+
+ /**
+ * Gen6: Provoking vertex convention for odd-numbered triangles
+ * in tristrips.
+ */
+ GLuint pv_first:1;
+
+ /**
+ * Gen6: Number of varyings that are output to transform feedback.
+ */
+ GLuint num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */
+
+ /**
+ * Gen6: Map from the index of a transform feedback binding table entry to the
+ * gl_varying_slot that should be streamed out through that binding table
+ * entry.
+ */
+ unsigned char transform_feedback_bindings[64 /* BRW_MAX_SOL_BINDINGS */];
+
+ /**
+ * Gen6: Map from the index of a transform feedback binding table entry to the
+ * swizzles that should be used when streaming out data through that
+ * binding table entry.
+ */
+ unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */];
+};
+
+#define DEFINE_PROG_DATA_DOWNCAST(stage) \
+static inline struct brw_##stage##_prog_data * \
+brw_##stage##_prog_data(struct brw_stage_prog_data *prog_data) \
+{ \
+ return (struct brw_##stage##_prog_data *) prog_data; \
+}
+DEFINE_PROG_DATA_DOWNCAST(vue)
+DEFINE_PROG_DATA_DOWNCAST(vs)
+DEFINE_PROG_DATA_DOWNCAST(tcs)
+DEFINE_PROG_DATA_DOWNCAST(tes)
+DEFINE_PROG_DATA_DOWNCAST(gs)
+DEFINE_PROG_DATA_DOWNCAST(wm)
+DEFINE_PROG_DATA_DOWNCAST(cs)
+DEFINE_PROG_DATA_DOWNCAST(ff_gs)
+DEFINE_PROG_DATA_DOWNCAST(clip)
+DEFINE_PROG_DATA_DOWNCAST(sf)
+#undef DEFINE_PROG_DATA_DOWNCAST
+
+/** @} */
+
+struct brw_compiler *
+brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo);
+
+/**
+ * Compile a vertex shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
+ void *mem_ctx,
+ const struct brw_vs_prog_key *key,
+ struct brw_vs_prog_data *prog_data,
+ const struct nir_shader *shader,
+ gl_clip_plane *clip_planes,
+ bool use_legacy_snorm_formula,
+ int shader_time_index,
+ unsigned *final_assembly_size,
+ char **error_str);
+
+/**
+ * Compile a tessellation control shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_tcs(const struct brw_compiler *compiler,
+ void *log_data,
+ void *mem_ctx,
+ const struct brw_tcs_prog_key *key,
+ struct brw_tcs_prog_data *prog_data,
+ const struct nir_shader *nir,
+ int shader_time_index,
+ unsigned *final_assembly_size,
+ char **error_str);
+
+/**
+ * Compile a tessellation evaluation shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_tes(const struct brw_compiler *compiler, void *log_data,
+ void *mem_ctx,
+ const struct brw_tes_prog_key *key,
+ const struct brw_vue_map *input_vue_map,
+ struct brw_tes_prog_data *prog_data,
+ const struct nir_shader *shader,
+ struct gl_program *prog,
+ int shader_time_index,
+ unsigned *final_assembly_size,
+ char **error_str);
+
+/**
+ * Compile a geometry shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
+ void *mem_ctx,
+ const struct brw_gs_prog_key *key,
+ struct brw_gs_prog_data *prog_data,
+ const struct nir_shader *shader,
+ struct gl_program *prog,
+ int shader_time_index,
+ unsigned *final_assembly_size,
+ char **error_str);
+
+/**
+ * Compile a fragment shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
+ void *mem_ctx,
+ const struct brw_wm_prog_key *key,
+ struct brw_wm_prog_data *prog_data,
+ const struct nir_shader *shader,
+ struct gl_program *prog,
+ int shader_time_index8,
+ int shader_time_index16,
+ bool allow_spilling,
+ bool use_rep_send, struct brw_vue_map *vue_map,
+ unsigned *final_assembly_size,
+ char **error_str);
+
+/**
+ * Compile a compute shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
+ void *mem_ctx,
+ const struct brw_cs_prog_key *key,
+ struct brw_cs_prog_data *prog_data,
+ const struct nir_shader *shader,
+ int shader_time_index,
+ unsigned *final_assembly_size,
+ char **error_str);
+
+static inline uint32_t
+encode_slm_size(unsigned gen, uint32_t bytes)
+{
+ uint32_t slm_size = 0;
+
+ /* Translate a Shared Local Memory size given in bytes (at most 64 kB,
+ * see the assertion below) into the encoded value for hardware
+ * generation "gen". A size of zero bytes encodes as 0.
+ *
+ * Shared Local Memory is specified as powers of two, and encoded in
+ * INTERFACE_DESCRIPTOR_DATA with the following representations:
+ *
+ * Size | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB |
+ * -------------------------------------------------------------------
+ * Gen7-8 | 0 | none | none | 1 | 2 | 4 | 8 | 16 |
+ * -------------------------------------------------------------------
+ * Gen9+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ */
+ assert(bytes <= 64 * 1024);
+
+ if (bytes > 0) {
+ /* Shared Local Memory Size is specified as powers of two. */
+ slm_size = util_next_power_of_two(bytes);
+
+ if (gen >= 9) {
+ /* Use a minimum of 1kB. ffs() is one-based, so 1 kB (1024 bytes,
+ * bit 10) yields 11; subtracting 10 turns that into 1.
+ */
+ slm_size = ffs(MAX2(slm_size, 1024)) - 10;
+ } else {
+ /* Use a minimum of 4kB; convert to the pre-Gen9 representation. */
+ slm_size = MAX2(slm_size, 4096) / 4096;
+ }
+ }
+
+ return slm_size;
+}
+
+/**
+ * Tell whether the fixed function feeding the given shader stage enables
+ * SIMD channels contiguously starting from channel 0, in which case the
+ * dispatch mask of a thread can be assumed to have the form '2^n - 1' for
+ * some n.
+ */
+static inline bool
+brw_stage_has_packed_dispatch(const struct gen_device_info *devinfo,
+                              gl_shader_stage stage,
+                              const struct brw_stage_prog_data *prog_data)
+{
+   /* Everything below relies on thread dispatch behavior that future
+    * hardware generations could change -- do a full test run with
+    * brw_fs_test_dispatch_packing() hooked up to the NIR front-end before
+    * relaxing this assertion.
+    */
+   assert(devinfo->gen <= 9);
+
+   if (stage == MESA_SHADER_FRAGMENT) {
+      /* Subspans without any lit sample are discarded by the PSD.  With
+       * per-pixel shading that means a subspan is either fully lit (the
+       * VMask is used so derivatives can be computed) or not dispatched at
+       * all.  With per-sample dispatch the samples of one subspan sit at
+       * fixed relative locations within the SIMD thread, so unlit samples
+       * cannot be avoided in general and the answer is false.
+       */
+      return !((const struct brw_wm_prog_data *)prog_data)->persample_dispatch;
+   }
+
+   /* Compute shaders start with either a fully enabled dispatch mask or the
+    * bottom/right execution mask handed to the GPGPU walker command for the
+    * workgroup edges; both must be tightly packed for the invocation index
+    * calculations to work.  Most remaining fixed functions can only use a
+    * packed mask because the hardware represents it as a single counter of
+    * enabled channels.
+    */
+   return true;
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/intel/compiler/brw_dead_control_flow.cpp b/src/intel/compiler/brw_dead_control_flow.cpp
new file mode 100644
index 00000000000..114dc6cb212
--- /dev/null
+++ b/src/intel/compiler/brw_dead_control_flow.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_dead_control_flow.cpp
+ *
+ * This file implements the dead control flow elimination optimization pass.
+ */
+
+#include "brw_shader.h"
+#include "brw_cfg.h"
+
+/* Look for and eliminate dead control flow:
+ *
+ * - if/endif
+ * - else in else/endif
+ * - then in if/else/endif
+ *
+ * Every basic block of s->cfg that has a predecessor is inspected by
+ * comparing its first instruction with the last instruction of the
+ * preceding block. Returns true if anything was changed; in that case the
+ * live intervals of the shader are invalidated before returning.
+ */
+bool
+dead_control_flow_eliminate(backend_shader *s)
+{
+ bool progress = false;
+
+ foreach_block_safe (block, s->cfg) {
+ bblock_t *prev_block = block->prev();
+
+ if (!prev_block)
+ continue;
+
+ backend_instruction *const inst = block->start();
+ backend_instruction *const prev_inst = prev_block->end();
+
+ /* ENDIF instructions, by definition, can only be found at the start of
+ * basic blocks.
+ *
+ * An ELSE that ends the previous block while this block starts with
+ * ENDIF has nothing in between, so only the ELSE is removed here. An
+ * IF in that position instead has both the IF and the ENDIF removed,
+ * after which the neighbouring blocks are merged when possible.
+ */
+ if (inst->opcode == BRW_OPCODE_ENDIF &&
+ prev_inst->opcode == BRW_OPCODE_ELSE) {
+ bblock_t *const else_block = prev_block;
+ backend_instruction *const else_inst = prev_inst;
+
+ else_inst->remove(else_block);
+ progress = true;
+ } else if (inst->opcode == BRW_OPCODE_ENDIF &&
+ prev_inst->opcode == BRW_OPCODE_IF) {
+ bblock_t *const endif_block = block;
+ bblock_t *const if_block = prev_block;
+ backend_instruction *const endif_inst = inst;
+ backend_instruction *const if_inst = prev_inst;
+
+ bblock_t *earlier_block = NULL, *later_block = NULL;
+
+ if (if_block->start_ip == if_block->end_ip) {
+ earlier_block = if_block->prev();
+ } else {
+ earlier_block = if_block;
+ }
+ if_inst->remove(if_block);
+
+ if (endif_block->start_ip == endif_block->end_ip) {
+ later_block = endif_block->next();
+ } else {
+ later_block = endif_block;
+ }
+ endif_inst->remove(endif_block);
+
+ assert((earlier_block == NULL) == (later_block == NULL));
+ if (earlier_block && earlier_block->can_combine_with(later_block)) {
+ earlier_block->combine_with(later_block);
+
+ /* If ENDIF was in its own block, then we've now deleted it and
+ * merged the two surrounding blocks, the latter of which the
+ * __next block pointer was pointing to.
+ */
+ if (endif_block != later_block) {
+ __next = earlier_block->next();
+ }
+ }
+
+ progress = true;
+ } else if (inst->opcode == BRW_OPCODE_ELSE &&
+ prev_inst->opcode == BRW_OPCODE_IF) {
+ bblock_t *const else_block = block;
+ backend_instruction *const if_inst = prev_inst;
+ backend_instruction *const else_inst = inst;
+
+ /* Since the else-branch is becoming the new then-branch, the
+ * condition has to be inverted.
+ */
+ if_inst->predicate_inverse = !if_inst->predicate_inverse;
+ else_inst->remove(else_block);
+
+ progress = true;
+ }
+ }
+
+ if (progress)
+ s->invalidate_live_intervals();
+
+ return progress;
+}
diff --git a/src/intel/compiler/brw_dead_control_flow.h b/src/intel/compiler/brw_dead_control_flow.h
new file mode 100644
index 00000000000..83fd9b1e79e
--- /dev/null
+++ b/src/intel/compiler/brw_dead_control_flow.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_shader.h"
+
+bool dead_control_flow_eliminate(backend_shader *s);
diff --git a/src/intel/compiler/brw_disasm.c b/src/intel/compiler/brw_disasm.c
new file mode 100644
index 00000000000..536a003dcbe
--- /dev/null
+++ b/src/intel/compiler/brw_disasm.c
@@ -0,0 +1,1646 @@
+/*
+ * Copyright © 2008 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. The copyright holders make no representations
+ * about the suitability of this software for any purpose. It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include "brw_eu_defines.h"
+#include "brw_inst.h"
+#include "brw_shader.h"
+#include "brw_reg.h"
+#include "brw_inst.h"
+#include "brw_eu.h"
+
+/**
+ * Report whether \p opcode is one of the flow-control opcodes listed below
+ * on Gen6 or later.  Always false before Gen6.
+ */
+static bool
+has_jip(const struct gen_device_info *devinfo, enum opcode opcode)
+{
+   if (devinfo->gen < 6)
+      return false;
+
+   switch (opcode) {
+   case BRW_OPCODE_IF:
+   case BRW_OPCODE_ELSE:
+   case BRW_OPCODE_ENDIF:
+   case BRW_OPCODE_WHILE:
+   case BRW_OPCODE_BREAK:
+   case BRW_OPCODE_CONTINUE:
+   case BRW_OPCODE_HALT:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/**
+ * Report whether \p opcode is one of the opcodes listed below on the given
+ * generation: BREAK, CONTINUE and HALT from Gen6 on, IF from Gen7 on and
+ * ELSE from Gen8 on.  Always false before Gen6.
+ */
+static bool
+has_uip(const struct gen_device_info *devinfo, enum opcode opcode)
+{
+   if (devinfo->gen < 6)
+      return false;
+
+   switch (opcode) {
+   case BRW_OPCODE_IF:
+      return devinfo->gen >= 7;
+   case BRW_OPCODE_ELSE:
+      return devinfo->gen >= 8;
+   case BRW_OPCODE_BREAK:
+   case BRW_OPCODE_CONTINUE:
+   case BRW_OPCODE_HALT:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/**
+ * Report whether \p opcode is IF or ELSE on Gen8 or later.  Always false
+ * before Gen8.
+ */
+static bool
+has_branch_ctrl(const struct gen_device_info *devinfo, enum opcode opcode)
+{
+   /* BRW_OPCODE_GOTO is deliberately not part of the test (it was left
+    * commented out in the original list).
+    */
+   return devinfo->gen >= 8 &&
+          (opcode == BRW_OPCODE_IF || opcode == BRW_OPCODE_ELSE);
+}
+
+/** Report whether \p opcode is one of AND, NOT, OR or XOR. */
+static bool
+is_logic_instruction(unsigned opcode)
+{
+   switch (opcode) {
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_NOT:
+   case BRW_OPCODE_OR:
+   case BRW_OPCODE_XOR:
+      return true;
+   default:
+      return false;
+   }
+}
+
+const char *const conditional_modifier[16] = {
+ [BRW_CONDITIONAL_NONE] = "",
+ [BRW_CONDITIONAL_Z] = ".z",
+ [BRW_CONDITIONAL_NZ] = ".nz",
+ [BRW_CONDITIONAL_G] = ".g",
+ [BRW_CONDITIONAL_GE] = ".ge",
+ [BRW_CONDITIONAL_L] = ".l",
+ [BRW_CONDITIONAL_LE] = ".le",
+ [BRW_CONDITIONAL_R] = ".r",
+ [BRW_CONDITIONAL_O] = ".o",
+ [BRW_CONDITIONAL_U] = ".u",
+};
+
+static const char *const m_negate[2] = {
+ [0] = "",
+ [1] = "-",
+};
+
+static const char *const _abs[2] = {
+ [0] = "",
+ [1] = "(abs)",
+};
+
+static const char *const m_bitnot[2] = { "", "~" };
+
+static const char *const vert_stride[16] = {
+ [0] = "0",
+ [1] = "1",
+ [2] = "2",
+ [3] = "4",
+ [4] = "8",
+ [5] = "16",
+ [6] = "32",
+ [15] = "VxH",
+};
+
+static const char *const width[8] = {
+ [0] = "1",
+ [1] = "2",
+ [2] = "4",
+ [3] = "8",
+ [4] = "16",
+};
+
+static const char *const horiz_stride[4] = {
+ [0] = "0",
+ [1] = "1",
+ [2] = "2",
+ [3] = "4"
+};
+
+static const char *const chan_sel[4] = {
+ [0] = "x",
+ [1] = "y",
+ [2] = "z",
+ [3] = "w",
+};
+
+static const char *const debug_ctrl[2] = {
+ [0] = "",
+ [1] = ".breakpoint"
+};
+
+static const char *const saturate[2] = {
+ [0] = "",
+ [1] = ".sat"
+};
+
+static const char *const cmpt_ctrl[2] = {
+ [0] = "",
+ [1] = "compacted"
+};
+
+static const char *const accwr[2] = {
+ [0] = "",
+ [1] = "AccWrEnable"
+};
+
+static const char *const branch_ctrl[2] = {
+ [0] = "",
+ [1] = "BranchCtrl"
+};
+
+static const char *const wectrl[2] = {
+ [0] = "",
+ [1] = "WE_all"
+};
+
+static const char *const exec_size[8] = {
+ [0] = "1",
+ [1] = "2",
+ [2] = "4",
+ [3] = "8",
+ [4] = "16",
+ [5] = "32"
+};
+
+static const char *const pred_inv[2] = {
+ [0] = "+",
+ [1] = "-"
+};
+
+const char *const pred_ctrl_align16[16] = {
+ [1] = "",
+ [2] = ".x",
+ [3] = ".y",
+ [4] = ".z",
+ [5] = ".w",
+ [6] = ".any4h",
+ [7] = ".all4h",
+};
+
+static const char *const pred_ctrl_align1[16] = {
+ [BRW_PREDICATE_NORMAL] = "",
+ [BRW_PREDICATE_ALIGN1_ANYV] = ".anyv",
+ [BRW_PREDICATE_ALIGN1_ALLV] = ".allv",
+ [BRW_PREDICATE_ALIGN1_ANY2H] = ".any2h",
+ [BRW_PREDICATE_ALIGN1_ALL2H] = ".all2h",
+ [BRW_PREDICATE_ALIGN1_ANY4H] = ".any4h",
+ [BRW_PREDICATE_ALIGN1_ALL4H] = ".all4h",
+ [BRW_PREDICATE_ALIGN1_ANY8H] = ".any8h",
+ [BRW_PREDICATE_ALIGN1_ALL8H] = ".all8h",
+ [BRW_PREDICATE_ALIGN1_ANY16H] = ".any16h",
+ [BRW_PREDICATE_ALIGN1_ALL16H] = ".all16h",
+ [BRW_PREDICATE_ALIGN1_ANY32H] = ".any32h",
+ [BRW_PREDICATE_ALIGN1_ALL32H] = ".all32h",
+};
+
+static const char *const thread_ctrl[4] = {
+ [BRW_THREAD_NORMAL] = "",
+ [BRW_THREAD_ATOMIC] = "atomic",
+ [BRW_THREAD_SWITCH] = "switch",
+};
+
+static const char *const compr_ctrl[4] = {
+ [0] = "",
+ [1] = "sechalf",
+ [2] = "compr",
+ [3] = "compr4",
+};
+
+static const char *const dep_ctrl[4] = {
+ [0] = "",
+ [1] = "NoDDClr",
+ [2] = "NoDDChk",
+ [3] = "NoDDClr,NoDDChk",
+};
+
+static const char *const mask_ctrl[4] = {
+ [0] = "",
+ [1] = "nomask",
+};
+
+static const char *const access_mode[2] = {
+ [0] = "align1",
+ [1] = "align16",
+};
+
+static const char * const reg_encoding[] = {
+ [BRW_HW_REG_TYPE_UD] = "UD",
+ [BRW_HW_REG_TYPE_D] = "D",
+ [BRW_HW_REG_TYPE_UW] = "UW",
+ [BRW_HW_REG_TYPE_W] = "W",
+ [BRW_HW_REG_NON_IMM_TYPE_UB] = "UB",
+ [BRW_HW_REG_NON_IMM_TYPE_B] = "B",
+ [GEN7_HW_REG_NON_IMM_TYPE_DF] = "DF",
+ [BRW_HW_REG_TYPE_F] = "F",
+ [GEN8_HW_REG_TYPE_UQ] = "UQ",
+ [GEN8_HW_REG_TYPE_Q] = "Q",
+ [GEN8_HW_REG_NON_IMM_TYPE_HF] = "HF",
+};
+
+static const char *const three_source_reg_encoding[] = {
+ [BRW_3SRC_TYPE_F] = "F",
+ [BRW_3SRC_TYPE_D] = "D",
+ [BRW_3SRC_TYPE_UD] = "UD",
+ [BRW_3SRC_TYPE_DF] = "DF",
+};
+
+static const char *const reg_file[4] = {
+ [0] = "A",
+ [1] = "g",
+ [2] = "m",
+ [3] = "imm",
+};
+
+static const char *const writemask[16] = {
+ [0x0] = ".",
+ [0x1] = ".x",
+ [0x2] = ".y",
+ [0x3] = ".xy",
+ [0x4] = ".z",
+ [0x5] = ".xz",
+ [0x6] = ".yz",
+ [0x7] = ".xyz",
+ [0x8] = ".w",
+ [0x9] = ".xw",
+ [0xa] = ".yw",
+ [0xb] = ".xyw",
+ [0xc] = ".zw",
+ [0xd] = ".xzw",
+ [0xe] = ".yzw",
+ [0xf] = "",
+};
+
+static const char *const end_of_thread[2] = {
+ [0] = "",
+ [1] = "EOT"
+};
+
+/* SFIDs on Gen4-5 */
+static const char *const gen4_sfid[16] = {
+ [BRW_SFID_NULL] = "null",
+ [BRW_SFID_MATH] = "math",
+ [BRW_SFID_SAMPLER] = "sampler",
+ [BRW_SFID_MESSAGE_GATEWAY] = "gateway",
+ [BRW_SFID_DATAPORT_READ] = "read",
+ [BRW_SFID_DATAPORT_WRITE] = "write",
+ [BRW_SFID_URB] = "urb",
+ [BRW_SFID_THREAD_SPAWNER] = "thread_spawner",
+ [BRW_SFID_VME] = "vme",
+};
+
+static const char *const gen6_sfid[16] = {
+ [BRW_SFID_NULL] = "null",
+ [BRW_SFID_MATH] = "math",
+ [BRW_SFID_SAMPLER] = "sampler",
+ [BRW_SFID_MESSAGE_GATEWAY] = "gateway",
+ [BRW_SFID_URB] = "urb",
+ [BRW_SFID_THREAD_SPAWNER] = "thread_spawner",
+ [GEN6_SFID_DATAPORT_SAMPLER_CACHE] = "sampler",
+ [GEN6_SFID_DATAPORT_RENDER_CACHE] = "render",
+ [GEN6_SFID_DATAPORT_CONSTANT_CACHE] = "const",
+ [GEN7_SFID_DATAPORT_DATA_CACHE] = "data",
+ [GEN7_SFID_PIXEL_INTERPOLATOR] = "pixel interp",
+ [HSW_SFID_DATAPORT_DATA_CACHE_1] = "dp data 1",
+ [HSW_SFID_CRE] = "cre",
+};
+
+static const char *const gen7_gateway_subfuncid[8] = {
+ [BRW_MESSAGE_GATEWAY_SFID_OPEN_GATEWAY] = "open",
+ [BRW_MESSAGE_GATEWAY_SFID_CLOSE_GATEWAY] = "close",
+ [BRW_MESSAGE_GATEWAY_SFID_FORWARD_MSG] = "forward msg",
+ [BRW_MESSAGE_GATEWAY_SFID_GET_TIMESTAMP] = "get timestamp",
+ [BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG] = "barrier msg",
+ [BRW_MESSAGE_GATEWAY_SFID_UPDATE_GATEWAY_STATE] = "update state",
+ [BRW_MESSAGE_GATEWAY_SFID_MMIO_READ_WRITE] = "mmio read/write",
+};
+
+static const char *const gen4_dp_read_port_msg_type[4] = {
+ [0b00] = "OWord Block Read",
+ [0b01] = "OWord Dual Block Read",
+ [0b10] = "Media Block Read",
+ [0b11] = "DWord Scattered Read",
+};
+
+static const char *const g45_dp_read_port_msg_type[8] = {
+ [0b000] = "OWord Block Read",
+ [0b010] = "OWord Dual Block Read",
+ [0b100] = "Media Block Read",
+ [0b110] = "DWord Scattered Read",
+ [0b001] = "Render Target UNORM Read",
+ [0b011] = "AVC Loop Filter Read",
+};
+
+static const char *const dp_write_port_msg_type[8] = {
+ [0b000] = "OWord block write",
+ [0b001] = "OWord dual block write",
+ [0b010] = "media block write",
+ [0b011] = "DWord scattered write",
+ [0b100] = "RT write",
+ [0b101] = "streamed VB write",
+ [0b110] = "RT UNORM write", /* G45+ */
+ [0b111] = "flush render cache",
+};
+
+static const char *const dp_rc_msg_type_gen6[16] = {
+ [BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ] = "OWORD block read",
+ [GEN6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ] = "RT UNORM read",
+ [GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ] = "OWORD dual block read",
+ [GEN6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ] = "media block read",
+ [GEN6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ] =
+ "OWORD unaligned block read",
+ [GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ] = "DWORD scattered read",
+ [GEN6_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE] = "DWORD atomic write",
+ [GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE] = "OWORD block write",
+ [GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE] =
+ "OWORD dual block write",
+ [GEN6_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE] = "media block write",
+ [GEN6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE] =
+ "DWORD scattered write",
+ [GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE] = "RT write",
+ [GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE] = "streamed VB write",
+ [GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE] = "RT UNORM write",
+};
+
+static const char *const dp_rc_msg_type_gen7[16] = {
+ [GEN7_DATAPORT_RC_MEDIA_BLOCK_READ] = "media block read",
+ [GEN7_DATAPORT_RC_TYPED_SURFACE_READ] = "typed surface read",
+ [GEN7_DATAPORT_RC_TYPED_ATOMIC_OP] = "typed atomic op",
+ [GEN7_DATAPORT_RC_MEMORY_FENCE] = "memory fence",
+ [GEN7_DATAPORT_RC_MEDIA_BLOCK_WRITE] = "media block write",
+ [GEN7_DATAPORT_RC_RENDER_TARGET_WRITE] = "RT write",
+ [GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE] = "typed surface write"
+};
+
+static const char *const dp_rc_msg_type_gen9[16] = {
+ [GEN9_DATAPORT_RC_RENDER_TARGET_WRITE] = "RT write",
+ [GEN9_DATAPORT_RC_RENDER_TARGET_READ] = "RT read"
+};
+
+/**
+ * Select the message type name table matching the hardware generation:
+ * the Gen9 table for Gen9+, the Gen7 table for Gen7-8, the Gen6 table for
+ * Gen6 and the generic write port table for anything older.
+ */
+static const char *const *
+dp_rc_msg_type(const struct gen_device_info *devinfo)
+{
+   if (devinfo->gen >= 9)
+      return dp_rc_msg_type_gen9;
+
+   if (devinfo->gen >= 7)
+      return dp_rc_msg_type_gen7;
+
+   if (devinfo->gen >= 6)
+      return dp_rc_msg_type_gen6;
+
+   return dp_write_port_msg_type;
+}
+
+static const char *const m_rt_write_subtype[] = {
+ [0b000] = "SIMD16",
+ [0b001] = "SIMD16/RepData",
+ [0b010] = "SIMD8/DualSrcLow",
+ [0b011] = "SIMD8/DualSrcHigh",
+ [0b100] = "SIMD8",
+ [0b101] = "SIMD8/ImageWrite", /* Gen6+ */
+ [0b111] = "SIMD16/RepData-111", /* no idea how this is different than 1 */
+};
+
+/* Names printed for the GEN7_DATAPORT_DC_* message types; entries without
+ * an initializer stay NULL.
+ */
+static const char *const dp_dc0_msg_type_gen7[16] = {
+   [GEN7_DATAPORT_DC_OWORD_BLOCK_READ] = "DC OWORD block read",
+   [GEN7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ] =
+      "DC unaligned OWORD block read",
+   [GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_READ] = "DC OWORD dual block read",
+   [GEN7_DATAPORT_DC_DWORD_SCATTERED_READ] = "DC DWORD scattered read",
+   [GEN7_DATAPORT_DC_BYTE_SCATTERED_READ] = "DC byte scattered read",
+   [GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ] = "DC untyped surface read",
+   [GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP] = "DC untyped atomic",
+   [GEN7_DATAPORT_DC_MEMORY_FENCE] = "DC mfence",
+   [GEN7_DATAPORT_DC_OWORD_BLOCK_WRITE] = "DC OWORD block write",
+   [GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE] = "DC OWORD dual block write",
+   /* Spelling fixed: this entry used to read "scatterd". */
+   [GEN7_DATAPORT_DC_DWORD_SCATTERED_WRITE] = "DC DWORD scattered write",
+   [GEN7_DATAPORT_DC_BYTE_SCATTERED_WRITE] = "DC byte scattered write",
+   [GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE] = "DC untyped surface write",
+};
+
+static const char *const dp_dc1_msg_type_hsw[16] = {
+ [HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ] = "untyped surface read",
+ [HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP] = "DC untyped atomic op",
+ [HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2] =
+ "DC untyped 4x2 atomic op",
+ [HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_READ] = "DC media block read",
+ [HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ] = "DC typed surface read",
+ [HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP] = "DC typed atomic",
+ [HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2] = "DC typed 4x2 atomic op",
+ [HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE] = "DC untyped surface write",
+ [HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_WRITE] = "DC media block write",
+ [HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP] = "DC atomic counter op",
+ [HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2] =
+ "DC 4x2 atomic counter op",
+ [HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE] = "DC typed surface write",
+};
+
+static const char *const aop[16] = {
+ [BRW_AOP_AND] = "and",
+ [BRW_AOP_OR] = "or",
+ [BRW_AOP_XOR] = "xor",
+ [BRW_AOP_MOV] = "mov",
+ [BRW_AOP_INC] = "inc",
+ [BRW_AOP_DEC] = "dec",
+ [BRW_AOP_ADD] = "add",
+ [BRW_AOP_SUB] = "sub",
+ [BRW_AOP_REVSUB] = "revsub",
+ [BRW_AOP_IMAX] = "imax",
+ [BRW_AOP_IMIN] = "imin",
+ [BRW_AOP_UMAX] = "umax",
+ [BRW_AOP_UMIN] = "umin",
+ [BRW_AOP_CMPWR] = "cmpwr",
+ [BRW_AOP_PREDEC] = "predec",
+};
+
+static const char * const pixel_interpolator_msg_types[4] = {
+ [GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET] = "per_message_offset",
+ [GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE] = "sample_position",
+ [GEN7_PIXEL_INTERPOLATOR_LOC_CENTROID] = "centroid",
+ [GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET] = "per_slot_offset",
+};
+
+static const char *const math_function[16] = {
+ [BRW_MATH_FUNCTION_INV] = "inv",
+ [BRW_MATH_FUNCTION_LOG] = "log",
+ [BRW_MATH_FUNCTION_EXP] = "exp",
+ [BRW_MATH_FUNCTION_SQRT] = "sqrt",
+ [BRW_MATH_FUNCTION_RSQ] = "rsq",
+ [BRW_MATH_FUNCTION_SIN] = "sin",
+ [BRW_MATH_FUNCTION_COS] = "cos",
+ [BRW_MATH_FUNCTION_SINCOS] = "sincos",
+ [BRW_MATH_FUNCTION_FDIV] = "fdiv",
+ [BRW_MATH_FUNCTION_POW] = "pow",
+ [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod",
+ [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv",
+ [BRW_MATH_FUNCTION_INT_DIV_REMAINDER] = "intmod",
+ [GEN8_MATH_FUNCTION_INVM] = "invm",
+ [GEN8_MATH_FUNCTION_RSQRTM] = "rsqrtm",
+};
+
+static const char *const math_saturate[2] = {
+ [0] = "",
+ [1] = "sat"
+};
+
+static const char *const math_signed[2] = {
+ [0] = "",
+ [1] = "signed"
+};
+
+static const char *const math_scalar[2] = {
+ [0] = "",
+ [1] = "scalar"
+};
+
+static const char *const math_precision[2] = {
+ [0] = "",
+ [1] = "partial_precision"
+};
+
+static const char *const gen5_urb_opcode[] = {
+ [0] = "urb_write",
+ [1] = "ff_sync",
+};
+
+static const char *const gen7_urb_opcode[] = {
+ [BRW_URB_OPCODE_WRITE_HWORD] = "write HWord",
+ [BRW_URB_OPCODE_WRITE_OWORD] = "write OWord",
+ [BRW_URB_OPCODE_READ_HWORD] = "read HWord",
+ [BRW_URB_OPCODE_READ_OWORD] = "read OWord",
+ [GEN7_URB_OPCODE_ATOMIC_MOV] = "atomic mov", /* Gen7+ */
+ [GEN7_URB_OPCODE_ATOMIC_INC] = "atomic inc", /* Gen7+ */
+ [GEN8_URB_OPCODE_ATOMIC_ADD] = "atomic add", /* Gen8+ */
+ [GEN8_URB_OPCODE_SIMD8_WRITE] = "SIMD8 write", /* Gen8+ */
+ [GEN8_URB_OPCODE_SIMD8_READ] = "SIMD8 read", /* Gen8+ */
+ /* [9-15] - reserved */
+};
+
+static const char *const urb_swizzle[4] = {
+ [BRW_URB_SWIZZLE_NONE] = "",
+ [BRW_URB_SWIZZLE_INTERLEAVE] = "interleave",
+ [BRW_URB_SWIZZLE_TRANSPOSE] = "transpose",
+};
+
+static const char *const urb_allocate[2] = {
+ [0] = "",
+ [1] = "allocate"
+};
+
+static const char *const urb_used[2] = {
+ [0] = "",
+ [1] = "used"
+};
+
+static const char *const urb_complete[2] = {
+ [0] = "",
+ [1] = "complete"
+};
+
+static const char *const gen5_sampler_msg_type[] = {
+ [GEN5_SAMPLER_MESSAGE_SAMPLE] = "sample",
+ [GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS] = "sample_b",
+ [GEN5_SAMPLER_MESSAGE_SAMPLE_LOD] = "sample_l",
+ [GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE] = "sample_c",
+ [GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS] = "sample_d",
+ [GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE] = "sample_b_c",
+ [GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE] = "sample_l_c",
+ [GEN5_SAMPLER_MESSAGE_SAMPLE_LD] = "ld",
+ [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4] = "gather4",
+ [GEN5_SAMPLER_MESSAGE_LOD] = "lod",
+ [GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO] = "resinfo",
+ [GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO] = "sampleinfo",
+ [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C] = "gather4_c",
+ [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO] = "gather4_po",
+ [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C] = "gather4_po_c",
+ [HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE] = "sample_d_c",
+ [GEN9_SAMPLER_MESSAGE_SAMPLE_LZ] = "sample_lz",
+ [GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ] = "sample_c_lz",
+ [GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ] = "ld_lz",
+ [GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W] = "ld2dms_w",
+ [GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS] = "ld_mcs",
+ [GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS] = "ld2dms",
+ [GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS] = "ld2dss",
+};
+
+static const char *const gen5_sampler_simd_mode[4] = {
+ [BRW_SAMPLER_SIMD_MODE_SIMD4X2] = "SIMD4x2",
+ [BRW_SAMPLER_SIMD_MODE_SIMD8] = "SIMD8",
+ [BRW_SAMPLER_SIMD_MODE_SIMD16] = "SIMD16",
+ [BRW_SAMPLER_SIMD_MODE_SIMD32_64] = "SIMD32/64",
+};
+
+static const char *const sampler_target_format[4] = {
+ [0] = "F",
+ [2] = "UD",
+ [3] = "D"
+};
+
+
+static int column;
+
+/**
+ * Write \p string to \p file and advance the running output column by the
+ * length of the string.  Always returns 0.
+ */
+static int
+string(FILE *file, const char *string)
+{
+   const size_t length = strlen(string);
+
+   fputs(string, file);
+   column += length;
+   return 0;
+}
+
+static int
+format(FILE *f, const char *format, ...) PRINTFLIKE(2, 3);
+
+/**
+ * printf-style front end to string(): the arguments are rendered into a
+ * fixed 1024-byte scratch buffer (longer output is cut off by vsnprintf)
+ * and the result is written through string() so that the output column
+ * stays up to date.  Always returns 0.
+ */
+static int
+format(FILE *f, const char *format, ...)
+{
+   char text[1024];
+   va_list ap;
+
+   va_start(ap, format);
+   vsnprintf(text, sizeof(text) - 1, format, ap);
+   va_end(ap);
+
+   /* string() always returns 0, which is also this function's result. */
+   return string(f, text);
+}
+
+/** End the current output line and reset the column counter. Returns 0. */
+static int
+newline(FILE *f)
+{
+   column = 0;
+   putc('\n', f);
+   return 0;
+}
+
+/**
+ * Emit spaces until the output column reaches \p c.  At least one space is
+ * always written, even when the column is already at or past \p c.
+ * Returns 0.
+ */
+static int
+pad(FILE *f, int c)
+{
+   string(f, " ");
+   while (column < c)
+      string(f, " ");
+
+   return 0;
+}
+
+/**
+ * Print the name that the table \p ctrl associates with field value \p id.
+ *
+ * \param file   output stream
+ * \param name   human readable field name, only used in the error message
+ * \param ctrl   lookup table; a NULL entry marks an invalid value and an
+ *               empty string marks a value that prints nothing
+ * \param id     field value used as index into \p ctrl; the caller must
+ *               make sure it is within the bounds of the table
+ * \param space  optional separator state: when non-NULL and non-zero, a
+ *               blank is printed before the name, and it is set to 1 once a
+ *               non-empty name has been printed
+ *
+ * \return 0 on success, 1 if \p id has no entry in \p ctrl.
+ */
+static int
+control(FILE *file, const char *name, const char *const ctrl[],
+        unsigned id, int *space)
+{
+   if (!ctrl[id]) {
+      /* Report through format() rather than fprintf() so that the output
+       * column keeps tracking what was written, and print the unsigned
+       * value with the matching conversion.
+       */
+      format(file, "*** invalid %s value %u ", name, id);
+      return 1;
+   }
+
+   if (ctrl[id][0]) {
+      if (space && *space)
+         string(file, " ");
+      string(file, ctrl[id]);
+      if (space)
+         *space = 1;
+   }
+
+   return 0;
+}
+
+/**
+ * Print the mnemonic of opcode \p id as reported by brw_opcode_desc().
+ * Returns 0 on success; if no descriptor exists an error marker is printed
+ * instead and 1 is returned.
+ */
+static int
+print_opcode(FILE *file, const struct gen_device_info *devinfo,
+             enum opcode id)
+{
+   const struct opcode_desc *const desc = brw_opcode_desc(devinfo, id);
+
+   if (desc) {
+      string(file, desc->name);
+      return 0;
+   }
+
+   format(file, "*** invalid opcode value %d ", id);
+   return 1;
+}
+
+/**
+ * Print a register name from its register file and register number.
+ *
+ * Returns -1 after printing "ip" or "tdr0", which tells the caller that
+ * nothing further should be printed for the operand.  Otherwise returns 0
+ * on success and 1 if the register file has no known name.
+ */
+static int
+reg(FILE *file, unsigned _reg_file, unsigned _reg_nr)
+{
+   if (_reg_file != BRW_ARCHITECTURE_REGISTER_FILE) {
+      /* Clear the Compr4 instruction compression bit. */
+      if (_reg_file == BRW_MESSAGE_REGISTER_FILE)
+         _reg_nr &= ~BRW_MRF_COMPR4;
+
+      const int err = control(file, "src reg file", reg_file, _reg_file, NULL);
+      format(file, "%d", _reg_nr);
+      return err;
+   }
+
+   /* Architecture registers: the high nibble selects the register kind and
+    * the low nibble is the index printed after its name.
+    */
+   const unsigned index = _reg_nr & 0x0f;
+
+   switch (_reg_nr & 0xf0) {
+   case BRW_ARF_NULL:
+      string(file, "null");
+      break;
+   case BRW_ARF_ADDRESS:
+      format(file, "a%d", index);
+      break;
+   case BRW_ARF_ACCUMULATOR:
+      format(file, "acc%d", index);
+      break;
+   case BRW_ARF_FLAG:
+      format(file, "f%d", index);
+      break;
+   case BRW_ARF_MASK:
+      format(file, "mask%d", index);
+      break;
+   case BRW_ARF_MASK_STACK:
+      format(file, "msd%d", index);
+      break;
+   case BRW_ARF_STATE:
+      format(file, "sr%d", index);
+      break;
+   case BRW_ARF_CONTROL:
+      format(file, "cr%d", index);
+      break;
+   case BRW_ARF_NOTIFICATION_COUNT:
+      format(file, "n%d", index);
+      break;
+   case BRW_ARF_IP:
+      string(file, "ip");
+      return -1;
+   case BRW_ARF_TDR:
+      string(file, "tdr0");
+      return -1;
+   case BRW_ARF_TIMESTAMP:
+      format(file, "tm%d", index);
+      break;
+   default:
+      format(file, "ARF%d", _reg_nr);
+      break;
+   }
+
+   return 0;
+}
+
+/**
+ * Print the destination operand of a one/two-source instruction: register,
+ * optional subregister, then either the horizontal stride (align1) or the
+ * writemask (align16), followed by the register type.
+ *
+ * Registers for which reg() returns -1 ("ip", "tdr0") are printed bare and
+ * count as success.
+ *
+ * \return 0 on success, non-zero if a field held a value without a known
+ *         name or if the (unsupported) indirect align16 addressing mode was
+ *         encountered.  The accumulated error used to be dropped by an
+ *         unconditional "return 0".
+ */
+static int
+dest(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   unsigned elem_size = brw_element_size(devinfo, inst, dst);
+   int err = 0;
+
+   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+      if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
+         err |= reg(file, brw_inst_dst_reg_file(devinfo, inst),
+                    brw_inst_dst_da_reg_nr(devinfo, inst));
+         if (err == -1)
+            return 0;
+         /* The subregister number is scaled by the element size before
+          * printing.
+          */
+         if (brw_inst_dst_da1_subreg_nr(devinfo, inst))
+            format(file, ".%"PRIu64, brw_inst_dst_da1_subreg_nr(devinfo, inst) /
+                   elem_size);
+         string(file, "<");
+         err |= control(file, "horiz stride", horiz_stride,
+                        brw_inst_dst_hstride(devinfo, inst), NULL);
+         string(file, ">");
+         err |= control(file, "dest reg encoding", reg_encoding,
+                        brw_inst_dst_reg_type(devinfo, inst), NULL);
+      } else {
+         string(file, "g[a0");
+         if (brw_inst_dst_ia_subreg_nr(devinfo, inst))
+            format(file, ".%"PRIu64, brw_inst_dst_ia_subreg_nr(devinfo, inst) /
+                   elem_size);
+         if (brw_inst_dst_ia1_addr_imm(devinfo, inst))
+            format(file, " %d", brw_inst_dst_ia1_addr_imm(devinfo, inst));
+         string(file, "]<");
+         err |= control(file, "horiz stride", horiz_stride,
+                        brw_inst_dst_hstride(devinfo, inst), NULL);
+         string(file, ">");
+         err |= control(file, "dest reg encoding", reg_encoding,
+                        brw_inst_dst_reg_type(devinfo, inst), NULL);
+      }
+   } else {
+      if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
+         err |= reg(file, brw_inst_dst_reg_file(devinfo, inst),
+                    brw_inst_dst_da_reg_nr(devinfo, inst));
+         if (err == -1)
+            return 0;
+         if (brw_inst_dst_da16_subreg_nr(devinfo, inst))
+            format(file, ".%u", 16 / elem_size);
+         string(file, "<1>");
+         err |= control(file, "writemask", writemask,
+                        brw_inst_da16_writemask(devinfo, inst), NULL);
+         err |= control(file, "dest reg encoding", reg_encoding,
+                        brw_inst_dst_reg_type(devinfo, inst), NULL);
+      } else {
+         err = 1;
+         string(file, "Indirect align16 address mode not supported");
+      }
+   }
+
+   /* Hand the accumulated status to the caller instead of discarding it. */
+   return err;
+}
+
+/**
+ * Print the destination operand of a three-source instruction.
+ *
+ * The destination is a general register, except on gen6 where a non-zero
+ * destination register-file field selects the message register file.  The
+ * operand is printed as "<reg>[.sub]<1><writemask><type>".
+ *
+ * \param file     stream the text is written to
+ * \param devinfo  device description used to decode the instruction fields
+ * \param inst     instruction whose destination is printed
+ *
+ * \return 0 on success, non-zero if any field could not be decoded.  When
+ *         reg() returns -1 nothing further is printed and 0 is returned.
+ */
+static int
+dest_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   int err = 0;
+   uint32_t reg_file;
+
+   if (devinfo->gen == 6 && brw_inst_3src_dst_reg_file(devinfo, inst))
+      reg_file = BRW_MESSAGE_REGISTER_FILE;
+   else
+      reg_file = BRW_GENERAL_REGISTER_FILE;
+
+   err |= reg(file, reg_file, brw_inst_3src_dst_reg_nr(devinfo, inst));
+   if (err == -1)
+      return 0;
+   if (brw_inst_3src_dst_subreg_nr(devinfo, inst))
+      format(file, ".%"PRIu64, brw_inst_3src_dst_subreg_nr(devinfo, inst));
+   string(file, "<1>");
+   err |= control(file, "writemask", writemask,
+                  brw_inst_3src_dst_writemask(devinfo, inst), NULL);
+   err |= control(file, "dest reg encoding", three_source_reg_encoding,
+                  brw_inst_3src_dst_type(devinfo, inst), NULL);
+
+   /* Propagate decode failures rather than always reporting success. */
+   return err;
+}
+
+/**
+ * Print an Align1 source region in the form "<vstride,width,hstride>".
+ *
+ * Each of the three fields is printed through control() using its own
+ * name table.
+ *
+ * \return the OR of the three control() results.
+ */
+static int
+src_align1_region(FILE *file,
+                  unsigned _vert_stride, unsigned _width,
+                  unsigned _horiz_stride)
+{
+   int status;
+
+   string(file, "<");
+   status = control(file, "vert stride", vert_stride, _vert_stride, NULL);
+
+   string(file, ",");
+   status |= control(file, "width", width, _width, NULL);
+
+   string(file, ",");
+   status |= control(file, "horiz_stride", horiz_stride, _horiz_stride, NULL);
+
+   string(file, ">");
+   return status;
+}
+
+/**
+ * Print an Align1, directly addressed source operand:
+ * "[modifier][abs]<reg>[.sub]<vstride,width,hstride><type>".
+ *
+ * On gen8+ the negate bit of a logic instruction is printed from the
+ * "bitnot" table; everywhere else it is printed from the "negate" table.
+ *
+ * \param sub_reg_num  subregister offset; it is divided by the element size
+ *                     of \p type before being printed
+ *
+ * \return 0 on success, non-zero if any field could not be decoded.  When
+ *         reg() returns -1 nothing further is printed and 0 is returned.
+ */
+static int
+src_da1(FILE *file,
+        const struct gen_device_info *devinfo,
+        unsigned opcode,
+        unsigned type, unsigned _reg_file,
+        unsigned _vert_stride, unsigned _width, unsigned _horiz_stride,
+        unsigned reg_num, unsigned sub_reg_num, unsigned __abs,
+        unsigned _negate)
+{
+   int err = 0;
+
+   if (devinfo->gen >= 8 && is_logic_instruction(opcode))
+      err |= control(file, "bitnot", m_bitnot, _negate, NULL);
+   else
+      err |= control(file, "negate", m_negate, _negate, NULL);
+
+   err |= control(file, "abs", _abs, __abs, NULL);
+
+   err |= reg(file, _reg_file, reg_num);
+   if (err == -1)
+      return 0;
+   if (sub_reg_num) {
+      unsigned elem_size = brw_hw_reg_type_to_size(devinfo, type, _reg_file);
+      format(file, ".%d", sub_reg_num / elem_size);   /* use formal style like spec */
+   }
+   /* Keep the region's status: it was previously discarded, hiding an
+    * invalid stride or width encoding from the caller.
+    */
+   err |= src_align1_region(file, _vert_stride, _width, _horiz_stride);
+   err |= control(file, "src reg encoding", reg_encoding, type, NULL);
+   return err;
+}
+
+/**
+ * Print an Align1, register-indirect source operand:
+ * "[modifier][abs]g[a0[.sub][ imm]]<vstride,width,hstride><type>".
+ *
+ * On gen8+ the negate bit of a logic instruction is printed from the
+ * "bitnot" table; everywhere else it is printed from the "negate" table.
+ *
+ * Note the parameter order: the region arrives as horizontal stride,
+ * width, vertical stride, and is reordered when it is printed.
+ *
+ * \return 0 on success, non-zero if any field could not be decoded.
+ */
+static int
+src_ia1(FILE *file,
+        const struct gen_device_info *devinfo,
+        unsigned opcode,
+        unsigned type,
+        unsigned _reg_file,
+        int _addr_imm,
+        unsigned _addr_subreg_nr,
+        unsigned _negate,
+        unsigned __abs,
+        unsigned _horiz_stride, unsigned _width, unsigned _vert_stride)
+{
+   int err = 0;
+
+   if (devinfo->gen >= 8 && is_logic_instruction(opcode))
+      err |= control(file, "bitnot", m_bitnot, _negate, NULL);
+   else
+      err |= control(file, "negate", m_negate, _negate, NULL);
+
+   err |= control(file, "abs", _abs, __abs, NULL);
+
+   string(file, "g[a0");
+   if (_addr_subreg_nr)
+      format(file, ".%d", _addr_subreg_nr);
+   if (_addr_imm)
+      format(file, " %d", _addr_imm);
+   string(file, "]");
+   /* Keep the region's status: it was previously discarded, hiding an
+    * invalid stride or width encoding from the caller.
+    */
+   err |= src_align1_region(file, _vert_stride, _width, _horiz_stride);
+   err |= control(file, "src reg encoding", reg_encoding, type, NULL);
+   return err;
+}
+
+/**
+ * Print the swizzle suffix of an Align16 source.
+ *
+ * - all four channels equal: "." followed by that single channel;
+ * - the identity swizzle (BRW_SWIZZLE_XYZW): nothing is printed;
+ * - anything else: "." followed by all four channels.
+ *
+ * \return the OR of the control() results for the channels printed.
+ */
+static int
+src_swizzle(FILE *file, unsigned swiz)
+{
+   const unsigned chan[4] = {
+      BRW_GET_SWZ(swiz, BRW_CHANNEL_X),
+      BRW_GET_SWZ(swiz, BRW_CHANNEL_Y),
+      BRW_GET_SWZ(swiz, BRW_CHANNEL_Z),
+      BRW_GET_SWZ(swiz, BRW_CHANNEL_W),
+   };
+   const bool replicated =
+      chan[0] == chan[1] && chan[0] == chan[2] && chan[0] == chan[3];
+   unsigned count;
+   int status = 0;
+
+   if (replicated)
+      count = 1;
+   else if (swiz == BRW_SWIZZLE_XYZW)
+      return 0;
+   else
+      count = 4;
+
+   string(file, ".");
+   for (unsigned i = 0; i < count; i++)
+      status |= control(file, "channel select", chan_sel, chan[i], NULL);
+
+   return status;
+}
+
+/**
+ * Print an Align16, directly addressed source operand:
+ * "[modifier][abs]<reg>[.sub]<vstride>[.swizzle]<type>".
+ *
+ * On gen8+ the negate bit of a logic instruction is printed from the
+ * "bitnot" table; everywhere else it is printed from the "negate" table.
+ *
+ * \return 0 on success, non-zero if any field could not be decoded.  When
+ *         reg() returns -1 nothing further is printed and 0 is returned.
+ */
+static int
+src_da16(FILE *file,
+         const struct gen_device_info *devinfo,
+         unsigned opcode,
+         unsigned _reg_type,
+         unsigned _reg_file,
+         unsigned _vert_stride,
+         unsigned _reg_nr,
+         unsigned _subreg_nr,
+         unsigned __abs,
+         unsigned _negate,
+         unsigned swz_x, unsigned swz_y, unsigned swz_z, unsigned swz_w)
+{
+   const bool logic_op = devinfo->gen >= 8 && is_logic_instruction(opcode);
+   int status;
+
+   status = logic_op ? control(file, "bitnot", m_bitnot, _negate, NULL)
+                     : control(file, "negate", m_negate, _negate, NULL);
+   status |= control(file, "abs", _abs, __abs, NULL);
+   status |= reg(file, _reg_file, _reg_nr);
+
+   /* reg() has printed everything there is to print for this register. */
+   if (status == -1)
+      return 0;
+
+   if (_subreg_nr) {
+      /* The Align16 subregister field is a single bit (bit 4 of the byte
+       * address).  Print it in elements, as the Align1 path does, so the
+       * two outputs look consistent.
+       */
+      unsigned bytes_per_elem =
+         brw_hw_reg_type_to_size(devinfo, _reg_type, _reg_file);
+
+      format(file, ".%d", 16 / bytes_per_elem);
+   }
+
+   string(file, "<");
+   status |= control(file, "vert stride", vert_stride, _vert_stride, NULL);
+   string(file, ">");
+
+   status |= src_swizzle(file, BRW_SWIZZLE4(swz_x, swz_y, swz_z, swz_w));
+   status |= control(file, "src da16 reg type", reg_encoding, _reg_type, NULL);
+
+   return status;
+}
+
+/**
+ * Print source 0 of a three-source instruction.
+ *
+ * The register is always printed as a general register.  With replicate
+ * control set the operand is printed as "<reg>.<sub><0,1,0>"; otherwise as
+ * "<reg>[.sub]<4,4,1>[.swizzle]".  The source type follows in both cases.
+ *
+ * \return 0 on success, non-zero if any field could not be decoded.  When
+ *         reg() returns -1 nothing further is printed and 0 is returned.
+ */
+static int
+src0_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   const unsigned subreg = brw_inst_3src_src0_subreg_nr(devinfo, inst);
+   const bool replicate = brw_inst_3src_src0_rep_ctrl(devinfo, inst) != 0;
+   int status;
+
+   status = control(file, "negate", m_negate,
+                    brw_inst_3src_src0_negate(devinfo, inst), NULL);
+   status |= control(file, "abs", _abs,
+                     brw_inst_3src_src0_abs(devinfo, inst), NULL);
+   status |= reg(file, BRW_GENERAL_REGISTER_FILE,
+                 brw_inst_3src_src0_reg_nr(devinfo, inst));
+   if (status == -1)
+      return 0;
+
+   /* A replicated scalar always shows its subregister, even when zero. */
+   if (replicate || subreg)
+      format(file, ".%d", subreg);
+
+   if (replicate) {
+      string(file, "<0,1,0>");
+   } else {
+      string(file, "<4,4,1>");
+      status |= src_swizzle(file, brw_inst_3src_src0_swizzle(devinfo, inst));
+   }
+
+   status |= control(file, "src da16 reg type", three_source_reg_encoding,
+                     brw_inst_3src_src_type(devinfo, inst), NULL);
+   return status;
+}
+
+/**
+ * Print source 1 of a three-source instruction.
+ *
+ * The register is always printed as a general register.  With replicate
+ * control set the operand is printed as "<reg>.<sub><0,1,0>"; otherwise as
+ * "<reg>[.sub]<4,4,1>[.swizzle]".  The source type follows in both cases.
+ *
+ * \return 0 on success, non-zero if any field could not be decoded.  When
+ *         reg() returns -1 nothing further is printed and 0 is returned.
+ */
+static int
+src1_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   const unsigned subreg = brw_inst_3src_src1_subreg_nr(devinfo, inst);
+   const bool replicate = brw_inst_3src_src1_rep_ctrl(devinfo, inst) != 0;
+   int status;
+
+   status = control(file, "negate", m_negate,
+                    brw_inst_3src_src1_negate(devinfo, inst), NULL);
+   status |= control(file, "abs", _abs,
+                     brw_inst_3src_src1_abs(devinfo, inst), NULL);
+   status |= reg(file, BRW_GENERAL_REGISTER_FILE,
+                 brw_inst_3src_src1_reg_nr(devinfo, inst));
+   if (status == -1)
+      return 0;
+
+   /* A replicated scalar always shows its subregister, even when zero. */
+   if (replicate || subreg)
+      format(file, ".%d", subreg);
+
+   if (replicate) {
+      string(file, "<0,1,0>");
+   } else {
+      string(file, "<4,4,1>");
+      status |= src_swizzle(file, brw_inst_3src_src1_swizzle(devinfo, inst));
+   }
+
+   status |= control(file, "src da16 reg type", three_source_reg_encoding,
+                     brw_inst_3src_src_type(devinfo, inst), NULL);
+   return status;
+}
+
+
+/**
+ * Print source 2 of a three-source instruction.
+ *
+ * The register is always printed as a general register.  With replicate
+ * control set the operand is printed as "<reg>.<sub><0,1,0>"; otherwise as
+ * "<reg>[.sub]<4,4,1>[.swizzle]".  The source type follows in both cases.
+ *
+ * \return 0 on success, non-zero if any field could not be decoded.  When
+ *         reg() returns -1 nothing further is printed and 0 is returned.
+ */
+static int
+src2_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   const unsigned subreg = brw_inst_3src_src2_subreg_nr(devinfo, inst);
+   const bool replicate = brw_inst_3src_src2_rep_ctrl(devinfo, inst) != 0;
+   int status;
+
+   status = control(file, "negate", m_negate,
+                    brw_inst_3src_src2_negate(devinfo, inst), NULL);
+   status |= control(file, "abs", _abs,
+                     brw_inst_3src_src2_abs(devinfo, inst), NULL);
+   status |= reg(file, BRW_GENERAL_REGISTER_FILE,
+                 brw_inst_3src_src2_reg_nr(devinfo, inst));
+   if (status == -1)
+      return 0;
+
+   /* A replicated scalar always shows its subregister, even when zero. */
+   if (replicate || subreg)
+      format(file, ".%d", subreg);
+
+   if (replicate) {
+      string(file, "<0,1,0>");
+   } else {
+      string(file, "<4,4,1>");
+      status |= src_swizzle(file, brw_inst_3src_src2_swizzle(devinfo, inst));
+   }
+
+   status |= control(file, "src da16 reg type", three_source_reg_encoding,
+                     brw_inst_3src_src_type(devinfo, inst), NULL);
+   return status;
+}
+
+/**
+ * Print the immediate operand of an instruction, formatted by its type.
+ *
+ * Each value is followed by its type suffix (UD, D, UW, W, UV, V, VF, F,
+ * DF).  Half-float immediates are not decoded; a fixed placeholder string
+ * is printed instead.  A type not listed here prints nothing.
+ *
+ * \return always 0.
+ */
+static int
+imm(FILE *file, const struct gen_device_info *devinfo, unsigned type, brw_inst *inst)
+{
+   switch (type) {
+   /* Scalar integers. */
+   case BRW_HW_REG_TYPE_UD:
+      format(file, "0x%08xUD", brw_inst_imm_ud(devinfo, inst));
+      break;
+   case BRW_HW_REG_TYPE_D:
+      format(file, "%dD", brw_inst_imm_d(devinfo, inst));
+      break;
+   case BRW_HW_REG_TYPE_UW:
+      format(file, "0x%04xUW", (uint16_t) brw_inst_imm_ud(devinfo, inst));
+      break;
+   case BRW_HW_REG_TYPE_W:
+      format(file, "%dW", (int16_t) brw_inst_imm_d(devinfo, inst));
+      break;
+
+   /* Packed integer vectors, shown as the raw dword. */
+   case BRW_HW_REG_IMM_TYPE_UV:
+      format(file, "0x%08xUV", brw_inst_imm_ud(devinfo, inst));
+      break;
+   case BRW_HW_REG_IMM_TYPE_V:
+      format(file, "0x%08xV", brw_inst_imm_ud(devinfo, inst));
+      break;
+
+   /* Packed float vector: one byte per component, lowest byte first. */
+   case BRW_HW_REG_IMM_TYPE_VF: {
+      const unsigned packed = brw_inst_imm_ud(devinfo, inst);
+
+      format(file, "[%-gF, %-gF, %-gF, %-gF]VF",
+             brw_vf_to_float(packed),
+             brw_vf_to_float(packed >> 8),
+             brw_vf_to_float(packed >> 16),
+             brw_vf_to_float(packed >> 24));
+      break;
+   }
+
+   /* Floating point. */
+   case BRW_HW_REG_TYPE_F:
+      format(file, "%-gF", brw_inst_imm_f(devinfo, inst));
+      break;
+   case GEN8_HW_REG_IMM_TYPE_DF:
+      format(file, "%-gDF", brw_inst_imm_df(devinfo, inst));
+      break;
+   case GEN8_HW_REG_IMM_TYPE_HF:
+      string(file, "Half Float IMM");
+      break;
+   }
+
+   return 0;
+}
+
+/**
+ * Print source operand 0 of a one- or two-source instruction.
+ *
+ * Dispatches on the operand kind: immediate, Align1 direct, Align1
+ * register-indirect or Align16 direct.  Align16 indirect addressing is not
+ * supported; a message is printed and 1 is returned.
+ *
+ * \return the status of the printer that was selected.
+ */
+static int
+src0(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   const unsigned src_file = brw_inst_src0_reg_file(devinfo, inst);
+   const unsigned src_type = brw_inst_src0_reg_type(devinfo, inst);
+
+   if (src_file == BRW_IMMEDIATE_VALUE)
+      return imm(file, devinfo, src_type, inst);
+
+   const unsigned op = brw_inst_opcode(devinfo, inst);
+   const bool direct =
+      brw_inst_src0_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT;
+
+   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+      if (direct) {
+         return src_da1(file, devinfo, op, src_type, src_file,
+                        brw_inst_src0_vstride(devinfo, inst),
+                        brw_inst_src0_width(devinfo, inst),
+                        brw_inst_src0_hstride(devinfo, inst),
+                        brw_inst_src0_da_reg_nr(devinfo, inst),
+                        brw_inst_src0_da1_subreg_nr(devinfo, inst),
+                        brw_inst_src0_abs(devinfo, inst),
+                        brw_inst_src0_negate(devinfo, inst));
+      }
+
+      /* src_ia1() takes negate before abs and the region as
+       * hstride, width, vstride.
+       */
+      return src_ia1(file, devinfo, op, src_type, src_file,
+                     brw_inst_src0_ia1_addr_imm(devinfo, inst),
+                     brw_inst_src0_ia_subreg_nr(devinfo, inst),
+                     brw_inst_src0_negate(devinfo, inst),
+                     brw_inst_src0_abs(devinfo, inst),
+                     brw_inst_src0_hstride(devinfo, inst),
+                     brw_inst_src0_width(devinfo, inst),
+                     brw_inst_src0_vstride(devinfo, inst));
+   }
+
+   /* Align16 from here on. */
+   if (!direct) {
+      string(file, "Indirect align16 address mode not supported");
+      return 1;
+   }
+
+   return src_da16(file, devinfo, op, src_type, src_file,
+                   brw_inst_src0_vstride(devinfo, inst),
+                   brw_inst_src0_da_reg_nr(devinfo, inst),
+                   brw_inst_src0_da16_subreg_nr(devinfo, inst),
+                   brw_inst_src0_abs(devinfo, inst),
+                   brw_inst_src0_negate(devinfo, inst),
+                   brw_inst_src0_da16_swiz_x(devinfo, inst),
+                   brw_inst_src0_da16_swiz_y(devinfo, inst),
+                   brw_inst_src0_da16_swiz_z(devinfo, inst),
+                   brw_inst_src0_da16_swiz_w(devinfo, inst));
+}
+
+/**
+ * Print source operand 1 of a two-source instruction.
+ *
+ * Dispatches on the operand kind: immediate, Align1 direct, Align1
+ * register-indirect or Align16 direct.  Align16 indirect addressing is not
+ * supported; a message is printed and 1 is returned.
+ *
+ * \return the status of the printer that was selected.
+ */
+static int
+src1(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   const unsigned src_file = brw_inst_src1_reg_file(devinfo, inst);
+   const unsigned src_type = brw_inst_src1_reg_type(devinfo, inst);
+
+   if (src_file == BRW_IMMEDIATE_VALUE)
+      return imm(file, devinfo, src_type, inst);
+
+   const unsigned op = brw_inst_opcode(devinfo, inst);
+   const bool direct =
+      brw_inst_src1_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT;
+
+   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+      if (direct) {
+         return src_da1(file, devinfo, op, src_type, src_file,
+                        brw_inst_src1_vstride(devinfo, inst),
+                        brw_inst_src1_width(devinfo, inst),
+                        brw_inst_src1_hstride(devinfo, inst),
+                        brw_inst_src1_da_reg_nr(devinfo, inst),
+                        brw_inst_src1_da1_subreg_nr(devinfo, inst),
+                        brw_inst_src1_abs(devinfo, inst),
+                        brw_inst_src1_negate(devinfo, inst));
+      }
+
+      /* src_ia1() takes negate before abs and the region as
+       * hstride, width, vstride.
+       */
+      return src_ia1(file, devinfo, op, src_type, src_file,
+                     brw_inst_src1_ia1_addr_imm(devinfo, inst),
+                     brw_inst_src1_ia_subreg_nr(devinfo, inst),
+                     brw_inst_src1_negate(devinfo, inst),
+                     brw_inst_src1_abs(devinfo, inst),
+                     brw_inst_src1_hstride(devinfo, inst),
+                     brw_inst_src1_width(devinfo, inst),
+                     brw_inst_src1_vstride(devinfo, inst));
+   }
+
+   /* Align16 from here on. */
+   if (!direct) {
+      string(file, "Indirect align16 address mode not supported");
+      return 1;
+   }
+
+   return src_da16(file, devinfo, op, src_type, src_file,
+                   brw_inst_src1_vstride(devinfo, inst),
+                   brw_inst_src1_da_reg_nr(devinfo, inst),
+                   brw_inst_src1_da16_subreg_nr(devinfo, inst),
+                   brw_inst_src1_abs(devinfo, inst),
+                   brw_inst_src1_negate(devinfo, inst),
+                   brw_inst_src1_da16_swiz_x(devinfo, inst),
+                   brw_inst_src1_da16_swiz_y(devinfo, inst),
+                   brw_inst_src1_da16_swiz_z(devinfo, inst),
+                   brw_inst_src1_da16_swiz_w(devinfo, inst));
+}
+
+/**
+ * Print the quarter/nibble/half control suffix of an instruction.
+ *
+ * The suffix depends on the execution size:
+ *  - fewer than 8 channels: " <k>N", with k computed from the quarter
+ *    control and (gen7+ only) the nibble control;
+ *  - 8 channels: " 1Q" .. " 4Q" for quarter control 0..3;
+ *  - 16 channels: " 1H" for quarter control below 2, " 2H" otherwise;
+ *  - anything else: nothing.
+ *
+ * \return always 0.
+ */
+static int
+qtr_ctrl(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   static const char *const quarter_names[] = { " 1Q", " 2Q", " 3Q", " 4Q" };
+   int qtr_ctl = brw_inst_qtr_control(devinfo, inst);
+   int channels = 1 << brw_inst_exec_size(devinfo, inst);
+
+   if (channels == 16) {
+      string(file, qtr_ctl < 2 ? " 1H" : " 2H");
+   } else if (channels == 8) {
+      if (qtr_ctl >= 0 && qtr_ctl < 4)
+         string(file, quarter_names[qtr_ctl]);
+   } else if (channels < 8) {
+      /* Nibble control is only read on gen7+. */
+      const unsigned nib_ctl = devinfo->gen < 7 ? 0 :
+                               brw_inst_nib_control(devinfo, inst);
+      format(file, " %dN", qtr_ctl * 2 + nib_ctl + 1);
+   }
+
+   return 0;
+}
+
+#ifdef DEBUG
+/**
+ * Debug-build helper: disassemble one uncompacted instruction, given as
+ * four dwords, to stderr.
+ *
+ * dw0 is the least significant dword of the first 64-bit word and dw3 the
+ * most significant dword of the second.  Marked unused so that builds
+ * without a caller do not warn.
+ *
+ * \return the result of brw_disassemble_inst().
+ */
+static __attribute__((__unused__)) int
+brw_disassemble_imm(const struct gen_device_info *devinfo,
+                    uint32_t dw3, uint32_t dw2, uint32_t dw1, uint32_t dw0)
+{
+   const uint64_t low_qword = ((uint64_t) dw1 << 32) | dw0;
+   const uint64_t high_qword = ((uint64_t) dw3 << 32) | dw2;
+   brw_inst inst;
+
+   inst.data[0] = low_qword;
+   inst.data[1] = high_qword;
+
+   return brw_disassemble_inst(stderr, devinfo, &inst, false);
+}
+#endif
+
+/**
+ * Disassemble a single instruction and write it to \p file.
+ *
+ * The output is laid out in this order:
+ *   1. optional predicate "(...)";
+ *   2. opcode with saturate, debug control, and either the math function or
+ *      the conditional modifier (plus flag register where it applies);
+ *   3. execution size (omitted for NOP/NENOP);
+ *   4. jump targets or operands, depending on the opcode;
+ *   5. for SEND/SENDC, a second line describing the message;
+ *   6. the "{ ... }" instruction options block (omitted for NOP/NENOP);
+ *   7. a terminating ";" and newline.
+ *
+ * \param file          stream the text is written to
+ * \param devinfo       device description used to decode the fields
+ * \param inst          instruction to decode
+ * \param is_compacted  only affects the "compaction" entry of the options
+ *                      block
+ *
+ * \return the OR of the status codes returned by the field and operand
+ *         printers; 0 when none of them reported a problem.
+ */
+int
+brw_disassemble_inst(FILE *file, const struct gen_device_info *devinfo,
+                     brw_inst *inst, bool is_compacted)
+{
+   int err = 0;
+   int space = 0;
+
+   const enum opcode opcode = brw_inst_opcode(devinfo, inst);
+   const struct opcode_desc *desc = brw_opcode_desc(devinfo, opcode);
+
+   if (brw_inst_pred_control(devinfo, inst)) {
+      string(file, "(");
+      err |= control(file, "predicate inverse", pred_inv,
+                     brw_inst_pred_inv(devinfo, inst), NULL);
+      format(file, "f%"PRIu64, devinfo->gen >= 7 ? brw_inst_flag_reg_nr(devinfo, inst) : 0);
+      if (brw_inst_flag_subreg_nr(devinfo, inst))
+         format(file, ".%"PRIu64, brw_inst_flag_subreg_nr(devinfo, inst));
+      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+         err |= control(file, "predicate control align1", pred_ctrl_align1,
+                        brw_inst_pred_control(devinfo, inst), NULL);
+      } else {
+         err |= control(file, "predicate control align16", pred_ctrl_align16,
+                        brw_inst_pred_control(devinfo, inst), NULL);
+      }
+      string(file, ") ");
+   }
+
+   err |= print_opcode(file, devinfo, opcode);
+   err |= control(file, "saturate", saturate, brw_inst_saturate(devinfo, inst),
+                  NULL);
+
+   err |= control(file, "debug control", debug_ctrl,
+                  brw_inst_debug_control(devinfo, inst), NULL);
+
+   if (opcode == BRW_OPCODE_MATH) {
+      string(file, " ");
+      err |= control(file, "function", math_function,
+                     brw_inst_math_function(devinfo, inst), NULL);
+   } else if (opcode != BRW_OPCODE_SEND && opcode != BRW_OPCODE_SENDC) {
+      err |= control(file, "conditional modifier", conditional_modifier,
+                     brw_inst_cond_modifier(devinfo, inst), NULL);
+
+      /* If we're using the conditional modifier, print which flags reg is
+       * used for it.  Note that on gen6+, the embedded-condition SEL and
+       * control flow doesn't update flags.
+       */
+      if (brw_inst_cond_modifier(devinfo, inst) &&
+          (devinfo->gen < 6 || (opcode != BRW_OPCODE_SEL &&
+                                opcode != BRW_OPCODE_IF &&
+                                opcode != BRW_OPCODE_WHILE))) {
+         format(file, ".f%"PRIu64,
+                devinfo->gen >= 7 ? brw_inst_flag_reg_nr(devinfo, inst) : 0);
+         if (brw_inst_flag_subreg_nr(devinfo, inst))
+            format(file, ".%"PRIu64, brw_inst_flag_subreg_nr(devinfo, inst));
+      }
+   }
+
+   if (opcode != BRW_OPCODE_NOP && opcode != BRW_OPCODE_NENOP) {
+      string(file, "(");
+      err |= control(file, "execution size", exec_size,
+                     brw_inst_exec_size(devinfo, inst), NULL);
+      string(file, ")");
+   }
+
+   if (opcode == BRW_OPCODE_SEND && devinfo->gen < 6)
+      format(file, " %"PRIu64, brw_inst_base_mrf(devinfo, inst));
+
+   if (has_uip(devinfo, opcode)) {
+      /* Instructions that have UIP also have JIP. */
+      pad(file, 16);
+      format(file, "JIP: %d", brw_inst_jip(devinfo, inst));
+      pad(file, 32);
+      format(file, "UIP: %d", brw_inst_uip(devinfo, inst));
+   } else if (has_jip(devinfo, opcode)) {
+      pad(file, 16);
+      if (devinfo->gen >= 7) {
+         format(file, "JIP: %d", brw_inst_jip(devinfo, inst));
+      } else {
+         format(file, "JIP: %d", brw_inst_gen6_jump_count(devinfo, inst));
+      }
+   } else if (devinfo->gen < 6 && (opcode == BRW_OPCODE_BREAK ||
+                                   opcode == BRW_OPCODE_CONTINUE ||
+                                   opcode == BRW_OPCODE_ELSE)) {
+      pad(file, 16);
+      format(file, "Jump: %d", brw_inst_gen4_jump_count(devinfo, inst));
+      pad(file, 32);
+      format(file, "Pop: %"PRIu64, brw_inst_gen4_pop_count(devinfo, inst));
+   } else if (devinfo->gen < 6 && (opcode == BRW_OPCODE_IF ||
+                                   opcode == BRW_OPCODE_IFF ||
+                                   opcode == BRW_OPCODE_HALT)) {
+      pad(file, 16);
+      format(file, "Jump: %d", brw_inst_gen4_jump_count(devinfo, inst));
+   } else if (devinfo->gen < 6 && opcode == BRW_OPCODE_ENDIF) {
+      pad(file, 16);
+      format(file, "Pop: %"PRIu64, brw_inst_gen4_pop_count(devinfo, inst));
+   } else if (opcode == BRW_OPCODE_JMPI) {
+      pad(file, 16);
+      err |= src1(file, devinfo, inst);
+   } else if (desc && desc->nsrc == 3) {
+      pad(file, 16);
+      err |= dest_3src(file, devinfo, inst);
+
+      pad(file, 32);
+      err |= src0_3src(file, devinfo, inst);
+
+      pad(file, 48);
+      err |= src1_3src(file, devinfo, inst);
+
+      pad(file, 64);
+      err |= src2_3src(file, devinfo, inst);
+   } else if (desc) {
+      if (desc->ndst > 0) {
+         pad(file, 16);
+         err |= dest(file, devinfo, inst);
+      }
+
+      if (desc->nsrc > 0) {
+         pad(file, 32);
+         err |= src0(file, devinfo, inst);
+      }
+
+      if (desc->nsrc > 1) {
+         pad(file, 48);
+         err |= src1(file, devinfo, inst);
+      }
+   }
+
+   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
+      enum brw_message_target sfid = brw_inst_sfid(devinfo, inst);
+
+      if (brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE) {
+         /* show the indirect descriptor source */
+         pad(file, 48);
+         err |= src1(file, devinfo, inst);
+      }
+
+      newline(file);
+      pad(file, 16);
+      space = 0;
+
+      fprintf(file, "            ");
+      err |= control(file, "SFID", devinfo->gen >= 6 ? gen6_sfid : gen4_sfid,
+                     sfid, &space);
+
+
+      if (brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE) {
+         format(file, " indirect");
+      } else {
+         switch (sfid) {
+         case BRW_SFID_MATH:
+            err |= control(file, "math function", math_function,
+                           brw_inst_math_msg_function(devinfo, inst), &space);
+            err |= control(file, "math saturate", math_saturate,
+                           brw_inst_math_msg_saturate(devinfo, inst), &space);
+            err |= control(file, "math signed", math_signed,
+                           brw_inst_math_msg_signed_int(devinfo, inst), &space);
+            err |= control(file, "math scalar", math_scalar,
+                           brw_inst_math_msg_data_type(devinfo, inst), &space);
+            err |= control(file, "math precision", math_precision,
+                           brw_inst_math_msg_precision(devinfo, inst), &space);
+            break;
+         case BRW_SFID_SAMPLER:
+            if (devinfo->gen >= 5) {
+               err |= control(file, "sampler message", gen5_sampler_msg_type,
+                              brw_inst_sampler_msg_type(devinfo, inst), &space);
+               err |= control(file, "sampler simd mode", gen5_sampler_simd_mode,
+                              brw_inst_sampler_simd_mode(devinfo, inst), &space);
+               format(file, " Surface = %"PRIu64" Sampler = %"PRIu64,
+                      brw_inst_binding_table_index(devinfo, inst),
+                      brw_inst_sampler(devinfo, inst));
+            } else {
+               format(file, " (%"PRIu64", %"PRIu64", %"PRIu64", ",
+                      brw_inst_binding_table_index(devinfo, inst),
+                      brw_inst_sampler(devinfo, inst),
+                      brw_inst_sampler_msg_type(devinfo, inst));
+               if (!devinfo->is_g4x) {
+                  err |= control(file, "sampler target format",
+                                 sampler_target_format,
+                                 brw_inst_sampler_return_format(devinfo, inst), NULL);
+               }
+               string(file, ")");
+            }
+            break;
+         case GEN6_SFID_DATAPORT_SAMPLER_CACHE:
+         case GEN6_SFID_DATAPORT_CONSTANT_CACHE:
+            /* aka BRW_SFID_DATAPORT_READ on Gen4-5 */
+            if (devinfo->gen >= 6) {
+               format(file, " (%"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64")",
+                      brw_inst_binding_table_index(devinfo, inst),
+                      brw_inst_dp_msg_control(devinfo, inst),
+                      brw_inst_dp_msg_type(devinfo, inst),
+                      devinfo->gen >= 7 ? 0 : brw_inst_dp_write_commit(devinfo, inst));
+            } else {
+               bool is_965 = devinfo->gen == 4 && !devinfo->is_g4x;
+               err |= control(file, "DP read message type",
+                              is_965 ? gen4_dp_read_port_msg_type :
+                                       g45_dp_read_port_msg_type,
+                              brw_inst_dp_read_msg_type(devinfo, inst),
+                              &space);
+
+               format(file, " MsgCtrl = 0x%"PRIx64,
+                      brw_inst_dp_read_msg_control(devinfo, inst));
+
+               format(file, " Surface = %"PRIu64, brw_inst_binding_table_index(devinfo, inst));
+            }
+            break;
+
+         case GEN6_SFID_DATAPORT_RENDER_CACHE: {
+            /* aka BRW_SFID_DATAPORT_WRITE on Gen4-5 */
+            unsigned msg_type = brw_inst_dp_write_msg_type(devinfo, inst);
+
+            err |= control(file, "DP rc message type",
+                           dp_rc_msg_type(devinfo), msg_type, &space);
+
+            bool is_rt_write = msg_type ==
+               (devinfo->gen >= 6 ? GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
+                                  : BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE);
+
+            if (is_rt_write) {
+               err |= control(file, "RT message type", m_rt_write_subtype,
+                              brw_inst_rt_message_type(devinfo, inst), &space);
+               if (devinfo->gen >= 6 && brw_inst_rt_slot_group(devinfo, inst))
+                  string(file, " Hi");
+               if (brw_inst_rt_last(devinfo, inst))
+                  string(file, " LastRT");
+               if (devinfo->gen < 7 && brw_inst_dp_write_commit(devinfo, inst))
+                  string(file, " WriteCommit");
+            } else {
+               format(file, " MsgCtrl = 0x%"PRIx64,
+                      brw_inst_dp_write_msg_control(devinfo, inst));
+            }
+
+            format(file, " Surface = %"PRIu64, brw_inst_binding_table_index(devinfo, inst));
+            break;
+         }
+
+         case BRW_SFID_URB: {
+            /* Named distinctly so it does not shadow the instruction
+             * opcode declared at function scope.
+             */
+            unsigned urb_opcode = brw_inst_urb_opcode(devinfo, inst);
+
+            format(file, " %"PRIu64, brw_inst_urb_global_offset(devinfo, inst));
+
+            space = 1;
+
+            err |= control(file, "urb opcode",
+                           devinfo->gen >= 7 ? gen7_urb_opcode
+                                             : gen5_urb_opcode,
+                           urb_opcode, &space);
+
+            if (devinfo->gen >= 7 &&
+                brw_inst_urb_per_slot_offset(devinfo, inst)) {
+               string(file, " per-slot");
+            }
+
+            if (urb_opcode == GEN8_URB_OPCODE_SIMD8_WRITE ||
+                urb_opcode == GEN8_URB_OPCODE_SIMD8_READ) {
+               if (brw_inst_urb_channel_mask_present(devinfo, inst))
+                  string(file, " masked");
+            } else {
+               err |= control(file, "urb swizzle", urb_swizzle,
+                              brw_inst_urb_swizzle_control(devinfo, inst),
+                              &space);
+            }
+
+            if (devinfo->gen < 7) {
+               err |= control(file, "urb allocate", urb_allocate,
+                              brw_inst_urb_allocate(devinfo, inst), &space);
+               err |= control(file, "urb used", urb_used,
+                              brw_inst_urb_used(devinfo, inst), &space);
+            }
+            if (devinfo->gen < 8) {
+               err |= control(file, "urb complete", urb_complete,
+                              brw_inst_urb_complete(devinfo, inst), &space);
+            }
+            break;
+         }
+         case BRW_SFID_THREAD_SPAWNER:
+            break;
+
+         case BRW_SFID_MESSAGE_GATEWAY:
+            /* NOTE(review): the table is indexed without a range check;
+             * confirm it covers every value the sub-function field can
+             * hold.
+             */
+            format(file, " (%s)",
+                   gen7_gateway_subfuncid[brw_inst_gateway_subfuncid(devinfo, inst)]);
+            break;
+
+         case GEN7_SFID_DATAPORT_DATA_CACHE:
+            if (devinfo->gen >= 7) {
+               format(file, " (");
+
+               err |= control(file, "DP DC0 message type",
+                              dp_dc0_msg_type_gen7,
+                              brw_inst_dp_msg_type(devinfo, inst), &space);
+
+               format(file, ", %"PRIu64", ", brw_inst_binding_table_index(devinfo, inst));
+
+               switch (brw_inst_dp_msg_type(devinfo, inst)) {
+               case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
+                  err |= control(file, "atomic op", aop,
+                                 brw_inst_imm_ud(devinfo, inst) >> 8 & 0xf, &space);
+                  break;
+               default:
+                  format(file, "%"PRIu64, brw_inst_dp_msg_control(devinfo, inst));
+               }
+               format(file, ")");
+               break;
+            }
+            /* FALLTHROUGH */
+
+         case HSW_SFID_DATAPORT_DATA_CACHE_1: {
+            if (devinfo->gen >= 7) {
+               format(file, " (");
+
+               unsigned msg_ctrl = brw_inst_dp_msg_control(devinfo, inst);
+
+               err |= control(file, "DP DC1 message type",
+                              dp_dc1_msg_type_hsw,
+                              brw_inst_dp_msg_type(devinfo, inst), &space);
+
+               format(file, ", Surface = %"PRIu64", ",
+                      brw_inst_binding_table_index(devinfo, inst));
+
+               switch (brw_inst_dp_msg_type(devinfo, inst)) {
+               case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
+               case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
+               case HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP:
+                  format(file, "SIMD%d,", (msg_ctrl & (1 << 4)) ? 8 : 16);
+                  /* fallthrough */
+               case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
+               case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
+               case HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2:
+                  err |= control(file, "atomic op", aop, msg_ctrl & 0xf, &space);
+                  break;
+               case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ:
+               case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
+               case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
+               case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE: {
+                  static const char *simd_modes[] = { "4x2", "16", "8" };
+                  const unsigned simd_mode = msg_ctrl >> 4;
+
+                  /* The table has fewer entries than the field can
+                   * encode; never index past its end when decoding a
+                   * malformed instruction.
+                   */
+                  if (simd_mode < sizeof(simd_modes) / sizeof(simd_modes[0])) {
+                     format(file, "SIMD%s, Mask = 0x%x",
+                            simd_modes[simd_mode], msg_ctrl & 0xf);
+                  } else {
+                     format(file, "SIMD<invalid %u>, Mask = 0x%x",
+                            simd_mode, msg_ctrl & 0xf);
+                     err |= 1;
+                  }
+                  break;
+               }
+               default:
+                  format(file, "0x%x", msg_ctrl);
+               }
+               format(file, ")");
+               break;
+            }
+            /* FALLTHROUGH */
+         }
+
+         case GEN7_SFID_PIXEL_INTERPOLATOR:
+            if (devinfo->gen >= 7) {
+               format(file, " (%s, %s, 0x%02"PRIx64")",
+                      brw_inst_pi_nopersp(devinfo, inst) ? "linear" : "persp",
+                      pixel_interpolator_msg_types[brw_inst_pi_message_type(devinfo, inst)],
+                      brw_inst_pi_message_data(devinfo, inst));
+               break;
+            }
+            /* FALLTHROUGH */
+
+         default:
+            format(file, "unsupported shared function ID %d", sfid);
+            break;
+         }
+
+         if (space)
+            string(file, " ");
+         format(file, "mlen %"PRIu64, brw_inst_mlen(devinfo, inst));
+         format(file, " rlen %"PRIu64, brw_inst_rlen(devinfo, inst));
+      }
+   }
+   pad(file, 64);
+   if (opcode != BRW_OPCODE_NOP && opcode != BRW_OPCODE_NENOP) {
+      string(file, "{");
+      space = 1;
+      err |= control(file, "access mode", access_mode,
+                     brw_inst_access_mode(devinfo, inst), &space);
+      if (devinfo->gen >= 6) {
+         err |= control(file, "write enable control", wectrl,
+                        brw_inst_mask_control(devinfo, inst), &space);
+      } else {
+         err |= control(file, "mask control", mask_ctrl,
+                        brw_inst_mask_control(devinfo, inst), &space);
+      }
+      err |= control(file, "dependency control", dep_ctrl,
+                     ((brw_inst_no_dd_check(devinfo, inst) << 1) |
+                      brw_inst_no_dd_clear(devinfo, inst)), &space);
+
+      if (devinfo->gen >= 6)
+         err |= qtr_ctrl(file, devinfo, inst);
+      else {
+         if (brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_COMPRESSED &&
+             desc && desc->ndst > 0 &&
+             brw_inst_dst_reg_file(devinfo, inst) == BRW_MESSAGE_REGISTER_FILE &&
+             brw_inst_dst_da_reg_nr(devinfo, inst) & BRW_MRF_COMPR4) {
+            format(file, " compr4");
+         } else {
+            err |= control(file, "compression control", compr_ctrl,
+                           brw_inst_qtr_control(devinfo, inst), &space);
+         }
+      }
+
+      err |= control(file, "compaction", cmpt_ctrl, is_compacted, &space);
+      err |= control(file, "thread control", thread_ctrl,
+                     brw_inst_thread_control(devinfo, inst), &space);
+      if (has_branch_ctrl(devinfo, opcode)) {
+         err |= control(file, "branch ctrl", branch_ctrl,
+                        brw_inst_branch_control(devinfo, inst), &space);
+      } else if (devinfo->gen >= 6) {
+         err |= control(file, "acc write control", accwr,
+                        brw_inst_acc_wr_control(devinfo, inst), &space);
+      }
+      if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC)
+         err |= control(file, "end of thread", end_of_thread,
+                        brw_inst_eot(devinfo, inst), &space);
+      if (space)
+         string(file, " ");
+      string(file, "}");
+   }
+   string(file, ";");
+   newline(file);
+   return err;
+}
diff --git a/src/intel/compiler/brw_eu.c b/src/intel/compiler/brw_eu.c
new file mode 100644
index 00000000000..77400c19914
--- /dev/null
+++ b/src/intel/compiler/brw_eu.c
@@ -0,0 +1,719 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <[email protected]>
+ */
+
+
+#include "brw_eu_defines.h"
+#include "brw_eu.h"
+#include "brw_shader.h"
+#include "common/gen_debug.h"
+
+#include "util/ralloc.h"
+
+/**
+ * Converts a BRW_REGISTER_TYPE_* enum to a short string (F, UD, and so on).
+ *
+ * This is different from reg_encoding in brw_disasm.c in that it operates
+ * on the abstract enum values, rather than the generation-specific encoding.
+ *
+ * \param type  Abstract BRW_REGISTER_TYPE_* value.  Must not be greater than
+ *              BRW_REGISTER_TYPE_Q (checked with an assertion).
+ * \return The string-literal mnemonic stored in the table for \p type.
+ */
+const char *
+brw_reg_type_letters(unsigned type)
+{
+   /* Static and const so the table lives in read-only data instead of being
+    * re-initialized on the stack on every call.
+    */
+   static const char *const names[] = {
+      [BRW_REGISTER_TYPE_UD] = "UD",
+      [BRW_REGISTER_TYPE_D]  = "D",
+      [BRW_REGISTER_TYPE_UW] = "UW",
+      [BRW_REGISTER_TYPE_W]  = "W",
+      [BRW_REGISTER_TYPE_F]  = "F",
+      [BRW_REGISTER_TYPE_UB] = "UB",
+      [BRW_REGISTER_TYPE_B]  = "B",
+      [BRW_REGISTER_TYPE_UV] = "UV",
+      [BRW_REGISTER_TYPE_V]  = "V",
+      [BRW_REGISTER_TYPE_VF] = "VF",
+      [BRW_REGISTER_TYPE_DF] = "DF",
+      [BRW_REGISTER_TYPE_HF] = "HF",
+      [BRW_REGISTER_TYPE_UQ] = "UQ",
+      [BRW_REGISTER_TYPE_Q]  = "Q",
+   };
+   assert(type <= BRW_REGISTER_TYPE_Q);
+   return names[type];
+}
+
+/**
+ * Return the conditional modifier that tests the opposite condition of
+ * \p cmod (Z <-> NZ, G <-> LE, GE <-> L).
+ *
+ * Any other value has no negation here and yields ~0 converted to the
+ * return type.
+ */
+enum brw_conditional_mod
+brw_negate_cmod(uint32_t cmod)
+{
+   enum brw_conditional_mod negated;
+
+   switch (cmod) {
+   case BRW_CONDITIONAL_Z:
+      negated = BRW_CONDITIONAL_NZ;
+      break;
+   case BRW_CONDITIONAL_NZ:
+      negated = BRW_CONDITIONAL_Z;
+      break;
+   case BRW_CONDITIONAL_G:
+      negated = BRW_CONDITIONAL_LE;
+      break;
+   case BRW_CONDITIONAL_GE:
+      negated = BRW_CONDITIONAL_L;
+      break;
+   case BRW_CONDITIONAL_L:
+      negated = BRW_CONDITIONAL_GE;
+      break;
+   case BRW_CONDITIONAL_LE:
+      negated = BRW_CONDITIONAL_G;
+      break;
+   default:
+      negated = ~0;
+      break;
+   }
+
+   return negated;
+}
+
+/**
+ * Return the conditional modifier that gives the same result once src0 and
+ * src1 have been exchanged, e.g. in a CMP.
+ *
+ * Z and NZ are symmetric and come back unchanged; the ordering comparisons
+ * are mirrored (G <-> L, GE <-> LE).  Anything else maps to
+ * BRW_CONDITIONAL_NONE.
+ */
+enum brw_conditional_mod
+brw_swap_cmod(uint32_t cmod)
+{
+   enum brw_conditional_mod swapped = BRW_CONDITIONAL_NONE;
+
+   switch (cmod) {
+   case BRW_CONDITIONAL_Z:
+   case BRW_CONDITIONAL_NZ:
+      swapped = cmod;
+      break;
+   case BRW_CONDITIONAL_G:
+      swapped = BRW_CONDITIONAL_L;
+      break;
+   case BRW_CONDITIONAL_GE:
+      swapped = BRW_CONDITIONAL_LE;
+      break;
+   case BRW_CONDITIONAL_L:
+      swapped = BRW_CONDITIONAL_G;
+      break;
+   case BRW_CONDITIONAL_LE:
+      swapped = BRW_CONDITIONAL_GE;
+      break;
+   default:
+      break;
+   }
+
+   return swapped;
+}
+
+/**
+ * Bit offset of the least significant bit of component number i+1 of an
+ * immediate of type \p type.  Passing the two's complement of j as \p i
+ * gives the offset of the j-th component counted from the end of the
+ * vector.  Scalar types always give zero.
+ *
+ * Only VF is handled as a vector type; UV and V trip the assertion.
+ */
+static unsigned
+imm_shift(enum brw_reg_type type, unsigned i)
+{
+   assert(type != BRW_REGISTER_TYPE_UV && type != BRW_REGISTER_TYPE_V &&
+          "Not implemented.");
+
+   /* VF components are 8 bits apart and only the low two bits of the index
+    * are used.
+    */
+   return type == BRW_REGISTER_TYPE_VF ? 8 * (i & 3) : 0;
+}
+
+/**
+ * Apply the component permutation \p swz to the immediate \p x of the given
+ * type and return the permuted immediate.
+ *
+ * Types for which imm_shift() reports a zero component stride are returned
+ * unchanged.
+ */
+uint32_t
+brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz)
+{
+   const unsigned stride = imm_shift(type, 1);
+
+   /* A zero stride means there is nothing to permute. */
+   if (stride == 0)
+      return x;
+
+   const unsigned count = 32 / stride;
+   uint32_t result = 0;
+
+   for (unsigned c = 0; c < count; c++) {
+      /* Component to fetch for destination slot c. */
+      const unsigned src = (c & ~3) + BRW_GET_SWZ(swz, c & 3);
+
+      /* Move the source component down to bit 0, up to the top to drop any
+       * higher bits, then back down into slot c.
+       */
+      result |= x >> imm_shift(type, src)
+                  << imm_shift(type, ~0u)
+                  >> imm_shift(type, ~0u - c);
+   }
+
+   return result;
+}
+
+/** Store \p value as the execution size of the current default state. */
+void
+brw_set_default_exec_size(struct brw_codegen *p, unsigned value)
+{
+   brw_inst *defaults = p->current;
+
+   brw_inst_set_exec_size(p->devinfo, defaults, value);
+}
+
+/** Store \p pc as the predicate control of the current default state. */
+void
+brw_set_default_predicate_control(struct brw_codegen *p, unsigned pc)
+{
+   brw_inst *defaults = p->current;
+
+   brw_inst_set_pred_control(p->devinfo, defaults, pc);
+}
+
+/** Store the predicate-inverse flag in the current default state. */
+void
+brw_set_default_predicate_inverse(struct brw_codegen *p,
+                                  bool predicate_inverse)
+{
+   brw_inst *defaults = p->current;
+
+   brw_inst_set_pred_inv(p->devinfo, defaults, predicate_inverse);
+}
+
+/**
+ * Select the flag register and subregister of the current default state.
+ *
+ * The register number is only written on gen7 and later; the subregister
+ * number is written on every generation.
+ */
+void
+brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *defaults = p->current;
+
+   if (devinfo->gen >= 7)
+      brw_inst_set_flag_reg_nr(devinfo, defaults, reg);
+
+   brw_inst_set_flag_subreg_nr(devinfo, defaults, subreg);
+}
+
+/** Store \p access_mode as the access mode of the current default state. */
+void
+brw_set_default_access_mode(struct brw_codegen *p, unsigned access_mode)
+{
+   brw_inst *defaults = p->current;
+
+   brw_inst_set_access_mode(p->devinfo, defaults, access_mode);
+}
+
+/**
+ * Program the quarter control field of the current default state from a
+ * pre-gen6 style compression control value.
+ *
+ * Before gen6 the value is written as is.  On gen6+ it is first translated
+ * to the matching GEN6_COMPRESSION_* encoding, since the SIMD32 support is
+ * not used.
+ */
+void
+brw_set_default_compression_control(struct brw_codegen *p,
+                                    enum brw_compression compression_control)
+{
+   unsigned qtr_control = compression_control;
+
+   if (p->devinfo->gen >= 6) {
+      switch (compression_control) {
+      case BRW_COMPRESSION_NONE:
+         /* "Use the first set of bits of dmask/vmask/arf according to
+          * execsize."
+          */
+         qtr_control = GEN6_COMPRESSION_1Q;
+         break;
+      case BRW_COMPRESSION_2NDHALF:
+         /* For SIMD8: "use the second set of 8 bits." */
+         qtr_control = GEN6_COMPRESSION_2Q;
+         break;
+      case BRW_COMPRESSION_COMPRESSED:
+         /* SIMD16 instruction compression uses the first set of 16 bits
+          * because SIMD32 dispatch is not done.
+          */
+         qtr_control = GEN6_COMPRESSION_1H;
+         break;
+      default:
+         unreachable("not reached");
+      }
+   }
+
+   brw_inst_set_qtr_control(p->devinfo, p->current, qtr_control);
+}
+
+/**
+ * Turn instruction compression on or off for \p inst without disturbing the
+ * channel enable group it currently selects.
+ */
+void
+brw_inst_set_compression(const struct gen_device_info *devinfo,
+                         brw_inst *inst, bool on)
+{
+   /* Nothing to do on gen6+: the EU works out by itself whether the
+    * instruction needs to be compressed.
+    */
+   if (devinfo->gen >= 6)
+      return;
+
+   /* Channel group and compression share one non-orthogonal field, and an
+    * uncompressed instruction has two possible encodings.  When switching
+    * compression off, only touch the field if it actually says
+    * "compressed", so the selected channel group is not changed by
+    * accident.
+    */
+   if (on) {
+      brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_COMPRESSED);
+   } else if (brw_inst_qtr_control(devinfo, inst) ==
+              BRW_COMPRESSION_COMPRESSED) {
+      brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
+   }
+}
+
+/** Apply brw_inst_set_compression() to the current default state. */
+void
+brw_set_default_compression(struct brw_codegen *p, bool on)
+{
+   brw_inst *defaults = p->current;
+
+   brw_inst_set_compression(p->devinfo, defaults, on);
+}
+
+/**
+ * Make \p inst use the channel enable signals in the range
+ * [group, group + exec_size).
+ *
+ * \p group must be a multiple of 4 below 32 on gen7+, a multiple of 8 below
+ * 32 on gen6, and a multiple of 8 below 16 on earlier generations (all
+ * checked with assertions).
+ */
+void
+brw_inst_set_group(const struct gen_device_info *devinfo,
+                   brw_inst *inst, unsigned group)
+{
+   if (devinfo->gen >= 7) {
+      assert(group % 4 == 0 && group < 32);
+      brw_inst_set_qtr_control(devinfo, inst, group / 8);
+      brw_inst_set_nib_control(devinfo, inst, (group / 4) % 2);
+      return;
+   }
+
+   if (devinfo->gen == 6) {
+      assert(group % 8 == 0 && group < 32);
+      brw_inst_set_qtr_control(devinfo, inst, group / 8);
+      return;
+   }
+
+   assert(group % 8 == 0 && group < 16);
+
+   /* Channel group and compression share one non-orthogonal field, and
+    * group zero has two possible encodings.  When selecting group zero, only
+    * touch the field if it currently says "second half", so the compression
+    * enable is not changed by accident.
+    */
+   if (group == 8) {
+      brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_2NDHALF);
+   } else if (brw_inst_qtr_control(devinfo, inst) ==
+              BRW_COMPRESSION_2NDHALF) {
+      brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
+   }
+}
+
+/** Apply brw_inst_set_group() to the current default state. */
+void
+brw_set_default_group(struct brw_codegen *p, unsigned group)
+{
+   brw_inst *defaults = p->current;
+
+   brw_inst_set_group(p->devinfo, defaults, group);
+}
+
+/** Store \p value as the mask control of the current default state. */
+void
+brw_set_default_mask_control(struct brw_codegen *p, unsigned value)
+{
+   brw_inst *defaults = p->current;
+
+   brw_inst_set_mask_control(p->devinfo, defaults, value);
+}
+
+/** Store the saturate flag in the current default state. */
+void
+brw_set_default_saturate(struct brw_codegen *p, bool enable)
+{
+   brw_inst *defaults = p->current;
+
+   brw_inst_set_saturate(p->devinfo, defaults, enable);
+}
+
+/**
+ * Store \p value as the accumulator write control of the current default
+ * state.  Does nothing before gen6.
+ */
+void
+brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value)
+{
+   if (p->devinfo->gen < 6)
+      return;
+
+   brw_inst_set_acc_wr_control(p->devinfo, p->current, value);
+}
+
+/**
+ * Push a copy of the current default instruction state onto the state stack
+ * and make the copy current.  The stack must not already be full (asserted).
+ */
+void
+brw_push_insn_state(struct brw_codegen *p)
+{
+   brw_inst *top = p->current;
+
+   assert(top != &p->stack[BRW_EU_MAX_INSN_STACK - 1]);
+   memcpy(top + 1, top, sizeof(*top));
+   p->current = top + 1;
+}
+
+/**
+ * Drop the current default instruction state and go back to the one below
+ * it on the state stack.  The stack must not be at its bottom (asserted).
+ */
+void
+brw_pop_insn_state(struct brw_codegen *p)
+{
+   assert(p->current != p->stack);
+   p->current -= 1;
+}
+
+
+/***********************************************************************
+ */
+/**
+ * Initialize the code generator \p p for the device described by
+ * \p devinfo.
+ *
+ * The whole structure is zeroed, the instruction store and the control flow
+ * stacks are allocated from \p mem_ctx, and a few instruction defaults are
+ * programmed into the bottom entry of the state stack.
+ */
+void
+brw_init_codegen(const struct gen_device_info *devinfo,
+                 struct brw_codegen *p, void *mem_ctx)
+{
+   memset(p, 0, sizeof(*p));
+
+   p->devinfo = devinfo;
+   p->mem_ctx = mem_ctx;
+
+   /* Start with room for 1024 instructions.  If that turns out not to be
+    * enough, brw_next_insn() doubles the store size until out of memory.
+    */
+   p->store_size = 1024;
+   p->store = rzalloc_array(mem_ctx, brw_inst, p->store_size);
+   p->nr_insn = 0;
+
+   /* The default state starts at the bottom of the state stack. */
+   p->current = p->stack;
+   memset(p->current, 0, sizeof(p->current[0]));
+
+   /* Initial instruction defaults. */
+   brw_set_default_exec_size(p, BRW_EXECUTE_8);
+   brw_set_default_mask_control(p, BRW_MASK_ENABLE);
+   brw_set_default_saturate(p, false);
+   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+
+   /* Control flow stacks: IF/ELSE bookkeeping ... */
+   p->if_stack_depth = 0;
+   p->if_stack_array_size = 16;
+   p->if_stack = rzalloc_array(mem_ctx, int, p->if_stack_array_size);
+
+   /* ... and loop bookkeeping, with one IF-depth counter per loop slot. */
+   p->loop_stack_depth = 0;
+   p->loop_stack_array_size = 16;
+   p->loop_stack = rzalloc_array(mem_ctx, int, p->loop_stack_array_size);
+   p->if_depth_in_loop = rzalloc_array(mem_ctx, int,
+                                       p->loop_stack_array_size);
+}
+
+
+/**
+ * Return the instruction store of \p p and write the value of
+ * p->next_insn_offset to \p sz.
+ */
+const unsigned *
+brw_get_program(struct brw_codegen *p, unsigned *sz)
+{
+   const unsigned *program = (const unsigned *)p->store;
+
+   *sz = p->next_insn_offset;
+   return program;
+}
+
+/**
+ * Disassemble the instructions found in \p assembly between the byte
+ * offsets \p start (inclusive) and \p end (exclusive), writing the text to
+ * \p out.
+ *
+ * Each instruction is checked for its compaction bit: a compacted
+ * instruction is expanded before being printed and advances the offset by
+ * 8 bytes, a full-size one advances it by 16.  When the DEBUG_HEX flag is
+ * set in INTEL_DEBUG the raw dwords are printed in front of each
+ * instruction, most significant dword first.
+ */
+void
+brw_disassemble(const struct gen_device_info *devinfo,
+                void *assembly, int start, int end, FILE *out)
+{
+   const bool dump_hex = (INTEL_DEBUG & DEBUG_HEX) != 0;
+
+   for (int offset = start; offset < end;) {
+      /* Offsets are in bytes.  Go through char * because arithmetic on
+       * void * is a GNU extension rather than standard C.
+       */
+      brw_inst *insn = (brw_inst *)((char *)assembly + offset);
+      const uint32_t *dwords = (const uint32_t *)insn;
+      brw_inst uncompacted;
+      const bool compacted = brw_inst_cmpt_control(devinfo, insn);
+
+      /* Disabled debugging aid: print the offset of every instruction. */
+      if (0)
+         fprintf(out, "0x%08x: ", offset);
+
+      if (compacted) {
+         /* Named differently from the flag above so it does not shadow it. */
+         brw_compact_inst *compact_insn = (brw_compact_inst *)insn;
+
+         if (dump_hex) {
+            fprintf(out, "0x%08x 0x%08x ",
+                    dwords[1],
+                    dwords[0]);
+         }
+
+         brw_uncompact_instruction(devinfo, &uncompacted, compact_insn);
+         insn = &uncompacted;
+         offset += 8;
+      } else {
+         if (dump_hex) {
+            fprintf(out, "0x%08x 0x%08x 0x%08x 0x%08x ",
+                    dwords[3],
+                    dwords[2],
+                    dwords[1],
+                    dwords[0]);
+         }
+         offset += 16;
+      }
+
+      brw_disassemble_inst(out, devinfo, insn, compacted);
+   }
+}
+
+/* Hardware generations as a bitmask, one bit per generation in ascending
+ * order, so that a set of generations can be stored in a single integer
+ * (see the .gens field of the opcode tables below).
+ */
+enum gen {
+ GEN4 = (1 << 0),
+ GEN45 = (1 << 1),
+ GEN5 = (1 << 2),
+ GEN6 = (1 << 3),
+ GEN7 = (1 << 4),
+ GEN75 = (1 << 5),
+ GEN8 = (1 << 6),
+ GEN9 = (1 << 7),
+ /* Every bit set: matches any generation. */
+ GEN_ALL = ~0
+};
+
+/* Range helpers.  They rely on each generation being a single bit and on the
+ * bits being in ascending order: subtracting one from a single-bit value
+ * gives the mask of all lower bits.
+ */
+/* All generations strictly older than gen. */
+#define GEN_LT(gen) ((gen) - 1)
+/* gen and everything newer (the complement of GEN_LT). */
+#define GEN_GE(gen) (~GEN_LT(gen))
+/* gen and everything older. */
+#define GEN_LE(gen) (GEN_LT(gen) | (gen))
+
+/* Opcode numbers whose meaning depends on the hardware generation.  Each
+ * table below lists the alternatives for one opcode number (10, 35, 38, 44,
+ * 45 and 46) and is referenced from the matching slot of opcode_descs via
+ * its .table/.size fields.  brw_opcode_desc() returns the first entry whose
+ * .gens mask contains the generation of the device.
+ */
+static const struct opcode_desc opcode_10_descs[] = {
+ { .name = "dim", .nsrc = 1, .ndst = 1, .gens = GEN75 },
+ { .name = "smov", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN8) },
+};
+
+static const struct opcode_desc opcode_35_descs[] = {
+ { .name = "iff", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) },
+ { .name = "brc", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN7) },
+};
+
+static const struct opcode_desc opcode_38_descs[] = {
+ { .name = "do", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) },
+ { .name = "case", .nsrc = 0, .ndst = 0, .gens = GEN6 },
+};
+
+static const struct opcode_desc opcode_44_descs[] = {
+ { .name = "msave", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) },
+ { .name = "call", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN6) },
+};
+
+static const struct opcode_desc opcode_45_descs[] = {
+ { .name = "mrest", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) },
+ { .name = "ret", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN6) },
+};
+
+static const struct opcode_desc opcode_46_descs[] = {
+ { .name = "push", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) },
+ { .name = "fork", .nsrc = 0, .ndst = 0, .gens = GEN6 },
+ { .name = "goto", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN8) },
+};
+
+/* Opcode descriptor table, indexed by opcode number.
+ *
+ * An entry comes in one of two forms:
+ *  - .name/.nsrc/.ndst/.gens filled in directly: the mnemonic, the number of
+ *    sources and destinations, and the mask of generations (enum gen bits)
+ *    on which the opcode exists;
+ *  - .table/.size pointing at one of the opcode_NN_descs tables above, for
+ *    opcode numbers that mean different things on different generations.
+ *
+ * Indices that are not listed are zero-initialized, so they have neither a
+ * .gens mask nor a .table and brw_opcode_desc() returns NULL for them.
+ */
+static const struct opcode_desc opcode_descs[128] = {
+ [BRW_OPCODE_ILLEGAL] = {
+ .name = "illegal", .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_MOV] = {
+ .name = "mov", .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_SEL] = {
+ .name = "sel", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_MOVI] = {
+ .name = "movi", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN45),
+ },
+ [BRW_OPCODE_NOT] = {
+ .name = "not", .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_AND] = {
+ .name = "and", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_OR] = {
+ .name = "or", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_XOR] = {
+ .name = "xor", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_SHR] = {
+ .name = "shr", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_SHL] = {
+ .name = "shl", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ /* Generation-dependent: dim or smov. */
+ [10] = {
+ .table = opcode_10_descs, .size = ARRAY_SIZE(opcode_10_descs),
+ },
+ /* Reserved - 11 */
+ [BRW_OPCODE_ASR] = {
+ .name = "asr", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ /* Reserved - 13-15 */
+ [BRW_OPCODE_CMP] = {
+ .name = "cmp", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_CMPN] = {
+ .name = "cmpn", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_CSEL] = {
+ .name = "csel", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN8),
+ },
+ [BRW_OPCODE_F32TO16] = {
+ .name = "f32to16", .nsrc = 1, .ndst = 1, .gens = GEN7 | GEN75,
+ },
+ [BRW_OPCODE_F16TO32] = {
+ .name = "f16to32", .nsrc = 1, .ndst = 1, .gens = GEN7 | GEN75,
+ },
+ /* Reserved - 21-22 */
+ [BRW_OPCODE_BFREV] = {
+ .name = "bfrev", .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7),
+ },
+ [BRW_OPCODE_BFE] = {
+ .name = "bfe", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN7),
+ },
+ [BRW_OPCODE_BFI1] = {
+ .name = "bfi1", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN7),
+ },
+ [BRW_OPCODE_BFI2] = {
+ .name = "bfi2", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN7),
+ },
+ /* Reserved - 27-31 */
+ [BRW_OPCODE_JMPI] = {
+ .name = "jmpi", .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+ },
+ [33] = {
+ .name = "brd", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN7),
+ },
+ [BRW_OPCODE_IF] = {
+ .name = "if", .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+ },
+ /* Generation-dependent: iff or brc. */
+ [35] = {
+ .table = opcode_35_descs, .size = ARRAY_SIZE(opcode_35_descs),
+ },
+ [BRW_OPCODE_ELSE] = {
+ .name = "else", .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_ENDIF] = {
+ .name = "endif", .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+ },
+ /* Generation-dependent: do or case. */
+ [38] = {
+ .table = opcode_38_descs, .size = ARRAY_SIZE(opcode_38_descs),
+ },
+ [BRW_OPCODE_WHILE] = {
+ .name = "while", .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_BREAK] = {
+ .name = "break", .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_CONTINUE] = {
+ .name = "cont", .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_HALT] = {
+ .name = "halt", .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+ },
+ [43] = {
+ .name = "calla", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN75),
+ },
+ /* Generation-dependent: msave or call. */
+ [44] = {
+ .table = opcode_44_descs, .size = ARRAY_SIZE(opcode_44_descs),
+ },
+ /* Generation-dependent: mrest or ret. */
+ [45] = {
+ .table = opcode_45_descs, .size = ARRAY_SIZE(opcode_45_descs),
+ },
+ /* Generation-dependent: push, fork or goto. */
+ [46] = {
+ .table = opcode_46_descs, .size = ARRAY_SIZE(opcode_46_descs),
+ },
+ [47] = {
+ .name = "pop", .nsrc = 2, .ndst = 0, .gens = GEN_LE(GEN5),
+ },
+ [BRW_OPCODE_WAIT] = {
+ .name = "wait", .nsrc = 1, .ndst = 0, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_SEND] = {
+ .name = "send", .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_SENDC] = {
+ .name = "sendc", .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_SENDS] = {
+ .name = "sends", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN9),
+ },
+ [BRW_OPCODE_SENDSC] = {
+ .name = "sendsc", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN9),
+ },
+ /* Reserved 53-55 */
+ [BRW_OPCODE_MATH] = {
+ .name = "math", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN6),
+ },
+ /* Reserved 57-63 */
+ [BRW_OPCODE_ADD] = {
+ .name = "add", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_MUL] = {
+ .name = "mul", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_AVG] = {
+ .name = "avg", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_FRC] = {
+ .name = "frc", .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_RNDU] = {
+ .name = "rndu", .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_RNDD] = {
+ .name = "rndd", .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_RNDE] = {
+ .name = "rnde", .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_RNDZ] = {
+ .name = "rndz", .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_MAC] = {
+ .name = "mac", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_MACH] = {
+ .name = "mach", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_LZD] = {
+ .name = "lzd", .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_FBH] = {
+ .name = "fbh", .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7),
+ },
+ [BRW_OPCODE_FBL] = {
+ .name = "fbl", .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7),
+ },
+ [BRW_OPCODE_CBIT] = {
+ .name = "cbit", .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7),
+ },
+ [BRW_OPCODE_ADDC] = {
+ .name = "addc", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN7),
+ },
+ [BRW_OPCODE_SUBB] = {
+ .name = "subb", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN7),
+ },
+ [BRW_OPCODE_SAD2] = {
+ .name = "sad2", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_SADA2] = {
+ .name = "sada2", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ /* Reserved 82-83 */
+ [BRW_OPCODE_DP4] = {
+ .name = "dp4", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_DPH] = {
+ .name = "dph", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_DP3] = {
+ .name = "dp3", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_DP2] = {
+ .name = "dp2", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ /* Reserved 88 */
+ [BRW_OPCODE_LINE] = {
+ .name = "line", .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+ },
+ [BRW_OPCODE_PLN] = {
+ .name = "pln", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN45),
+ },
+ [BRW_OPCODE_MAD] = {
+ .name = "mad", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN6),
+ },
+ [BRW_OPCODE_LRP] = {
+ .name = "lrp", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN6),
+ },
+ [93] = {
+ .name = "madm", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN8),
+ },
+ /* Reserved 94-124 */
+ [BRW_OPCODE_NENOP] = {
+ .name = "nenop", .nsrc = 0, .ndst = 0, .gens = GEN45,
+ },
+ [BRW_OPCODE_NOP] = {
+ .name = "nop", .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+ },
+};
+
+/**
+ * Translate the generation recorded in \p devinfo to the matching single-bit
+ * enum gen value.  G4X and Haswell get their own bits (GEN45 and GEN75).
+ * Generations outside 4..9 are not expected.
+ */
+static enum gen
+gen_from_devinfo(const struct gen_device_info *devinfo)
+{
+   switch (devinfo->gen) {
+   case 4:
+      if (devinfo->is_g4x)
+         return GEN45;
+      return GEN4;
+   case 5:
+      return GEN5;
+   case 6:
+      return GEN6;
+   case 7:
+      if (devinfo->is_haswell)
+         return GEN75;
+      return GEN7;
+   case 8:
+      return GEN8;
+   case 9:
+      return GEN9;
+   default:
+      unreachable("not reached");
+   }
+}
+
+/**
+ * Look up the descriptor of \p opcode for the generation of \p devinfo.
+ *
+ * \return The matching opcode_desc, or NULL if the opcode number is out of
+ *         range or the opcode is not supported by the device.
+ */
+const struct opcode_desc *
+brw_opcode_desc(const struct gen_device_info *devinfo, enum opcode opcode)
+{
+   if (opcode >= ARRAY_SIZE(opcode_descs))
+      return NULL;
+
+   const enum gen hw_gen = gen_from_devinfo(devinfo);
+   const struct opcode_desc *entry = &opcode_descs[opcode];
+
+   /* Directly described opcode: it either exists on this generation or it
+    * does not.
+    */
+   if (entry->gens != 0)
+      return (entry->gens & hw_gen) != 0 ? entry : NULL;
+
+   /* Neither a generation mask nor a sub-table: unused opcode number. */
+   if (entry->table == NULL)
+      return NULL;
+
+   /* Generation-dependent opcode number: take the first alternative that
+    * exists on this generation.
+    */
+   for (unsigned n = 0; n < entry->size; n++) {
+      const struct opcode_desc *alt = &entry->table[n];
+
+      if ((alt->gens & hw_gen) != 0)
+         return alt;
+   }
+
+   return NULL;
+}
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
new file mode 100644
index 00000000000..f4225952333
--- /dev/null
+++ b/src/intel/compiler/brw_eu.h
@@ -0,0 +1,612 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <[email protected]>
+ */
+
+
+#ifndef BRW_EU_H
+#define BRW_EU_H
+
+#include <stdbool.h>
+#include "brw_inst.h"
+#include "brw_eu_defines.h"
+#include "brw_reg.h"
+#include "intel_asm_annotation.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BRW_EU_MAX_INSN_STACK 5
+
+/* A helper for accessing the last instruction emitted. This makes it easy
+ * to set various bits on an instruction without having to create a temporary
+ * variable and assign the emitted instruction to it.
+ */
+#define brw_last_inst (&p->store[p->nr_insn - 1])
+
+/* State of the EU code generator: the instruction store being filled, a
+ * small stack of default instruction states, and the bookkeeping needed to
+ * patch IF/ELSE/ENDIF and loop instructions.  Set up by brw_init_codegen().
+ */
+struct brw_codegen {
+ /* Instruction store array and the number of entries allocated for it. */
+ brw_inst *store;
+ int store_size;
+ /* brw_last_inst reads store[nr_insn - 1], i.e. this counts the
+ * instructions emitted so far.
+ */
+ unsigned nr_insn;
+ /* Reported by brw_get_program() as the program size.
+ * NOTE(review): presumably a byte offset into store -- confirm.
+ */
+ unsigned int next_insn_offset;
+
+ /* Allocation context handed to brw_init_codegen(). */
+ void *mem_ctx;
+
+ /* Allow clients to push/pop instruction state:
+ */
+ brw_inst stack[BRW_EU_MAX_INSN_STACK];
+ /* NOTE(review): looks like a per-stack-slot compression flag -- confirm
+ * against its users.
+ */
+ bool compressed_stack[BRW_EU_MAX_INSN_STACK];
+ /* Entry of stack[] holding the defaults currently in effect; moved by
+ * brw_push_insn_state() and brw_pop_insn_state().
+ */
+ brw_inst *current;
+
+ bool single_program_flow;
+ /* Description of the device code is being generated for. */
+ const struct gen_device_info *devinfo;
+
+ /* Control flow stacks:
+ * - if_stack contains IF and ELSE instructions which must be patched
+ * (and popped) once the matching ENDIF instruction is encountered.
+ *
+ * Just store the instruction pointer(an index).
+ */
+ int *if_stack;
+ int if_stack_depth;
+ int if_stack_array_size;
+
+ /**
+ * loop_stack contains the instruction pointers of the starts of loops which
+ * must be patched (and popped) once the matching WHILE instruction is
+ * encountered.
+ */
+ int *loop_stack;
+ /**
+ * pre-gen6, the BREAK and CONT instructions had to tell how many IF/ENDIF
+ * blocks they were popping out of, to fix up the mask stack. This tracks
+ * the IF/ENDIF nesting in each current nested loop level.
+ */
+ int *if_depth_in_loop;
+ int loop_stack_depth;
+ /* Allocated entries in loop_stack and in if_depth_in_loop. */
+ int loop_stack_array_size;
+};
+
+void brw_pop_insn_state( struct brw_codegen *p );
+void brw_push_insn_state( struct brw_codegen *p );
+void brw_set_default_exec_size(struct brw_codegen *p, unsigned value);
+void brw_set_default_mask_control( struct brw_codegen *p, unsigned value );
+void brw_set_default_saturate( struct brw_codegen *p, bool enable );
+void brw_set_default_access_mode( struct brw_codegen *p, unsigned access_mode );
+void brw_inst_set_compression(const struct gen_device_info *devinfo,
+ brw_inst *inst, bool on);
+void brw_set_default_compression(struct brw_codegen *p, bool on);
+void brw_inst_set_group(const struct gen_device_info *devinfo,
+ brw_inst *inst, unsigned group);
+void brw_set_default_group(struct brw_codegen *p, unsigned group);
+void brw_set_default_compression_control(struct brw_codegen *p, enum brw_compression c);
+void brw_set_default_predicate_control( struct brw_codegen *p, unsigned pc );
+void brw_set_default_predicate_inverse(struct brw_codegen *p, bool predicate_inverse);
+void brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg);
+void brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value);
+
+void brw_init_codegen(const struct gen_device_info *, struct brw_codegen *p,
+ void *mem_ctx);
+int brw_disassemble_inst(FILE *file, const struct gen_device_info *devinfo,
+ struct brw_inst *inst, bool is_compacted);
+void brw_disassemble(const struct gen_device_info *devinfo, void *assembly,
+ int start, int end, FILE *out);
+const unsigned *brw_get_program( struct brw_codegen *p, unsigned *sz );
+
+brw_inst *brw_next_insn(struct brw_codegen *p, unsigned opcode);
+void brw_set_dest(struct brw_codegen *p, brw_inst *insn, struct brw_reg dest);
+void brw_set_src0(struct brw_codegen *p, brw_inst *insn, struct brw_reg reg);
+
+void gen6_resolve_implied_move(struct brw_codegen *p,
+ struct brw_reg *src,
+ unsigned msg_reg_nr);
+
+/* Helpers for regular instructions:
+ */
+#define ALU1(OP) \
+brw_inst *brw_##OP(struct brw_codegen *p, \
+ struct brw_reg dest, \
+ struct brw_reg src0);
+
+#define ALU2(OP) \
+brw_inst *brw_##OP(struct brw_codegen *p, \
+ struct brw_reg dest, \
+ struct brw_reg src0, \
+ struct brw_reg src1);
+
+#define ALU3(OP) \
+brw_inst *brw_##OP(struct brw_codegen *p, \
+ struct brw_reg dest, \
+ struct brw_reg src0, \
+ struct brw_reg src1, \
+ struct brw_reg src2);
+
+#define ROUND(OP) \
+void brw_##OP(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0);
+
+ALU1(MOV)
+ALU2(SEL)
+ALU1(NOT)
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(SHR)
+ALU2(SHL)
+ALU1(DIM)
+ALU2(ASR)
+ALU1(F32TO16)
+ALU1(F16TO32)
+ALU2(ADD)
+ALU2(AVG)
+ALU2(MUL)
+ALU1(FRC)
+ALU1(RNDD)
+ALU2(MAC)
+ALU2(MACH)
+ALU1(LZD)
+ALU2(DP4)
+ALU2(DPH)
+ALU2(DP3)
+ALU2(DP2)
+ALU2(LINE)
+ALU2(PLN)
+ALU3(MAD)
+ALU3(LRP)
+ALU1(BFREV)
+ALU3(BFE)
+ALU2(BFI1)
+ALU3(BFI2)
+ALU1(FBH)
+ALU1(FBL)
+ALU1(CBIT)
+ALU2(ADDC)
+ALU2(SUBB)
+ALU2(MAC)
+
+ROUND(RNDZ)
+ROUND(RNDE)
+
+#undef ALU1
+#undef ALU2
+#undef ALU3
+#undef ROUND
+
+
+/* Helpers for SEND instruction:
+ */
+void brw_set_sampler_message(struct brw_codegen *p,
+ brw_inst *insn,
+ unsigned binding_table_index,
+ unsigned sampler,
+ unsigned msg_type,
+ unsigned response_length,
+ unsigned msg_length,
+ unsigned header_present,
+ unsigned simd_mode,
+ unsigned return_format);
+
+void brw_set_message_descriptor(struct brw_codegen *p,
+ brw_inst *inst,
+ enum brw_message_target sfid,
+ unsigned msg_length,
+ unsigned response_length,
+ bool header_present,
+ bool end_of_thread);
+
+void brw_set_dp_read_message(struct brw_codegen *p,
+ brw_inst *insn,
+ unsigned binding_table_index,
+ unsigned msg_control,
+ unsigned msg_type,
+ unsigned target_cache,
+ unsigned msg_length,
+ bool header_present,
+ unsigned response_length);
+
+void brw_set_dp_write_message(struct brw_codegen *p,
+ brw_inst *insn,
+ unsigned binding_table_index,
+ unsigned msg_control,
+ unsigned msg_type,
+ unsigned target_cache,
+ unsigned msg_length,
+ bool header_present,
+ unsigned last_render_target,
+ unsigned response_length,
+ unsigned end_of_thread,
+ unsigned send_commit_msg);
+
+void brw_urb_WRITE(struct brw_codegen *p,
+ struct brw_reg dest,
+ unsigned msg_reg_nr,
+ struct brw_reg src0,
+ enum brw_urb_write_flags flags,
+ unsigned msg_length,
+ unsigned response_length,
+ unsigned offset,
+ unsigned swizzle);
+
+/**
+ * Send message to shared unit \p sfid with a possibly indirect descriptor \p
+ * desc. If \p desc is not an immediate it will be transparently loaded to an
+ * address register using an OR instruction. The returned instruction can be
+ * passed as argument to the usual brw_set_*_message() functions in order to
+ * specify any additional descriptor bits -- If \p desc is an immediate this
+ * will be the SEND instruction itself, otherwise it will be the OR
+ * instruction.
+ */
+struct brw_inst *
+brw_send_indirect_message(struct brw_codegen *p,
+ unsigned sfid,
+ struct brw_reg dst,
+ struct brw_reg payload,
+ struct brw_reg desc);
+
+void brw_ff_sync(struct brw_codegen *p,
+ struct brw_reg dest,
+ unsigned msg_reg_nr,
+ struct brw_reg src0,
+ bool allocate,
+ unsigned response_length,
+ bool eot);
+
+void brw_svb_write(struct brw_codegen *p,
+ struct brw_reg dest,
+ unsigned msg_reg_nr,
+ struct brw_reg src0,
+ unsigned binding_table_index,
+ bool send_commit_msg);
+
+void brw_fb_WRITE(struct brw_codegen *p,
+ struct brw_reg payload,
+ struct brw_reg implied_header,
+ unsigned msg_control,
+ unsigned binding_table_index,
+ unsigned msg_length,
+ unsigned response_length,
+ bool eot,
+ bool last_render_target,
+ bool header_present);
+
+brw_inst *gen9_fb_READ(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg payload,
+ unsigned binding_table_index,
+ unsigned msg_length,
+ unsigned response_length,
+ bool per_sample);
+
+void brw_SAMPLE(struct brw_codegen *p,
+ struct brw_reg dest,
+ unsigned msg_reg_nr,
+ struct brw_reg src0,
+ unsigned binding_table_index,
+ unsigned sampler,
+ unsigned msg_type,
+ unsigned response_length,
+ unsigned msg_length,
+ unsigned header_present,
+ unsigned simd_mode,
+ unsigned return_format);
+
+void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
+ struct brw_reg header,
+ struct brw_reg sampler_index);
+
+void gen4_math(struct brw_codegen *p,
+ struct brw_reg dest,
+ unsigned function,
+ unsigned msg_reg_nr,
+ struct brw_reg src,
+ unsigned precision );
+
+void gen6_math(struct brw_codegen *p,
+ struct brw_reg dest,
+ unsigned function,
+ struct brw_reg src0,
+ struct brw_reg src1);
+
+void brw_oword_block_read(struct brw_codegen *p,
+ struct brw_reg dest,
+ struct brw_reg mrf,
+ uint32_t offset,
+ uint32_t bind_table_index);
+
+unsigned brw_scratch_surface_idx(const struct brw_codegen *p);
+
+void brw_oword_block_read_scratch(struct brw_codegen *p,
+ struct brw_reg dest,
+ struct brw_reg mrf,
+ int num_regs,
+ unsigned offset);
+
+void brw_oword_block_write_scratch(struct brw_codegen *p,
+ struct brw_reg mrf,
+ int num_regs,
+ unsigned offset);
+
+void gen7_block_read_scratch(struct brw_codegen *p,
+ struct brw_reg dest,
+ int num_regs,
+ unsigned offset);
+
+void brw_shader_time_add(struct brw_codegen *p,
+ struct brw_reg payload,
+ uint32_t surf_index);
+
+/**
+ * Return the generation-specific jump distance scaling factor.
+ *
+ * A jump distance counted in instructions has to be multiplied by this
+ * factor to get the value that is actually programmed into an instruction.
+ */
+static inline unsigned
+brw_jump_scale(const struct gen_device_info *devinfo)
+{
+   unsigned scale;
+
+   if (devinfo->gen >= 8) {
+      /* Broadwell measures jump targets in bytes. */
+      scale = 16;
+   } else if (devinfo->gen >= 5) {
+      /* Ironlake and later measure jump targets in 64-bit data chunks (in
+       * order to support compaction), so each 128-bit instruction requires
+       * 2 chunks.
+       */
+      scale = 2;
+   } else {
+      /* Gen4 simply uses the number of 128-bit instructions. */
+      scale = 1;
+   }
+
+   return scale;
+}
+
+void brw_barrier(struct brw_codegen *p, struct brw_reg src);
+
+/* If/else/endif. Works by manipulating the execution flags on each
+ * channel.
+ */
+brw_inst *brw_IF(struct brw_codegen *p, unsigned execute_size);
+brw_inst *gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
+ struct brw_reg src0, struct brw_reg src1);
+
+void brw_ELSE(struct brw_codegen *p);
+void brw_ENDIF(struct brw_codegen *p);
+
+/* DO/WHILE loops:
+ */
+brw_inst *brw_DO(struct brw_codegen *p, unsigned execute_size);
+
+brw_inst *brw_WHILE(struct brw_codegen *p);
+
+brw_inst *brw_BREAK(struct brw_codegen *p);
+brw_inst *brw_CONT(struct brw_codegen *p);
+brw_inst *gen6_HALT(struct brw_codegen *p);
+
+/* Forward jumps:
+ */
+void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx);
+
+brw_inst *brw_JMPI(struct brw_codegen *p, struct brw_reg index,
+ unsigned predicate_control);
+
+void brw_NOP(struct brw_codegen *p);
+
+void brw_WAIT(struct brw_codegen *p);
+
+/* Special case: there is never a destination, execution size will be
+ * taken from src0:
+ */
+void brw_CMP(struct brw_codegen *p,
+ struct brw_reg dest,
+ unsigned conditional,
+ struct brw_reg src0,
+ struct brw_reg src1);
+
+void
+brw_untyped_atomic(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg payload,
+ struct brw_reg surface,
+ unsigned atomic_op,
+ unsigned msg_length,
+ bool response_expected);
+
+void
+brw_untyped_surface_read(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg payload,
+ struct brw_reg surface,
+ unsigned msg_length,
+ unsigned num_channels);
+
+void
+brw_untyped_surface_write(struct brw_codegen *p,
+ struct brw_reg payload,
+ struct brw_reg surface,
+ unsigned msg_length,
+ unsigned num_channels);
+
+void
+brw_typed_atomic(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg payload,
+ struct brw_reg surface,
+ unsigned atomic_op,
+ unsigned msg_length,
+ bool response_expected);
+
+void
+brw_typed_surface_read(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg payload,
+ struct brw_reg surface,
+ unsigned msg_length,
+ unsigned num_channels);
+
+void
+brw_typed_surface_write(struct brw_codegen *p,
+ struct brw_reg payload,
+ struct brw_reg surface,
+ unsigned msg_length,
+ unsigned num_channels);
+
+void
+brw_memory_fence(struct brw_codegen *p,
+ struct brw_reg dst);
+
+void
+brw_pixel_interpolator_query(struct brw_codegen *p,
+ struct brw_reg dest,
+ struct brw_reg mrf,
+ bool noperspective,
+ unsigned mode,
+ struct brw_reg data,
+ unsigned msg_length,
+ unsigned response_length);
+
+void
+brw_find_live_channel(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg mask);
+
+void
+brw_broadcast(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg src,
+ struct brw_reg idx);
+
+/***********************************************************************
+ * brw_eu_util.c:
+ */
+
+void brw_copy_indirect_to_indirect(struct brw_codegen *p,
+ struct brw_indirect dst_ptr,
+ struct brw_indirect src_ptr,
+ unsigned count);
+
+void brw_copy_from_indirect(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_indirect ptr,
+ unsigned count);
+
+void brw_copy4(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg src,
+ unsigned count);
+
+void brw_copy8(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg src,
+ unsigned count);
+
+void brw_math_invert( struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg src);
+
+void brw_set_src1(struct brw_codegen *p, brw_inst *insn, struct brw_reg reg);
+
+void brw_set_uip_jip(struct brw_codegen *p, int start_offset);
+
+enum brw_conditional_mod brw_negate_cmod(uint32_t cmod);
+enum brw_conditional_mod brw_swap_cmod(uint32_t cmod);
+
+/* brw_eu_compact.c */
+void brw_init_compaction_tables(const struct gen_device_info *devinfo);
+void brw_compact_instructions(struct brw_codegen *p, int start_offset,
+ int num_annotations, struct annotation *annotation);
+void brw_uncompact_instruction(const struct gen_device_info *devinfo,
+ brw_inst *dst, brw_compact_inst *src);
+bool brw_try_compact_instruction(const struct gen_device_info *devinfo,
+ brw_compact_inst *dst, brw_inst *src);
+
+void brw_debug_compact_uncompact(const struct gen_device_info *devinfo,
+ brw_inst *orig, brw_inst *uncompacted);
+
+/* brw_eu_validate.c */
+bool brw_validate_instructions(const struct brw_codegen *p, int start_offset,
+ struct annotation_info *annotation);
+
+/**
+ * Return the byte offset of the instruction following the one that starts
+ * \p offset bytes into \p store.
+ *
+ * An instruction whose compaction-control bit is set advances the offset by
+ * 8 bytes; any other instruction advances it by 16 bytes.
+ */
+static inline int
+next_offset(const struct gen_device_info *devinfo, void *store, int offset)
+{
+   brw_inst *insn = (brw_inst *)((char *)store + offset);
+   const int insn_size = brw_inst_cmpt_control(devinfo, insn) ? 8 : 16;
+
+   return offset + insn_size;
+}
+
+/* Descriptor for one opcode, as returned by brw_opcode_desc(). */
+struct opcode_desc {
+ /* The union is an implementation detail used by brw_opcode_desc() to handle
+ * opcodes that have been reused for different instructions across hardware
+ * generations.
+ *
+ * The gens field acts as a tag. If it is non-zero, name points to a string
+ * containing the instruction mnemonic. If it is zero, the table field is
+ * valid and either points to a secondary opcode_desc table with 'size'
+ * elements or is NULL and no such instruction exists for the opcode.
+ */
+ union {
+ /* Leaf entry (gens != 0). */
+ struct {
+ char *name;
+ /* Number of sources; is_3src() tests this against 3. */
+ int nsrc;
+ };
+ /* Redirect entry (gens == 0). */
+ struct {
+ const struct opcode_desc *table;
+ unsigned size;
+ };
+ };
+ /* NOTE(review): presumably the number of destinations -- confirm against
+ * the opcode table definition.
+ */
+ int ndst;
+ /* Tag described above; zero selects the table/size members. */
+ int gens;
+};
+
+const struct opcode_desc *
+brw_opcode_desc(const struct gen_device_info *devinfo, enum opcode opcode);
+
+/**
+ * Report whether \p opcode is described as taking three sources on the given
+ * device. An opcode without a descriptor is reported as not three-source.
+ */
+static inline bool
+is_3src(const struct gen_device_info *devinfo, enum opcode opcode)
+{
+   const struct opcode_desc *desc = brw_opcode_desc(devinfo, opcode);
+
+   if (!desc)
+      return false;
+
+   return desc->nsrc == 3;
+}
+
+/** Maximum SEND message length */
+#define BRW_MAX_MSG_LENGTH 15
+
+/** First MRF register used by spills */
+#define FIRST_SPILL_MRF(gen) ((gen) == 6 ? 21 : 13)
+
+/** First MRF register used by pull loads */
+#define FIRST_PULL_LOAD_MRF(gen) ((gen) == 6 ? 16 : 13)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/intel/compiler/brw_eu_compact.c b/src/intel/compiler/brw_eu_compact.c
new file mode 100644
index 00000000000..b2af76d533a
--- /dev/null
+++ b/src/intel/compiler/brw_eu_compact.c
@@ -0,0 +1,1579 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_eu_compact.c
+ *
+ * Instruction compaction is a feature of G45 and newer hardware that allows
+ * for a smaller instruction encoding.
+ *
+ * The instruction cache is on the order of 32KB, and many programs generate
+ * far more instructions than that. The instruction cache is built to barely
+ * keep up with instruction dispatch ability in cache hit cases -- L1
+ * instruction cache misses that still hit in the next level could limit
+ * throughput by around 50%.
+ *
+ * The idea of instruction compaction is that most instructions use a tiny
+ * subset of the GPU functionality, so we can encode what would be a 16 byte
+ * instruction in 8 bytes using some lookup tables for various fields.
+ *
+ *
+ * Instruction compaction capabilities vary subtly by generation.
+ *
+ * G45's support for instruction compaction is very limited. Jump counts on
+ * this generation are in units of 16-byte uncompacted instructions. As such,
+ * all jump targets must be 16-byte aligned. Also, all instructions must be
+ * naturally aligned, i.e. uncompacted instructions must be 16-byte aligned.
+ * A G45-only instruction, NENOP, must be used to provide padding to align
+ * uncompacted instructions.
+ *
+ * Gen5 removes these restrictions and changes jump counts to be in units of
+ * 8-byte compacted instructions, allowing jump targets to be only 8-byte
+ * aligned. Uncompacted instructions can also be placed on 8-byte boundaries.
+ *
+ * Gen6 adds the ability to compact instructions with a limited range of
+ * immediate values. Compactable immediates have 12 unrestricted bits, and a
+ * 13th bit that's replicated through the high 20 bits, to create the 32-bit
+ * value of DW3 in the uncompacted instruction word.
+ *
+ * On Gen7 we can compact some control flow instructions with a small positive
+ * immediate in the low bits of DW3, like ENDIF with the JIP field. Other
+ * control flow instructions with UIP cannot be compacted, because of the
+ * replicated 13th bit. No control flow instructions can be compacted on Gen6
+ * since the jump count field is not in DW3.
+ *
+ * break JIP/UIP
+ * cont JIP/UIP
+ * halt JIP/UIP
+ * if JIP/UIP
+ * else JIP (plus UIP on BDW+)
+ * endif JIP
+ * while JIP (must be negative)
+ *
+ * Gen8 adds support for compacting 3-src instructions.
+ */
+
+#include "brw_eu.h"
+#include "brw_shader.h"
+#include "intel_asm_annotation.h"
+#include "common/gen_debug.h"
+
+static const uint32_t g45_control_index_table[32] = {
+ 0b00000000000000000,
+ 0b01000000000000000,
+ 0b00110000000000000,
+ 0b00000000000000010,
+ 0b00100000000000000,
+ 0b00010000000000000,
+ 0b01000000000100000,
+ 0b01000000100000000,
+ 0b01010000000100000,
+ 0b00000000100000010,
+ 0b11000000000000000,
+ 0b00001000100000010,
+ 0b01001000100000000,
+ 0b00000000100000000,
+ 0b11000000000100000,
+ 0b00001000100000000,
+ 0b10110000000000000,
+ 0b11010000000100000,
+ 0b00110000100000000,
+ 0b00100000100000000,
+ 0b01000000000001000,
+ 0b01000000000000100,
+ 0b00111100000000000,
+ 0b00101011000000000,
+ 0b00110000000010000,
+ 0b00010000100000000,
+ 0b01000000000100100,
+ 0b01000000000101000,
+ 0b00110000000000110,
+ 0b00000000000001010,
+ 0b01010000000101000,
+ 0b01010000000100100
+};
+
+static const uint32_t g45_datatype_table[32] = {
+ 0b001000000000100001,
+ 0b001011010110101101,
+ 0b001000001000110001,
+ 0b001111011110111101,
+ 0b001011010110101100,
+ 0b001000000110101101,
+ 0b001000000000100000,
+ 0b010100010110110001,
+ 0b001100011000101101,
+ 0b001000000000100010,
+ 0b001000001000110110,
+ 0b010000001000110001,
+ 0b001000001000110010,
+ 0b011000001000110010,
+ 0b001111011110111100,
+ 0b001000000100101000,
+ 0b010100011000110001,
+ 0b001010010100101001,
+ 0b001000001000101001,
+ 0b010000001000110110,
+ 0b101000001000110001,
+ 0b001011011000101101,
+ 0b001000000100001001,
+ 0b001011011000101100,
+ 0b110100011000110001,
+ 0b001000001110111101,
+ 0b110000001000110001,
+ 0b011000000100101010,
+ 0b101000001000101001,
+ 0b001011010110001100,
+ 0b001000000110100001,
+ 0b001010010100001000
+};
+
+static const uint16_t g45_subreg_table[32] = {
+ 0b000000000000000,
+ 0b000000010000000,
+ 0b000001000000000,
+ 0b000100000000000,
+ 0b000000000100000,
+ 0b100000000000000,
+ 0b000000000010000,
+ 0b001100000000000,
+ 0b001010000000000,
+ 0b000000100000000,
+ 0b001000000000000,
+ 0b000000000001000,
+ 0b000000001000000,
+ 0b000000000000001,
+ 0b000010000000000,
+ 0b000000010100000,
+ 0b000000000000111,
+ 0b000001000100000,
+ 0b011000000000000,
+ 0b000000110000000,
+ 0b000000000000010,
+ 0b000000000000100,
+ 0b000000001100000,
+ 0b000100000000010,
+ 0b001110011000110,
+ 0b001110100001000,
+ 0b000110011000110,
+ 0b000001000011000,
+ 0b000110010000100,
+ 0b001100000000110,
+ 0b000000010000110,
+ 0b000001000110000
+};
+
+static const uint16_t g45_src_index_table[32] = {
+ 0b000000000000,
+ 0b010001101000,
+ 0b010110001000,
+ 0b011010010000,
+ 0b001101001000,
+ 0b010110001010,
+ 0b010101110000,
+ 0b011001111000,
+ 0b001000101000,
+ 0b000000101000,
+ 0b010001010000,
+ 0b111101101100,
+ 0b010110001100,
+ 0b010001101100,
+ 0b011010010100,
+ 0b010001001100,
+ 0b001100101000,
+ 0b000000000010,
+ 0b111101001100,
+ 0b011001101000,
+ 0b010101001000,
+ 0b000000000100,
+ 0b000000101100,
+ 0b010001101010,
+ 0b000000111000,
+ 0b010101011000,
+ 0b000100100000,
+ 0b010110000000,
+ 0b010000000100,
+ 0b010000111000,
+ 0b000101100000,
+ 0b111101110100
+};
+
+static const uint32_t gen6_control_index_table[32] = {
+ 0b00000000000000000,
+ 0b01000000000000000,
+ 0b00110000000000000,
+ 0b00000000100000000,
+ 0b00010000000000000,
+ 0b00001000100000000,
+ 0b00000000100000010,
+ 0b00000000000000010,
+ 0b01000000100000000,
+ 0b01010000000000000,
+ 0b10110000000000000,
+ 0b00100000000000000,
+ 0b11010000000000000,
+ 0b11000000000000000,
+ 0b01001000100000000,
+ 0b01000000000001000,
+ 0b01000000000000100,
+ 0b00000000000001000,
+ 0b00000000000000100,
+ 0b00111000100000000,
+ 0b00001000100000010,
+ 0b00110000100000000,
+ 0b00110000000000001,
+ 0b00100000000000001,
+ 0b00110000000000010,
+ 0b00110000000000101,
+ 0b00110000000001001,
+ 0b00110000000010000,
+ 0b00110000000000011,
+ 0b00110000000000100,
+ 0b00110000100001000,
+ 0b00100000000001001
+};
+
+static const uint32_t gen6_datatype_table[32] = {
+ 0b001001110000000000,
+ 0b001000110000100000,
+ 0b001001110000000001,
+ 0b001000000001100000,
+ 0b001010110100101001,
+ 0b001000000110101101,
+ 0b001100011000101100,
+ 0b001011110110101101,
+ 0b001000000111101100,
+ 0b001000000001100001,
+ 0b001000110010100101,
+ 0b001000000001000001,
+ 0b001000001000110001,
+ 0b001000001000101001,
+ 0b001000000000100000,
+ 0b001000001000110010,
+ 0b001010010100101001,
+ 0b001011010010100101,
+ 0b001000000110100101,
+ 0b001100011000101001,
+ 0b001011011000101100,
+ 0b001011010110100101,
+ 0b001011110110100101,
+ 0b001111011110111101,
+ 0b001111011110111100,
+ 0b001111011110111101,
+ 0b001111011110011101,
+ 0b001111011110111110,
+ 0b001000000000100001,
+ 0b001000000000100010,
+ 0b001001111111011101,
+ 0b001000001110111110,
+};
+
+static const uint16_t gen6_subreg_table[32] = {
+ 0b000000000000000,
+ 0b000000000000100,
+ 0b000000110000000,
+ 0b111000000000000,
+ 0b011110000001000,
+ 0b000010000000000,
+ 0b000000000010000,
+ 0b000110000001100,
+ 0b001000000000000,
+ 0b000001000000000,
+ 0b000001010010100,
+ 0b000000001010110,
+ 0b010000000000000,
+ 0b110000000000000,
+ 0b000100000000000,
+ 0b000000010000000,
+ 0b000000000001000,
+ 0b100000000000000,
+ 0b000001010000000,
+ 0b001010000000000,
+ 0b001100000000000,
+ 0b000000001010100,
+ 0b101101010010100,
+ 0b010100000000000,
+ 0b000000010001111,
+ 0b011000000000000,
+ 0b111110000000000,
+ 0b101000000000000,
+ 0b000000000001111,
+ 0b000100010001111,
+ 0b001000010001111,
+ 0b000110000000000,
+};
+
+static const uint16_t gen6_src_index_table[32] = {
+ 0b000000000000,
+ 0b010110001000,
+ 0b010001101000,
+ 0b001000101000,
+ 0b011010010000,
+ 0b000100100000,
+ 0b010001101100,
+ 0b010101110000,
+ 0b011001111000,
+ 0b001100101000,
+ 0b010110001100,
+ 0b001000100000,
+ 0b010110001010,
+ 0b000000000010,
+ 0b010101010000,
+ 0b010101101000,
+ 0b111101001100,
+ 0b111100101100,
+ 0b011001110000,
+ 0b010110001001,
+ 0b010101011000,
+ 0b001101001000,
+ 0b010000101100,
+ 0b010000000000,
+ 0b001101110000,
+ 0b001100010000,
+ 0b001100000000,
+ 0b010001101010,
+ 0b001101111000,
+ 0b000001110000,
+ 0b001100100000,
+ 0b001101010000,
+};
+
+static const uint32_t gen7_control_index_table[32] = {
+ 0b0000000000000000010,
+ 0b0000100000000000000,
+ 0b0000100000000000001,
+ 0b0000100000000000010,
+ 0b0000100000000000011,
+ 0b0000100000000000100,
+ 0b0000100000000000101,
+ 0b0000100000000000111,
+ 0b0000100000000001000,
+ 0b0000100000000001001,
+ 0b0000100000000001101,
+ 0b0000110000000000000,
+ 0b0000110000000000001,
+ 0b0000110000000000010,
+ 0b0000110000000000011,
+ 0b0000110000000000100,
+ 0b0000110000000000101,
+ 0b0000110000000000111,
+ 0b0000110000000001001,
+ 0b0000110000000001101,
+ 0b0000110000000010000,
+ 0b0000110000100000000,
+ 0b0001000000000000000,
+ 0b0001000000000000010,
+ 0b0001000000000000100,
+ 0b0001000000100000000,
+ 0b0010110000000000000,
+ 0b0010110000000010000,
+ 0b0011000000000000000,
+ 0b0011000000100000000,
+ 0b0101000000000000000,
+ 0b0101000000100000000
+};
+
+static const uint32_t gen7_datatype_table[32] = {
+ 0b001000000000000001,
+ 0b001000000000100000,
+ 0b001000000000100001,
+ 0b001000000001100001,
+ 0b001000000010111101,
+ 0b001000001011111101,
+ 0b001000001110100001,
+ 0b001000001110100101,
+ 0b001000001110111101,
+ 0b001000010000100001,
+ 0b001000110000100000,
+ 0b001000110000100001,
+ 0b001001010010100101,
+ 0b001001110010100100,
+ 0b001001110010100101,
+ 0b001111001110111101,
+ 0b001111011110011101,
+ 0b001111011110111100,
+ 0b001111011110111101,
+ 0b001111111110111100,
+ 0b000000001000001100,
+ 0b001000000000111101,
+ 0b001000000010100101,
+ 0b001000010000100000,
+ 0b001001010010100100,
+ 0b001001110010000100,
+ 0b001010010100001001,
+ 0b001101111110111101,
+ 0b001111111110111101,
+ 0b001011110110101100,
+ 0b001010010100101000,
+ 0b001010110100101000
+};
+
+static const uint16_t gen7_subreg_table[32] = {
+ 0b000000000000000,
+ 0b000000000000001,
+ 0b000000000001000,
+ 0b000000000001111,
+ 0b000000000010000,
+ 0b000000010000000,
+ 0b000000100000000,
+ 0b000000110000000,
+ 0b000001000000000,
+ 0b000001000010000,
+ 0b000010100000000,
+ 0b001000000000000,
+ 0b001000000000001,
+ 0b001000010000001,
+ 0b001000010000010,
+ 0b001000010000011,
+ 0b001000010000100,
+ 0b001000010000111,
+ 0b001000010001000,
+ 0b001000010001110,
+ 0b001000010001111,
+ 0b001000110000000,
+ 0b001000111101000,
+ 0b010000000000000,
+ 0b010000110000000,
+ 0b011000000000000,
+ 0b011110010000111,
+ 0b100000000000000,
+ 0b101000000000000,
+ 0b110000000000000,
+ 0b111000000000000,
+ 0b111000000011100
+};
+
+static const uint16_t gen7_src_index_table[32] = {
+ 0b000000000000,
+ 0b000000000010,
+ 0b000000010000,
+ 0b000000010010,
+ 0b000000011000,
+ 0b000000100000,
+ 0b000000101000,
+ 0b000001001000,
+ 0b000001010000,
+ 0b000001110000,
+ 0b000001111000,
+ 0b001100000000,
+ 0b001100000010,
+ 0b001100001000,
+ 0b001100010000,
+ 0b001100010010,
+ 0b001100100000,
+ 0b001100101000,
+ 0b001100111000,
+ 0b001101000000,
+ 0b001101000010,
+ 0b001101001000,
+ 0b001101010000,
+ 0b001101100000,
+ 0b001101101000,
+ 0b001101110000,
+ 0b001101110001,
+ 0b001101111000,
+ 0b010001101000,
+ 0b010001101001,
+ 0b010001101010,
+ 0b010110001000
+};
+
+static const uint32_t gen8_control_index_table[32] = {
+ 0b0000000000000000010,
+ 0b0000100000000000000,
+ 0b0000100000000000001,
+ 0b0000100000000000010,
+ 0b0000100000000000011,
+ 0b0000100000000000100,
+ 0b0000100000000000101,
+ 0b0000100000000000111,
+ 0b0000100000000001000,
+ 0b0000100000000001001,
+ 0b0000100000000001101,
+ 0b0000110000000000000,
+ 0b0000110000000000001,
+ 0b0000110000000000010,
+ 0b0000110000000000011,
+ 0b0000110000000000100,
+ 0b0000110000000000101,
+ 0b0000110000000000111,
+ 0b0000110000000001001,
+ 0b0000110000000001101,
+ 0b0000110000000010000,
+ 0b0000110000100000000,
+ 0b0001000000000000000,
+ 0b0001000000000000010,
+ 0b0001000000000000100,
+ 0b0001000000100000000,
+ 0b0010110000000000000,
+ 0b0010110000000010000,
+ 0b0011000000000000000,
+ 0b0011000000100000000,
+ 0b0101000000000000000,
+ 0b0101000000100000000
+};
+
+static const uint32_t gen8_datatype_table[32] = {
+ 0b001000000000000000001,
+ 0b001000000000001000000,
+ 0b001000000000001000001,
+ 0b001000000000011000001,
+ 0b001000000000101011101,
+ 0b001000000010111011101,
+ 0b001000000011101000001,
+ 0b001000000011101000101,
+ 0b001000000011101011101,
+ 0b001000001000001000001,
+ 0b001000011000001000000,
+ 0b001000011000001000001,
+ 0b001000101000101000101,
+ 0b001000111000101000100,
+ 0b001000111000101000101,
+ 0b001011100011101011101,
+ 0b001011101011100011101,
+ 0b001011101011101011100,
+ 0b001011101011101011101,
+ 0b001011111011101011100,
+ 0b000000000010000001100,
+ 0b001000000000001011101,
+ 0b001000000000101000101,
+ 0b001000001000001000000,
+ 0b001000101000101000100,
+ 0b001000111000100000100,
+ 0b001001001001000001001,
+ 0b001010111011101011101,
+ 0b001011111011101011101,
+ 0b001001111001101001100,
+ 0b001001001001001001000,
+ 0b001001011001001001000
+};
+
+static const uint16_t gen8_subreg_table[32] = {
+ 0b000000000000000,
+ 0b000000000000001,
+ 0b000000000001000,
+ 0b000000000001111,
+ 0b000000000010000,
+ 0b000000010000000,
+ 0b000000100000000,
+ 0b000000110000000,
+ 0b000001000000000,
+ 0b000001000010000,
+ 0b000001010000000,
+ 0b001000000000000,
+ 0b001000000000001,
+ 0b001000010000001,
+ 0b001000010000010,
+ 0b001000010000011,
+ 0b001000010000100,
+ 0b001000010000111,
+ 0b001000010001000,
+ 0b001000010001110,
+ 0b001000010001111,
+ 0b001000110000000,
+ 0b001000111101000,
+ 0b010000000000000,
+ 0b010000110000000,
+ 0b011000000000000,
+ 0b011110010000111,
+ 0b100000000000000,
+ 0b101000000000000,
+ 0b110000000000000,
+ 0b111000000000000,
+ 0b111000000011100
+};
+
+static const uint16_t gen8_src_index_table[32] = {
+ 0b000000000000,
+ 0b000000000010,
+ 0b000000010000,
+ 0b000000010010,
+ 0b000000011000,
+ 0b000000100000,
+ 0b000000101000,
+ 0b000001001000,
+ 0b000001010000,
+ 0b000001110000,
+ 0b000001111000,
+ 0b001100000000,
+ 0b001100000010,
+ 0b001100001000,
+ 0b001100010000,
+ 0b001100010010,
+ 0b001100100000,
+ 0b001100101000,
+ 0b001100111000,
+ 0b001101000000,
+ 0b001101000010,
+ 0b001101001000,
+ 0b001101010000,
+ 0b001101100000,
+ 0b001101101000,
+ 0b001101110000,
+ 0b001101110001,
+ 0b001101111000,
+ 0b010001101000,
+ 0b010001101001,
+ 0b010001101010,
+ 0b010110001000
+};
+
+/* This is actually the control index table for Cherryview (26 bits), but the
+ * only difference from Broadwell (24 bits) is that it has two extra 0-bits at
+ * the start.
+ *
+ * The low 24 bits have the same mappings on both hardware.
+ *
+ * set_3src_control_index() searches this table for the packed control bits of
+ * an uncompacted 3-src instruction and stores the position of the match as
+ * the compacted control index.
+ */
+static const uint32_t gen8_3src_control_index_table[4] = {
+ 0b00100000000110000000000001,
+ 0b00000000000110000000000001,
+ 0b00000000001000000000000001,
+ 0b00000000001000000000100001
+};
+
+/* This is actually the source index table for Cherryview (49 bits), but the
+ * only difference from Broadwell (46 bits) is that it has three extra 0-bits
+ * at the start.
+ *
+ * The low 44 bits have the same mappings on both hardware, and since the high
+ * three bits on Broadwell are zero, we can reuse Cherryview's table.
+ *
+ * set_3src_source_index() searches this table for the packed source bits of
+ * an uncompacted 3-src instruction and stores the position of the match as
+ * the compacted source index.
+ */
+static const uint64_t gen8_3src_source_index_table[4] = {
+ 0b0000001110010011100100111001000001111000000000000,
+ 0b0000001110010011100100111001000001111000000000010,
+ 0b0000001110010011100100111001000001111000000001000,
+ 0b0000001110010011100100111001000001111000000100000
+};
+
+/* Lookup tables consulted by the set_*_index(), get_src_index() and
+ * set_uncompacted_*() helpers in this file. Each helper indexes or scans
+ * exactly 32 entries.
+ *
+ * NOTE(review): nothing in this part of the file assigns these pointers;
+ * presumably brw_init_compaction_tables() points them at the tables of the
+ * current generation -- confirm against its definition.
+ */
+static const uint32_t *control_index_table;
+static const uint32_t *datatype_table;
+static const uint16_t *subreg_table;
+static const uint16_t *src_index_table;
+
+/**
+ * Pack the control bits of \p src into one word and look that word up in
+ * control_index_table.
+ *
+ * On a match the table position is stored as the control index of \p dst and
+ * true is returned. Otherwise \p dst is left untouched and false is returned.
+ */
+static bool
+set_control_index(const struct gen_device_info *devinfo,
+                  brw_compact_inst *dst, brw_inst *src)
+{
+   uint32_t packed; /* 17b/G45; 19b/IVB+ */
+
+   if (devinfo->gen >= 8) {
+      packed = (brw_inst_bits(src, 33, 31) << 16) | /*  3b */
+               (brw_inst_bits(src, 23, 12) <<  4) | /* 12b */
+               (brw_inst_bits(src, 10,  9) <<  2) | /*  2b */
+               (brw_inst_bits(src, 34, 34) <<  1) | /*  1b */
+               (brw_inst_bits(src,  8,  8));        /*  1b */
+   } else {
+      packed = (brw_inst_bits(src, 31, 31) << 16) | /*  1b */
+               (brw_inst_bits(src, 23,  8));        /* 16b */
+   }
+
+   /* On gen7, the flag register and subregister numbers are integrated into
+    * the control index as its two topmost bits.
+    */
+   if (devinfo->gen == 7)
+      packed |= brw_inst_bits(src, 90, 89) << 17;   /*  2b */
+
+   int idx = 0;
+   while (idx < 32 && control_index_table[idx] != packed)
+      idx++;
+
+   if (idx == 32)
+      return false;
+
+   brw_compact_inst_set_control_index(devinfo, dst, idx);
+   return true;
+}
+
+/**
+ * Pack the datatype bits of \p src into one word and look that word up in
+ * datatype_table.
+ *
+ * On a match the table position is stored as the datatype index of \p dst
+ * and true is returned. Otherwise \p dst is left untouched and false is
+ * returned.
+ */
+static bool
+set_datatype_index(const struct gen_device_info *devinfo, brw_compact_inst *dst,
+                   brw_inst *src)
+{
+   uint32_t packed; /* 18b/G45+; 21b/BDW+ */
+
+   if (devinfo->gen >= 8) {
+      packed = (brw_inst_bits(src, 63, 61) << 18) | /*  3b */
+               (brw_inst_bits(src, 94, 89) << 12) | /*  6b */
+               (brw_inst_bits(src, 46, 35));        /* 12b */
+   } else {
+      packed = (brw_inst_bits(src, 63, 61) << 15) | /*  3b */
+               (brw_inst_bits(src, 46, 32));        /* 15b */
+   }
+
+   int idx = 0;
+   while (idx < 32 && datatype_table[idx] != packed)
+      idx++;
+
+   if (idx == 32)
+      return false;
+
+   brw_compact_inst_set_datatype_index(devinfo, dst, idx);
+   return true;
+}
+
+/**
+ * Pack the three 5-bit fields at bits 52:48, 68:64 and 100:96 of \p src into
+ * one 15-bit word and look that word up in subreg_table.
+ *
+ * When \p is_immediate is set the 100:96 field is not read and its slot in
+ * the packed word stays zero.
+ *
+ * On a match the table position is stored as the subreg index of \p dst and
+ * true is returned. Otherwise \p dst is left untouched and false is returned.
+ */
+static bool
+set_subreg_index(const struct gen_device_info *devinfo, brw_compact_inst *dst,
+                 brw_inst *src, bool is_immediate)
+{
+   uint16_t packed =                                /* 15b */
+      (brw_inst_bits(src, 68, 64) << 5) |           /*  5b */
+      (brw_inst_bits(src, 52, 48));                 /*  5b */
+
+   if (!is_immediate)
+      packed |= brw_inst_bits(src, 100, 96) << 10;  /*  5b */
+
+   int idx = 0;
+   while (idx < 32 && subreg_table[idx] != packed)
+      idx++;
+
+   if (idx == 32)
+      return false;
+
+   brw_compact_inst_set_subreg_index(devinfo, dst, idx);
+   return true;
+}
+
+/**
+ * Look \p uncompacted up in src_index_table.
+ *
+ * On a match the table position is written to \p compacted and true is
+ * returned. Otherwise \p compacted is not written and false is returned.
+ */
+static bool
+get_src_index(uint16_t uncompacted,
+              uint16_t *compacted)
+{
+   int idx = 0;
+
+   while (idx < 32 && src_index_table[idx] != uncompacted)
+      idx++;
+
+   if (idx == 32)
+      return false;
+
+   *compacted = idx;
+   return true;
+}
+
+/**
+ * Translate bits 88:77 of \p src through the source index table and store
+ * the result as the src0 index of \p dst.
+ *
+ * Returns false, without touching \p dst, when the bits have no table entry.
+ */
+static bool
+set_src0_index(const struct gen_device_info *devinfo,
+               brw_compact_inst *dst, brw_inst *src)
+{
+   uint16_t index;
+
+   if (get_src_index(brw_inst_bits(src, 88, 77) /* 12b */, &index)) {
+      brw_compact_inst_set_src0_index(devinfo, dst, index);
+      return true;
+   }
+
+   return false;
+}
+
+/**
+ * Fill in the src1 index of \p dst.
+ *
+ * For an immediate the field receives bits 12:8 of the 32-bit immediate, so
+ * no table lookup is involved and the call always succeeds. Otherwise bits
+ * 120:109 of \p src are translated through the source index table; false is
+ * returned, without touching \p dst, when they have no table entry.
+ */
+static bool
+set_src1_index(const struct gen_device_info *devinfo, brw_compact_inst *dst,
+               brw_inst *src, bool is_immediate)
+{
+   uint16_t index;
+
+   if (is_immediate)
+      index = (brw_inst_imm_ud(devinfo, src) >> 8) & 0x1f;
+   else if (!get_src_index(brw_inst_bits(src, 120, 109) /* 12b */, &index))
+      return false;
+
+   brw_compact_inst_set_src1_index(devinfo, dst, index);
+   return true;
+}
+
+/**
+ * Pack the control bits of the 3-src instruction \p src into one word and
+ * look that word up in gen8_3src_control_index_table.
+ *
+ * On a match the table position is stored as the 3-src control index of
+ * \p dst and true is returned; otherwise false is returned. Gen8+ only.
+ */
+static bool
+set_3src_control_index(const struct gen_device_info *devinfo,
+                       brw_compact_inst *dst, brw_inst *src)
+{
+   assert(devinfo->gen >= 8);
+
+   uint32_t packed =                               /* 24b/BDW; 26b/CHV */
+      (brw_inst_bits(src, 28, 8)) |                /* 21b */
+      (brw_inst_bits(src, 34, 32) << 21);          /*  3b */
+
+   /* Cherryview and gen9+ contribute two more bits on top. */
+   if (devinfo->gen >= 9 || devinfo->is_cherryview)
+      packed |= brw_inst_bits(src, 36, 35) << 24;  /*  2b */
+
+   const unsigned count = ARRAY_SIZE(gen8_3src_control_index_table);
+   unsigned idx = 0;
+   while (idx < count && gen8_3src_control_index_table[idx] != packed)
+      idx++;
+
+   if (idx == count)
+      return false;
+
+   brw_compact_inst_set_3src_control_index(devinfo, dst, idx);
+   return true;
+}
+
+/**
+ * Pack the source bits of the 3-src instruction \p src into one word and
+ * look that word up in gen8_3src_source_index_table.
+ *
+ * The low 44 bits are assembled the same way on every supported device; the
+ * bits above them come from different instruction bits depending on whether
+ * the device is Cherryview/gen9+ or not.
+ *
+ * On a match the table position is stored as the 3-src source index of
+ * \p dst and true is returned; otherwise false is returned. Gen8+ only.
+ */
+static bool
+set_3src_source_index(const struct gen_device_info *devinfo,
+                      brw_compact_inst *dst, brw_inst *src)
+{
+   assert(devinfo->gen >= 8);
+
+   uint64_t packed =                               /* 46b/BDW; 49b/CHV */
+      (brw_inst_bits(src,  55,  37)) |             /* 19b */
+      (brw_inst_bits(src,  72,  65) << 19) |       /*  8b */
+      (brw_inst_bits(src,  93,  86) << 27) |       /*  8b */
+      (brw_inst_bits(src, 114, 107) << 35) |       /*  8b */
+      (brw_inst_bits(src,  83,  83) << 43);        /*  1b */
+
+   if (devinfo->gen >= 9 || devinfo->is_cherryview) {
+      packed |=
+         (brw_inst_bits(src,  84,  84) << 44) |    /*  1b */
+         (brw_inst_bits(src, 105, 104) << 45) |    /*  2b */
+         (brw_inst_bits(src, 126, 125) << 47);     /*  2b */
+   } else {
+      packed |=
+         (brw_inst_bits(src, 104, 104) << 44) |    /*  1b */
+         (brw_inst_bits(src, 125, 125) << 45);     /*  1b */
+   }
+
+   const unsigned count = ARRAY_SIZE(gen8_3src_source_index_table);
+   unsigned idx = 0;
+   while (idx < count && gen8_3src_source_index_table[idx] != packed)
+      idx++;
+
+   if (idx == count)
+      return false;
+
+   brw_compact_inst_set_3src_source_index(devinfo, dst, idx);
+   return true;
+}
+
+/**
+ * Report whether \p src uses anything the compact encoding cannot express,
+ * in which case the instruction must stay uncompacted.
+ */
+static bool
+has_unmapped_bits(const struct gen_device_info *devinfo, brw_inst *src)
+{
+   /* EOT can only be mapped on a send if the src1 is an immediate, so any
+    * SEND/SENDC with EOT set is rejected here.
+    */
+   const bool is_send =
+      brw_inst_opcode(devinfo, src) == BRW_OPCODE_SENDC ||
+      brw_inst_opcode(devinfo, src) == BRW_OPCODE_SEND;
+
+   if (is_send && brw_inst_eot(devinfo, src))
+      return true;
+
+   /* The remaining checks look at instruction bits that have no home in any
+    * field of the compacted instruction. If one of them is set the
+    * instruction cannot be compacted. They overlap with:
+    *  - NibCtrl (bit 47 on Gen7, bit 11 on Gen8)
+    *  - Dst.AddrImm[9] (bit 47 on Gen8)
+    *  - Src0.AddrImm[9] (bit 95 on Gen8)
+    *  - Imm64[27:31] (bits 91-95 on Gen7, bit 95 on Gen8)
+    *  - UIP[31] (bit 95 on Gen8)
+    */
+   if (devinfo->gen < 8) {
+      assert(!brw_inst_bits(src, 7, 7) &&
+             !(devinfo->gen < 7 && brw_inst_bits(src, 90, 90)));
+      return brw_inst_bits(src, 95, 91) ||
+             brw_inst_bits(src, 47, 47);
+   }
+
+   assert(!brw_inst_bits(src, 7, 7));
+   return brw_inst_bits(src, 95, 95) ||
+          brw_inst_bits(src, 47, 47) ||
+          brw_inst_bits(src, 11, 11);
+}
+
+/**
+ * Three-source counterpart of has_unmapped_bits().
+ *
+ * Every bit without a home in the compact 3-src encoding seems to be reserved
+ * at the moment, so this only asserts that those bits are clear and always
+ * returns false.
+ */
+static bool
+has_3src_unmapped_bits(const struct gen_device_info *devinfo, brw_inst *src)
+{
+   const bool chv_or_gen9 = devinfo->gen >= 9 || devinfo->is_cherryview;
+
+   if (!chv_or_gen9) {
+      assert(devinfo->gen >= 8);
+      assert(!brw_inst_bits(src, 127, 126) &&
+             !brw_inst_bits(src, 105, 105) &&
+             !brw_inst_bits(src, 84, 84) &&
+             !brw_inst_bits(src, 36, 35) &&
+             !brw_inst_bits(src, 7, 7));
+   } else {
+      assert(!brw_inst_bits(src, 127, 127) &&
+             !brw_inst_bits(src, 7, 7));
+   }
+
+   return false;
+}
+
+/**
+ * Try to encode the three-source instruction src in compact form in dst.
+ *
+ * Returns true on success. Unlike brw_try_compact_instruction(), this writes
+ * into dst as it goes (the opcode is stored before the index lookups can
+ * fail), so dst may be partially filled in when false is returned. The
+ * caller in this file passes a zeroed scratch instruction for that reason.
+ */
+static bool
+brw_try_compact_3src_instruction(const struct gen_device_info *devinfo,
+ brw_compact_inst *dst, brw_inst *src)
+{
+ assert(devinfo->gen >= 8);
+
+ if (has_3src_unmapped_bits(devinfo, src))
+ return false;
+
+ /* compact(f) copies 3-src field f from src into the same-named compact
+ * field of dst.
+ */
+#define compact(field) \
+ brw_compact_inst_set_3src_##field(devinfo, dst, brw_inst_3src_##field(devinfo, src))
+
+ compact(opcode);
+
+ /* Both table lookups must succeed for the instruction to be compactable. */
+ if (!set_3src_control_index(devinfo, dst, src))
+ return false;
+
+ if (!set_3src_source_index(devinfo, dst, src))
+ return false;
+
+ compact(dst_reg_nr);
+ compact(src0_rep_ctrl);
+ /* Mark the result as a compacted instruction. */
+ brw_compact_inst_set_3src_cmpt_control(devinfo, dst, true);
+ compact(debug_control);
+ compact(saturate);
+ compact(src1_rep_ctrl);
+ compact(src2_rep_ctrl);
+ compact(src0_reg_nr);
+ compact(src1_reg_nr);
+ compact(src2_reg_nr);
+ compact(src0_subreg_nr);
+ compact(src1_subreg_nr);
+ compact(src2_subreg_nr);
+
+#undef compact
+
+ return true;
+}
+
+/* A compacted instruction stores 12 bits of an immediate source verbatim plus
+ * a 13th bit that is replicated through the high 20 bits.
+ *
+ * In practice that covers 12-bit integers, 0.0f, and some limited uses of
+ * packed vectors.
+ */
+static bool
+is_compactable_immediate(unsigned imm)
+{
+   /* The low 12 bits are kept as-is, so only the bits above them decide:
+    * they must all be zero or all be one.
+    */
+   switch (imm & ~0xfffu) {
+   case 0:
+   case 0xfffff000:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/**
+ * Tries to compact instruction src into dst.
+ *
+ * It doesn't modify dst unless src is compactable, which is relied on by
+ * brw_compact_instructions(). To guarantee that, the compact encoding is
+ * built in a local scratch instruction and copied to dst only on success.
+ *
+ * src must not already be marked as compacted.
+ *
+ * Returns true if dst now holds the compact encoding of src, false if src
+ * cannot be compacted.
+ */
+bool
+brw_try_compact_instruction(const struct gen_device_info *devinfo,
+ brw_compact_inst *dst, brw_inst *src)
+{
+ brw_compact_inst temp;
+
+ assert(brw_inst_cmpt_control(devinfo, src) == 0);
+
+ /* Three-source instructions have their own compact encoding, which is only
+ * attempted on gen8+.
+ */
+ if (is_3src(devinfo, brw_inst_opcode(devinfo, src))) {
+ if (devinfo->gen >= 8) {
+ memset(&temp, 0, sizeof(temp));
+ if (brw_try_compact_3src_instruction(devinfo, &temp, src)) {
+ *dst = temp;
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ /* Immediates are never compacted before gen6, and from gen6 on only when
+ * the value fits the compact immediate encoding.
+ */
+ bool is_immediate =
+ brw_inst_src0_reg_file(devinfo, src) == BRW_IMMEDIATE_VALUE ||
+ brw_inst_src1_reg_file(devinfo, src) == BRW_IMMEDIATE_VALUE;
+ if (is_immediate &&
+ (devinfo->gen < 6 ||
+ !is_compactable_immediate(brw_inst_imm_ud(devinfo, src)))) {
+ return false;
+ }
+
+ if (has_unmapped_bits(devinfo, src))
+ return false;
+
+ memset(&temp, 0, sizeof(temp));
+
+ /* compact(f) copies field f from src into the same-named field of temp. */
+#define compact(field) \
+ brw_compact_inst_set_##field(devinfo, &temp, brw_inst_##field(devinfo, src))
+
+ compact(opcode);
+ compact(debug_control);
+
+ /* Each table lookup fails when the bit pattern in src has no table entry. */
+ if (!set_control_index(devinfo, &temp, src))
+ return false;
+ if (!set_datatype_index(devinfo, &temp, src))
+ return false;
+ if (!set_subreg_index(devinfo, &temp, src, is_immediate))
+ return false;
+
+ if (devinfo->gen >= 6) {
+ compact(acc_wr_control);
+ } else {
+ compact(mask_control_ex);
+ }
+
+ compact(cond_modifier);
+
+ /* On gen7 the flag bits are part of the control index instead (see
+ * set_control_index()).
+ */
+ if (devinfo->gen <= 6)
+ compact(flag_subreg_nr);
+
+ brw_compact_inst_set_cmpt_control(devinfo, &temp, true);
+
+ if (!set_src0_index(devinfo, &temp, src))
+ return false;
+ if (!set_src1_index(devinfo, &temp, src, is_immediate))
+ return false;
+
+ brw_compact_inst_set_dst_reg_nr(devinfo, &temp,
+ brw_inst_dst_da_reg_nr(devinfo, src));
+ brw_compact_inst_set_src0_reg_nr(devinfo, &temp,
+ brw_inst_src0_da_reg_nr(devinfo, src));
+
+ /* For an immediate, src1_reg_nr carries bits 7:0 of the value; bits 12:8
+ * were stored in the src1 index by set_src1_index().
+ */
+ if (is_immediate) {
+ brw_compact_inst_set_src1_reg_nr(devinfo, &temp,
+ brw_inst_imm_ud(devinfo, src) & 0xff);
+ } else {
+ brw_compact_inst_set_src1_reg_nr(devinfo, &temp,
+ brw_inst_src1_da_reg_nr(devinfo, src));
+ }
+
+#undef compact
+
+ *dst = temp;
+
+ return true;
+}
+
+/* Looks up the control index of the compacted instruction src in
+ * control_index_table and scatters the resulting bits into dst.  The bit
+ * positions written depend on the hardware generation.
+ */
+static void
+set_uncompacted_control(const struct gen_device_info *devinfo, brw_inst *dst,
+                        brw_compact_inst *src)
+{
+   const uint32_t bits =
+      control_index_table[brw_compact_inst_control_index(devinfo, src)];
+
+   if (devinfo->gen < 8) {
+      brw_inst_set_bits(dst, 31, 31, (bits >> 16) & 0x1);
+      brw_inst_set_bits(dst, 23,  8, (bits & 0xffff));
+
+      /* On Gen7 the bits above bit 16 of the entry go to bits 90:89. */
+      if (devinfo->gen == 7)
+         brw_inst_set_bits(dst, 90, 89, bits >> 17);
+
+      return;
+   }
+
+   /* Gen8+ layout. */
+   brw_inst_set_bits(dst, 33, 31, (bits >> 16));
+   brw_inst_set_bits(dst, 23, 12, (bits >>  4) & 0xfff);
+   brw_inst_set_bits(dst, 10,  9, (bits >>  2) & 0x3);
+   brw_inst_set_bits(dst, 34, 34, (bits >>  1) & 0x1);
+   brw_inst_set_bits(dst,  8,  8, (bits >>  0) & 0x1);
+}
+
+/* Looks up the datatype index of the compacted instruction src in
+ * datatype_table and scatters the resulting bits into dst.  Gen8+ splits
+ * the entry into three fields, earlier generations into two.
+ */
+static void
+set_uncompacted_datatype(const struct gen_device_info *devinfo, brw_inst *dst,
+                         brw_compact_inst *src)
+{
+   const uint32_t bits =
+      datatype_table[brw_compact_inst_datatype_index(devinfo, src)];
+
+   if (devinfo->gen < 8) {
+      brw_inst_set_bits(dst, 63, 61, (bits >> 15));
+      brw_inst_set_bits(dst, 46, 32, (bits & 0x7fff));
+      return;
+   }
+
+   /* Gen8+ layout. */
+   brw_inst_set_bits(dst, 63, 61, (bits >> 18));
+   brw_inst_set_bits(dst, 94, 89, (bits >> 12) & 0x3f);
+   brw_inst_set_bits(dst, 46, 35, (bits >>  0) & 0xfff);
+}
+
+/* Looks up the subreg index of the compacted instruction src in
+ * subreg_table and writes the entry's three 5-bit groups into dst.
+ */
+static void
+set_uncompacted_subreg(const struct gen_device_info *devinfo, brw_inst *dst,
+                       brw_compact_inst *src)
+{
+   const uint16_t bits =
+      subreg_table[brw_compact_inst_subreg_index(devinfo, src)];
+
+   /* Top group -> bits 100:96, middle -> 68:64, bottom -> 52:48. */
+   brw_inst_set_bits(dst, 100, 96, (bits >> 10));
+   brw_inst_set_bits(dst,  68, 64, (bits >>  5) & 0x1f);
+   brw_inst_set_bits(dst,  52, 48, (bits >>  0) & 0x1f);
+}
+
+/* Looks up the src0 index of the compacted instruction src in
+ * src_index_table and writes the entry into bits 88:77 of dst.
+ */
+static void
+set_uncompacted_src0(const struct gen_device_info *devinfo, brw_inst *dst,
+                     brw_compact_inst *src)
+{
+   const uint32_t index = brw_compact_inst_src0_index(devinfo, src);
+   const uint16_t src0_bits = src_index_table[index];
+
+   brw_inst_set_bits(dst, 88, 77, src0_bits);
+}
+
+/* Restores the src1 portion of dst from the compacted instruction src.
+ *
+ * For a register source, the src1 index is looked up in src_index_table and
+ * the entry is written to bits 120:109 of dst.
+ *
+ * For an immediate source, the 5-bit src1 index instead holds bits 12:8 of
+ * the immediate; its top bit is replicated through bits 31:13.  Bits 7:0
+ * of the immediate are left clear here.
+ */
+static void
+set_uncompacted_src1(const struct gen_device_info *devinfo, brw_inst *dst,
+                     brw_compact_inst *src, bool is_immediate)
+{
+   if (is_immediate) {
+      /* Sign-extend in unsigned arithmetic.  Shifting a signed int left
+       * into its sign bit is undefined in C, and right-shifting a negative
+       * value is implementation-defined, so do not rely on either.
+       */
+      const uint32_t high5 = brw_compact_inst_src1_index(devinfo, src) & 0x1f;
+      uint32_t imm = high5 << 8;
+
+      /* Replicate top bit of src1_index into high 20 bits of the immediate. */
+      if (high5 & 0x10)
+         imm |= 0xfffff000;
+
+      brw_inst_set_imm_ud(devinfo, dst, imm);
+   } else {
+      uint16_t uncompacted =
+         src_index_table[brw_compact_inst_src1_index(devinfo, src)];
+
+      brw_inst_set_bits(dst, 120, 109, uncompacted);
+   }
+}
+
+/* Looks up the 3-source control index of the compacted instruction src in
+ * gen8_3src_control_index_table and scatters the resulting bits into dst.
+ * Only valid for gen >= 8.
+ */
+static void
+set_uncompacted_3src_control_index(const struct gen_device_info *devinfo,
+                                   brw_inst *dst, brw_compact_inst *src)
+{
+   assert(devinfo->gen >= 8);
+
+   const uint32_t index = brw_compact_inst_3src_control_index(devinfo, src);
+   const uint32_t bits = gen8_3src_control_index_table[index];
+
+   brw_inst_set_bits(dst, 34, 32, (bits >> 21) & 0x7);
+   brw_inst_set_bits(dst, 28,  8, (bits >>  0) & 0x1fffff);
+
+   /* Gen9+ and Cherryview additionally take two bits from positions 25:24
+    * of the table entry.
+    */
+   const bool has_extra_bits = devinfo->gen >= 9 || devinfo->is_cherryview;
+   if (has_extra_bits)
+      brw_inst_set_bits(dst, 36, 35, (bits >> 24) & 0x3);
+}
+
+/* Looks up the 3-source source index of the compacted instruction src in
+ * gen8_3src_source_index_table and scatters the resulting bits into dst.
+ * Only valid for gen >= 8.
+ */
+static void
+set_uncompacted_3src_source_index(const struct gen_device_info *devinfo,
+                                  brw_inst *dst, brw_compact_inst *src)
+{
+   assert(devinfo->gen >= 8);
+
+   const uint32_t index = brw_compact_inst_3src_source_index(devinfo, src);
+   const uint64_t bits = gen8_3src_source_index_table[index];
+
+   /* Fields common to every supported generation. */
+   brw_inst_set_bits(dst,  83,  83, (bits >> 43) & 0x1);
+   brw_inst_set_bits(dst, 114, 107, (bits >> 35) & 0xff);
+   brw_inst_set_bits(dst,  93,  86, (bits >> 27) & 0xff);
+   brw_inst_set_bits(dst,  72,  65, (bits >> 19) & 0xff);
+   brw_inst_set_bits(dst,  55,  37, (bits >>  0) & 0x7ffff);
+
+   /* The bits above bit 43 are laid out differently: Gen9+/Cherryview use
+    * wider fields than the remaining Gen8 parts.
+    */
+   const bool wide_layout = devinfo->gen >= 9 || devinfo->is_cherryview;
+   if (!wide_layout) {
+      brw_inst_set_bits(dst, 125, 125, (bits >> 45) & 0x1);
+      brw_inst_set_bits(dst, 104, 104, (bits >> 44) & 0x1);
+   } else {
+      brw_inst_set_bits(dst, 126, 125, (bits >> 47) & 0x3);
+      brw_inst_set_bits(dst, 105, 104, (bits >> 45) & 0x3);
+      brw_inst_set_bits(dst,  84,  84, (bits >> 44) & 0x1);
+   }
+}
+
+/* Expands the compacted three-source instruction src into dst.
+ *
+ * Only valid for gen >= 8.  Fields are written on top of whatever dst
+ * already holds; the visible caller zeroes dst first.
+ */
+static void
+brw_uncompact_3src_instruction(const struct gen_device_info *devinfo,
+                               brw_inst *dst, brw_compact_inst *src)
+{
+   assert(devinfo->gen >= 8);
+
+/* Copies one 3-source field from the compact encoding to the full one. */
+#define copy_3src_field(field) \
+   brw_inst_set_3src_##field(devinfo, dst, \
+                             brw_compact_inst_3src_##field(devinfo, src))
+
+   copy_3src_field(opcode);
+
+   /* Table-driven fields. */
+   set_uncompacted_3src_control_index(devinfo, dst, src);
+   set_uncompacted_3src_source_index(devinfo, dst, src);
+
+   copy_3src_field(dst_reg_nr);
+   copy_3src_field(src0_rep_ctrl);
+
+   /* The result is a full-size instruction, so clear the compact flag. */
+   brw_inst_set_3src_cmpt_control(devinfo, dst, false);
+
+   copy_3src_field(debug_control);
+   copy_3src_field(saturate);
+   copy_3src_field(src1_rep_ctrl);
+   copy_3src_field(src2_rep_ctrl);
+   copy_3src_field(src0_reg_nr);
+   copy_3src_field(src1_reg_nr);
+   copy_3src_field(src2_reg_nr);
+   copy_3src_field(src0_subreg_nr);
+   copy_3src_field(src1_subreg_nr);
+   copy_3src_field(src2_subreg_nr);
+
+#undef copy_3src_field
+}
+
+/**
+ * Expands the compacted instruction src into the full-size instruction dst.
+ *
+ * dst is zeroed first and then rebuilt from the compact fields and the
+ * compaction tables.  On gen >= 8, three-source opcodes are handed to
+ * brw_uncompact_3src_instruction().
+ */
+void
+brw_uncompact_instruction(const struct gen_device_info *devinfo, brw_inst *dst,
+                          brw_compact_inst *src)
+{
+   memset(dst, 0, sizeof(*dst));
+
+   if (devinfo->gen >= 8 &&
+       is_3src(devinfo, brw_compact_inst_3src_opcode(devinfo, src))) {
+      brw_uncompact_3src_instruction(devinfo, dst, src);
+      return;
+   }
+
+/* Copies one field from the compact encoding to the full one. */
+#define copy_field(field) \
+   brw_inst_set_##field(devinfo, dst, brw_compact_inst_##field(devinfo, src))
+
+   copy_field(opcode);
+   copy_field(debug_control);
+
+   set_uncompacted_control(devinfo, dst, src);
+   set_uncompacted_datatype(devinfo, dst, src);
+
+   /* The src0/1 register file fields come from the datatype table, so they
+    * can only be read back from dst once the datatype bits are in place.
+    */
+   const bool has_immediate =
+      brw_inst_src0_reg_file(devinfo, dst) == BRW_IMMEDIATE_VALUE ||
+      brw_inst_src1_reg_file(devinfo, dst) == BRW_IMMEDIATE_VALUE;
+
+   set_uncompacted_subreg(devinfo, dst, src);
+
+   if (devinfo->gen < 6) {
+      copy_field(mask_control_ex);
+   } else {
+      copy_field(acc_wr_control);
+   }
+
+   copy_field(cond_modifier);
+
+   if (devinfo->gen <= 6)
+      copy_field(flag_subreg_nr);
+
+   set_uncompacted_src0(devinfo, dst, src);
+   set_uncompacted_src1(devinfo, dst, src, has_immediate);
+
+   brw_inst_set_dst_da_reg_nr(devinfo, dst,
+                              brw_compact_inst_dst_reg_nr(devinfo, src));
+   brw_inst_set_src0_da_reg_nr(devinfo, dst,
+                               brw_compact_inst_src0_reg_nr(devinfo, src));
+
+   if (!has_immediate) {
+      brw_inst_set_src1_da_reg_nr(devinfo, dst,
+                                  brw_compact_inst_src1_reg_nr(devinfo, src));
+   } else {
+      /* set_uncompacted_src1() already stored the upper part of the
+       * immediate; the compact src1 register number field supplies the
+       * remaining low bits.
+       */
+      brw_inst_set_imm_ud(devinfo, dst,
+                          brw_inst_imm_ud(devinfo, dst) |
+                          brw_compact_inst_src1_reg_nr(devinfo, src));
+   }
+
+#undef copy_field
+}
+
+/**
+ * Reports on stderr that compacting and then uncompacting an instruction
+ * did not reproduce the original.
+ *
+ * Prints the generation, the disassembly of orig and of uncompacted, and
+ * then one line for each of the 128 instruction bits that differs between
+ * the two.
+ */
+void brw_debug_compact_uncompact(const struct gen_device_info *devinfo,
+                                 brw_inst *orig,
+                                 brw_inst *uncompacted)
+{
+   fprintf(stderr, "Instruction compact/uncompact changed (gen%d):\n",
+           devinfo->gen);
+
+   fprintf(stderr, " before: ");
+   brw_disassemble_inst(stderr, devinfo, orig, true);
+
+   fprintf(stderr, " after: ");
+   brw_disassemble_inst(stderr, devinfo, uncompacted, false);
+
+   /* View both instructions as four 32-bit words for the bitwise diff. */
+   uint32_t *before_bits = (uint32_t *)orig;
+   uint32_t *after_bits = (uint32_t *)uncompacted;
+   fprintf(stderr, " changed bits:\n");
+   for (int i = 0; i < 128; i++) {
+      /* Use an unsigned constant: shifting a signed 1 left by 31 overflows
+       * int, which is undefined behavior.
+       */
+      const uint32_t bit = 1u << (i & 31);
+      uint32_t before = before_bits[i / 32] & bit;
+      uint32_t after = after_bits[i / 32] & bit;
+
+      if (before != after) {
+         fprintf(stderr, " bit %d, %s to %s\n", i,
+                 before ? "set" : "unset",
+                 after ? "set" : "unset");
+      }
+   }
+}
+
+/* Returns the difference between the compacted_counts entries recorded for
+ * old_target_ip and old_ip.  The result is negative when the target's
+ * entry is the smaller of the two.
+ */
+static int
+compacted_between(int old_ip, int old_target_ip, int *compacted_counts)
+{
+   return compacted_counts[old_target_ip] - compacted_counts[old_ip];
+}
+
+/* Rewrites the JIP field of insn, and the UIP field where the opcode has
+ * one, to account for instructions compacted between insn and its target.
+ *
+ * this_old_ip is the pre-compaction IP of insn and indexes
+ * compacted_counts.
+ */
+static void
+update_uip_jip(const struct gen_device_info *devinfo, brw_inst *insn,
+               int this_old_ip, int *compacted_counts)
+{
+   /* JIP and UIP are in units of:
+    *    - bytes on Gen8+; and
+    *    - compacted instructions on Gen6+.
+    * Shifting by 3 converts the Gen8+ byte offsets to compacted-instruction
+    * units and back.
+    */
+   const int shift = devinfo->gen >= 8 ? 3 : 0;
+
+   int32_t jip = brw_inst_jip(devinfo, insn) >> shift;
+   const int jip_target_old_ip = this_old_ip + (jip / 2);
+   jip -= compacted_between(this_old_ip, jip_target_old_ip, compacted_counts);
+   brw_inst_set_jip(devinfo, insn, jip << shift);
+
+   /* ENDIF and WHILE, and ELSE up to Gen7, get no UIP update. */
+   switch (brw_inst_opcode(devinfo, insn)) {
+   case BRW_OPCODE_ENDIF:
+   case BRW_OPCODE_WHILE:
+      return;
+   case BRW_OPCODE_ELSE:
+      if (devinfo->gen <= 7)
+         return;
+      break;
+   default:
+      break;
+   }
+
+   int32_t uip = brw_inst_uip(devinfo, insn) >> shift;
+   const int uip_target_old_ip = this_old_ip + (uip / 2);
+   uip -= compacted_between(this_old_ip, uip_target_old_ip, compacted_counts);
+   brw_inst_set_uip(devinfo, insn, uip << shift);
+}
+
+/* Rewrites the Gen4-style jump count of insn to account for instructions
+ * compacted between insn and its target.  Only valid on Gen5 and G45.
+ *
+ * this_old_ip is the pre-compaction IP of insn and indexes
+ * compacted_counts.
+ */
+static void
+update_gen4_jump_count(const struct gen_device_info *devinfo, brw_inst *insn,
+                       int this_old_ip, int *compacted_counts)
+{
+   assert(devinfo->gen == 5 || devinfo->is_g4x);
+
+   /* Jump Count is in units of:
+    *    - uncompacted instructions on G45; and
+    *    - compacted instructions on Gen5.
+    * The G45 value is doubled for the adjustment and halved again before
+    * it is stored.
+    */
+   const int shift = devinfo->is_g4x ? 1 : 0;
+
+   int jump_count = brw_inst_gen4_jump_count(devinfo, insn) << shift;
+
+   /* Two compacted-size units make up one uncompacted instruction. */
+   const int target_old_ip = this_old_ip + (jump_count / 2);
+
+   const int count_here = compacted_counts[this_old_ip];
+   const int count_at_target = compacted_counts[target_old_ip];
+
+   jump_count -= (count_at_target - count_here);
+   brw_inst_set_gen4_jump_count(devinfo, insn, jump_count >> shift);
+}
+
+void
+brw_init_compaction_tables(const struct gen_device_info *devinfo)
+{
+ assert(g45_control_index_table[ARRAY_SIZE(g45_control_index_table) - 1] != 0);
+ assert(g45_datatype_table[ARRAY_SIZE(g45_datatype_table) - 1] != 0);
+ assert(g45_subreg_table[ARRAY_SIZE(g45_subreg_table) - 1] != 0);
+ assert(g45_src_index_table[ARRAY_SIZE(g45_src_index_table) - 1] != 0);
+ assert(gen6_control_index_table[ARRAY_SIZE(gen6_control_index_table) - 1] != 0);
+ assert(gen6_datatype_table[ARRAY_SIZE(gen6_datatype_table) - 1] != 0);
+ assert(gen6_subreg_table[ARRAY_SIZE(gen6_subreg_table) - 1] != 0);
+ assert(gen6_src_index_table[ARRAY_SIZE(gen6_src_index_table) - 1] != 0);
+ assert(gen7_control_index_table[ARRAY_SIZE(gen7_control_index_table) - 1] != 0);
+ assert(gen7_datatype_table[ARRAY_SIZE(gen7_datatype_table) - 1] != 0);
+ assert(gen7_subreg_table[ARRAY_SIZE(gen7_subreg_table) - 1] != 0);
+ assert(gen7_src_index_table[ARRAY_SIZE(gen7_src_index_table) - 1] != 0);
+ assert(gen8_control_index_table[ARRAY_SIZE(gen8_control_index_table) - 1] != 0);
+ assert(gen8_datatype_table[ARRAY_SIZE(gen8_datatype_table) - 1] != 0);
+ assert(gen8_subreg_table[ARRAY_SIZE(gen8_subreg_table) - 1] != 0);
+ assert(gen8_src_index_table[ARRAY_SIZE(gen8_src_index_table) - 1] != 0);
+
+ switch (devinfo->gen) {
+ case 9:
+ case 8:
+ control_index_table = gen8_control_index_table;
+ datatype_table = gen8_datatype_table;
+ subreg_table = gen8_subreg_table;
+ src_index_table = gen8_src_index_table;
+ break;
+ case 7:
+ control_index_table = gen7_control_index_table;
+ datatype_table = gen7_datatype_table;
+ subreg_table = gen7_subreg_table;
+ src_index_table = gen7_src_index_table;
+ break;
+ case 6:
+ control_index_table = gen6_control_index_table;
+ datatype_table = gen6_datatype_table;
+ subreg_table = gen6_subreg_table;
+ src_index_table = gen6_src_index_table;
+ break;
+ case 5:
+ case 4:
+ control_index_table = g45_control_index_table;
+ datatype_table = g45_datatype_table;
+ subreg_table = g45_subreg_table;
+ src_index_table = g45_src_index_table;
+ break;
+ default:
+ unreachable("unknown generation");
+ }
+}
+
+/**
+ * Compacts, in place, the instructions stored in p from byte offset
+ * start_offset up to p->next_insn_offset.
+ *
+ * The pass has four steps:
+ *  1. Walk the instructions, replacing each one that can be compacted with
+ *     its 8-byte form and sliding the others down.
+ *  2. Fix up jump offsets of control-flow instructions (and ADDs that write
+ *     the IP register) to account for the instructions that shrank.
+ *  3. Pad the end with a compacted NOP if the new size is not a multiple of
+ *     a full instruction, then update p->next_insn_offset and p->nr_insn.
+ *  4. If annotation is non-NULL, remap each annotation offset.
+ *
+ * Does nothing when DEBUG_NO_COMPACTION is set in INTEL_DEBUG, or on Gen4
+ * parts that are not G4x.
+ *
+ * annotation, when given, must have num_annotations + 1 entries: the extra
+ * last entry receives the new end offset.
+ */
+void
+brw_compact_instructions(struct brw_codegen *p, int start_offset,
+ int num_annotations, struct annotation *annotation)
+{
+ if (unlikely(INTEL_DEBUG & DEBUG_NO_COMPACTION))
+ return;
+
+ const struct gen_device_info *devinfo = p->devinfo;
+ /* start_offset is in bytes; presumably p->store is an array of 16-byte
+ * instructions, hence the division -- confirm against struct brw_codegen.
+ */
+ void *store = p->store + start_offset / 16;
+ /* For an instruction at byte offset 16*i before compaction, this is the
+ * number of compacted instructions minus the number of padding NOP/NENOPs
+ * that preceded it.
+ */
+ int compacted_counts[(p->next_insn_offset - start_offset) / sizeof(brw_inst)];
+ /* For an instruction at byte offset 8*i after compaction, this was its IP
+ * (in 16-byte units) before compaction.
+ */
+ int old_ip[(p->next_insn_offset - start_offset) / sizeof(brw_compact_inst)];
+
+ /* Gen4 parts other than G4x are left uncompacted. */
+ if (devinfo->gen == 4 && !devinfo->is_g4x)
+ return;
+
+ /* Step 1: src_offset walks the original instructions; offset is where the
+ * next output instruction is written. Both are bytes relative to store.
+ */
+ int offset = 0;
+ int compacted_count = 0;
+ for (int src_offset = 0; src_offset < p->next_insn_offset - start_offset;
+ src_offset += sizeof(brw_inst)) {
+ brw_inst *src = store + src_offset;
+ void *dst = store + offset;
+
+ old_ip[offset / sizeof(brw_compact_inst)] = src_offset / sizeof(brw_inst);
+ compacted_counts[src_offset / sizeof(brw_inst)] = compacted_count;
+
+ /* dst can alias src (when offset == src_offset), so keep a copy of the
+ * original for the debug round-trip check below.
+ */
+ brw_inst saved = *src;
+
+ if (brw_try_compact_instruction(devinfo, dst, src)) {
+ compacted_count++;
+
+ /* In debug runs, verify that uncompacting gives back the original. */
+ if (INTEL_DEBUG) {
+ brw_inst uncompacted;
+ brw_uncompact_instruction(devinfo, &uncompacted, dst);
+ if (memcmp(&saved, &uncompacted, sizeof(uncompacted))) {
+ brw_debug_compact_uncompact(devinfo, &saved, &uncompacted);
+ }
+ }
+
+ offset += sizeof(brw_compact_inst);
+ } else {
+ /* All uncompacted instructions need to be aligned on G45. */
+ if ((offset & sizeof(brw_compact_inst)) != 0 && devinfo->is_g4x){
+ /* Insert a compacted NENOP as padding. It is not a real compacted
+ * instruction, so it is subtracted from the running count and the
+ * bookkeeping for this source instruction is redone.
+ */
+ brw_compact_inst *align = store + offset;
+ memset(align, 0, sizeof(*align));
+ brw_compact_inst_set_opcode(devinfo, align, BRW_OPCODE_NENOP);
+ brw_compact_inst_set_cmpt_control(devinfo, align, true);
+ offset += sizeof(brw_compact_inst);
+ compacted_count--;
+ compacted_counts[src_offset / sizeof(brw_inst)] = compacted_count;
+ old_ip[offset / sizeof(brw_compact_inst)] = src_offset / sizeof(brw_inst);
+
+ dst = store + offset;
+ }
+
+ /* If we didn't compact this instruction, we need to move it down into
+ * place.
+ */
+ if (offset != src_offset) {
+ memmove(dst, src, sizeof(brw_inst));
+ }
+ offset += sizeof(brw_inst);
+ }
+ }
+
+ /* Fix up control flow offsets. */
+ p->next_insn_offset = start_offset + offset;
+ for (offset = 0; offset < p->next_insn_offset - start_offset;
+ offset = next_offset(devinfo, store, offset)) {
+ brw_inst *insn = store + offset;
+ int this_old_ip = old_ip[offset / sizeof(brw_compact_inst)];
+ int this_compacted_count = compacted_counts[this_old_ip];
+
+ switch (brw_inst_opcode(devinfo, insn)) {
+ case BRW_OPCODE_BREAK:
+ case BRW_OPCODE_CONTINUE:
+ case BRW_OPCODE_HALT:
+ if (devinfo->gen >= 6) {
+ update_uip_jip(devinfo, insn, this_old_ip, compacted_counts);
+ } else {
+ update_gen4_jump_count(devinfo, insn, this_old_ip,
+ compacted_counts);
+ }
+ break;
+
+ case BRW_OPCODE_IF:
+ case BRW_OPCODE_IFF:
+ case BRW_OPCODE_ELSE:
+ case BRW_OPCODE_ENDIF:
+ case BRW_OPCODE_WHILE:
+ if (devinfo->gen >= 7) {
+ if (brw_inst_cmpt_control(devinfo, insn)) {
+ /* The instruction was compacted: expand it, patch the offsets
+ * in the full-size copy, then compact it again in place. The
+ * recompaction is asserted to succeed.
+ */
+ brw_inst uncompacted;
+ brw_uncompact_instruction(devinfo, &uncompacted,
+ (brw_compact_inst *)insn);
+
+ update_uip_jip(devinfo, &uncompacted, this_old_ip,
+ compacted_counts);
+
+ bool ret = brw_try_compact_instruction(devinfo,
+ (brw_compact_inst *)insn,
+ &uncompacted);
+ assert(ret); (void)ret;
+ } else {
+ update_uip_jip(devinfo, insn, this_old_ip, compacted_counts);
+ }
+ } else if (devinfo->gen == 6) {
+ assert(!brw_inst_cmpt_control(devinfo, insn));
+
+ /* Jump Count is in units of compacted instructions on Gen6. */
+ int jump_count_compacted = brw_inst_gen6_jump_count(devinfo, insn);
+
+ int target_old_ip = this_old_ip + (jump_count_compacted / 2);
+ int target_compacted_count = compacted_counts[target_old_ip];
+ jump_count_compacted -= (target_compacted_count - this_compacted_count);
+ brw_inst_set_gen6_jump_count(devinfo, insn, jump_count_compacted);
+ } else {
+ update_gen4_jump_count(devinfo, insn, this_old_ip,
+ compacted_counts);
+ }
+ break;
+
+ case BRW_OPCODE_ADD:
+ /* Add instructions modifying the IP register use an immediate src1,
+ * and Gens that use this cannot compact instructions with immediate
+ * operands.
+ */
+ if (brw_inst_cmpt_control(devinfo, insn))
+ break;
+
+ if (brw_inst_dst_reg_file(devinfo, insn) == BRW_ARCHITECTURE_REGISTER_FILE &&
+ brw_inst_dst_da_reg_nr(devinfo, insn) == BRW_ARF_IP) {
+ assert(brw_inst_src1_reg_file(devinfo, insn) == BRW_IMMEDIATE_VALUE);
+
+ /* The immediate is shifted right by 3 before the adjustment and
+ * left by 3 afterwards; presumably it is a byte offset and the
+ * adjustment is in 8-byte units -- confirm.
+ */
+ int shift = 3;
+ int jump_compacted = brw_inst_imm_d(devinfo, insn) >> shift;
+
+ int target_old_ip = this_old_ip + (jump_compacted / 2);
+ int target_compacted_count = compacted_counts[target_old_ip];
+ jump_compacted -= (target_compacted_count - this_compacted_count);
+ brw_inst_set_imm_ud(devinfo, insn, jump_compacted << shift);
+ }
+ break;
+ }
+ }
+
+ /* p->nr_insn is counting the number of uncompacted instructions still, so
+ * divide. We do want to be sure there's a valid instruction in any
+ * alignment padding, so that the next compression pass (for the FS 8/16
+ * compile passes) parses correctly.
+ */
+ if (p->next_insn_offset & sizeof(brw_compact_inst)) {
+ /* The fix-up loop above leaves offset at the end of the compacted
+ * stream, which is where the padding NOP is written.
+ */
+ brw_compact_inst *align = store + offset;
+ memset(align, 0, sizeof(*align));
+ brw_compact_inst_set_opcode(devinfo, align, BRW_OPCODE_NOP);
+ brw_compact_inst_set_cmpt_control(devinfo, align, true);
+ p->next_insn_offset += sizeof(brw_compact_inst);
+ }
+ p->nr_insn = p->next_insn_offset / sizeof(brw_inst);
+
+ /* Update the instruction offsets for each annotation. */
+ if (annotation) {
+ /* This offset shadows the outer one. The asserts require annotation
+ * offsets to be in increasing order, so one forward scan of the
+ * compacted stream serves all of them.
+ */
+ for (int offset = 0, i = 0; i < num_annotations; i++) {
+ while (start_offset + old_ip[offset / sizeof(brw_compact_inst)] *
+ sizeof(brw_inst) != annotation[i].offset) {
+ assert(start_offset + old_ip[offset / sizeof(brw_compact_inst)] *
+ sizeof(brw_inst) < annotation[i].offset);
+ offset = next_offset(devinfo, store, offset);
+ }
+
+ annotation[i].offset = start_offset + offset;
+
+ offset = next_offset(devinfo, store, offset);
+ }
+
+ /* The entry one past the last annotation records the new end offset. */
+ annotation[num_annotations].offset = p->next_insn_offset;
+ }
+}
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
new file mode 100644
index 00000000000..5848f920448
--- /dev/null
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -0,0 +1,1246 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <[email protected]>
+ */
+
+#ifndef BRW_EU_DEFINES_H
+#define BRW_EU_DEFINES_H
+
+#include "util/macros.h"
+
+/* The following hunk, up-to "Execution Unit" is used by both the
+ * intel/compiler and i965 codebase. */
+
+/* Mask with bits high..low (inclusive) set, for a 32-bit word.
+ *
+ * Built by shifting an all-ones word right rather than computing
+ * (1u << width) - 1, so that a full-width field (high == 31, low == 0)
+ * does not shift by 32, which is undefined.
+ */
+#define INTEL_MASK(high, low) ((~0u >> (31 - ((high) - (low)))) << (low))
+/* Shifts value into the position of the named field and returns it masked.
+ * Expects field##_SHIFT and field##_MASK to be defined, and asserts that
+ * the shifted value fits inside the mask.
+ *
+ * Using the GNU statement expression extension
+ */
+#define SET_FIELD(value, field) \
+ ({ \
+ uint32_t fieldval = (value) << field ## _SHIFT; \
+ assert((fieldval & ~ field ## _MASK) == 0); \
+ fieldval & field ## _MASK; \
+ })
+
+/* Extracts bits high..low of data, shifted down to bit 0.  data is
+ * parenthesised so that expressions such as "a | b" are masked as a whole.
+ */
+#define GET_BITS(data, high, low) (((data) & INTEL_MASK((high), (low))) >> (low))
+/* Extracts the named field of word using field##_MASK and field##_SHIFT. */
+#define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT)
+
+/* 3D primitive topology codes. */
+#define _3DPRIM_POINTLIST 0x01
+#define _3DPRIM_LINELIST 0x02
+#define _3DPRIM_LINESTRIP 0x03
+#define _3DPRIM_TRILIST 0x04
+#define _3DPRIM_TRISTRIP 0x05
+#define _3DPRIM_TRIFAN 0x06
+#define _3DPRIM_QUADLIST 0x07
+#define _3DPRIM_QUADSTRIP 0x08
+#define _3DPRIM_LINELIST_ADJ 0x09 /* G45+ */
+#define _3DPRIM_LINESTRIP_ADJ 0x0A /* G45+ */
+#define _3DPRIM_TRILIST_ADJ 0x0B /* G45+ */
+#define _3DPRIM_TRISTRIP_ADJ 0x0C /* G45+ */
+#define _3DPRIM_TRISTRIP_REVERSE 0x0D
+#define _3DPRIM_POLYGON 0x0E
+#define _3DPRIM_RECTLIST 0x0F
+#define _3DPRIM_LINELOOP 0x10
+#define _3DPRIM_POINTLIST_BF 0x11
+#define _3DPRIM_LINESTRIP_CONT 0x12
+#define _3DPRIM_LINESTRIP_BF 0x13
+#define _3DPRIM_LINESTRIP_CONT_BF 0x14
+#define _3DPRIM_TRIFAN_NOSTIPPLE 0x16
+/* Patch list with n control points, 1 <= n <= 32, mapping to 0x20..0x3f.
+ * The argument is parenthesised so that an expression such as "a & b" is
+ * evaluated as a whole.  It is evaluated more than once, so it must not
+ * have side effects.
+ */
+#define _3DPRIM_PATCHLIST(n) ({ assert((n) > 0 && (n) <= 32); 0x20 + ((n) - 1); })
+
+/* Barycentric modes: perspective and non-perspective variants of the
+ * pixel, centroid and sample positions.  The values double as bit
+ * positions in masks such as BRW_BARYCENTRIC_NONPERSPECTIVE_BITS below.
+ */
+enum brw_barycentric_mode {
+ BRW_BARYCENTRIC_PERSPECTIVE_PIXEL = 0,
+ BRW_BARYCENTRIC_PERSPECTIVE_CENTROID = 1,
+ BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE = 2,
+ BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL = 3,
+ BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4,
+ BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE = 5,
+ BRW_BARYCENTRIC_MODE_COUNT = 6 /* number of modes above, not a mode */
+};
+/* Bitmask with one bit set for each of the three non-perspective modes. */
+#define BRW_BARYCENTRIC_NONPERSPECTIVE_BITS \
+ ((1 << BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \
+ (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \
+ (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))
+
+/* Whether the pixel shader (PS) computes depth and, if it does, what it
+ * guarantees about the output depth relative to the source depth.
+ */
+enum brw_pixel_shader_computed_depth_mode {
+ BRW_PSCDEPTH_OFF = 0, /* PS does not compute depth */
+ BRW_PSCDEPTH_ON = 1, /* PS computes depth; no guarantee about value */
+ BRW_PSCDEPTH_ON_GE = 2, /* PS guarantees output depth >= source depth */
+ BRW_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */
+};
+
+/* Bitfields for the URB_WRITE message, DW2 of message header: */
+#define URB_WRITE_PRIM_END 0x1
+#define URB_WRITE_PRIM_START 0x2
+/* NOTE(review): presumably the bit position where the primitive type field
+ * begins, just above the END/START flag bits -- confirm.
+ */
+#define URB_WRITE_PRIM_TYPE_SHIFT 2
+
+/* NOTE(review): presumably the two Gen7 geometry shader control data
+ * formats -- confirm what CUT and SID stand for against the hardware docs.
+ */
+# define GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT 0
+# define GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID 1
+
+/* Execution Unit (EU) defines
+ */
+
+/* The two access (alignment) modes. */
+#define BRW_ALIGN_1 0
+#define BRW_ALIGN_16 1
+
+/* Addressing modes: direct, or register-indirect through a register. */
+#define BRW_ADDRESS_DIRECT 0
+#define BRW_ADDRESS_REGISTER_INDIRECT_REGISTER 1
+
+/* Channel indices, X through W numbered 0 through 3. */
+#define BRW_CHANNEL_X 0
+#define BRW_CHANNEL_Y 1
+#define BRW_CHANNEL_Z 2
+#define BRW_CHANNEL_W 3
+
+/* Compression control values. */
+enum brw_compression {
+ BRW_COMPRESSION_NONE = 0,
+ BRW_COMPRESSION_2NDHALF = 1,
+ BRW_COMPRESSION_COMPRESSED = 2,
+};
+
+/* Gen6 values.  NOTE(review): the 1Q..4Q and 1H/2H names presumably select
+ * quarters and halves -- confirm.  1H shares its value with 1Q (0), and 2H
+ * shares its value with 3Q (2).
+ */
+#define GEN6_COMPRESSION_1Q 0
+#define GEN6_COMPRESSION_2Q 1
+#define GEN6_COMPRESSION_3Q 2
+#define GEN6_COMPRESSION_4Q 3
+#define GEN6_COMPRESSION_1H 0
+#define GEN6_COMPRESSION_2H 2
+
+/* Conditional modifier values.  EQ and NEQ are aliases that share the
+ * values of Z and NZ respectively.
+ */
+enum PACKED brw_conditional_mod {
+ BRW_CONDITIONAL_NONE = 0,
+ BRW_CONDITIONAL_Z = 1,
+ BRW_CONDITIONAL_NZ = 2,
+ BRW_CONDITIONAL_EQ = 1, /* Z */
+ BRW_CONDITIONAL_NEQ = 2, /* NZ */
+ BRW_CONDITIONAL_G = 3,
+ BRW_CONDITIONAL_GE = 4,
+ BRW_CONDITIONAL_L = 5,
+ BRW_CONDITIONAL_LE = 6,
+ BRW_CONDITIONAL_R = 7, /* Gen <= 5 */
+ BRW_CONDITIONAL_O = 8,
+ BRW_CONDITIONAL_U = 9,
+};
+
+/* Debug control values: none, or breakpoint. */
+#define BRW_DEBUG_NONE 0
+#define BRW_DEBUG_BREAKPOINT 1
+
+/* Dependency control values. */
+#define BRW_DEPENDENCY_NORMAL 0
+#define BRW_DEPENDENCY_NOTCLEARED 1
+#define BRW_DEPENDENCY_NOTCHECKED 2
+#define BRW_DEPENDENCY_DISABLE 3
+
+/* Execution size encodings.  Each value is the base-2 logarithm of the
+ * size in the enumerator's name (e.g. BRW_EXECUTE_8 is 3).
+ */
+enum PACKED brw_execution_size {
+ BRW_EXECUTE_1 = 0,
+ BRW_EXECUTE_2 = 1,
+ BRW_EXECUTE_4 = 2,
+ BRW_EXECUTE_8 = 3,
+ BRW_EXECUTE_16 = 4,
+ BRW_EXECUTE_32 = 5,
+};
+
+/* Horizontal stride encodings.  The value is an encoding, not the stride
+ * itself: a stride of 4 is encoded as 3.
+ */
+enum PACKED brw_horizontal_stride {
+ BRW_HORIZONTAL_STRIDE_0 = 0,
+ BRW_HORIZONTAL_STRIDE_1 = 1,
+ BRW_HORIZONTAL_STRIDE_2 = 2,
+ BRW_HORIZONTAL_STRIDE_4 = 3,
+};
+
+/* Saturate control values: normal, or saturate. */
+#define BRW_INSTRUCTION_NORMAL 0
+#define BRW_INSTRUCTION_SATURATE 1
+
+/* Mask control values.  Note the polarity: 0 is enable, 1 is disable. */
+#define BRW_MASK_ENABLE 0
+#define BRW_MASK_DISABLE 1
+
+/** @{
+ *
+ * Gen6 has replaced "mask enable/disable" with WECtrl, which is
+ * effectively the same but much simpler to think about. Now, there
+ * are two contributors ANDed together to whether channels are
+ * executed: The predication on the instruction, and the channel write
+ * enable.
+ */
+/**
+ * This is the default value. It means that a channel's write enable is set
+ * if the per-channel IP is pointing at this instruction.
+ */
+#define BRW_WE_NORMAL 0
+/**
+ * This is used like BRW_MASK_DISABLE, and causes all channels to have
+ * their write enable set. Note that predication still contributes to
+ * whether the channel actually gets written.
+ */
+#define BRW_WE_ALL 1
+/** @} */
+
+enum opcode {
+ /* These are the actual hardware opcodes. */
+ BRW_OPCODE_ILLEGAL = 0,
+ BRW_OPCODE_MOV = 1,
+ BRW_OPCODE_SEL = 2,
+ BRW_OPCODE_MOVI = 3, /**< G45+ */
+ BRW_OPCODE_NOT = 4,
+ BRW_OPCODE_AND = 5,
+ BRW_OPCODE_OR = 6,
+ BRW_OPCODE_XOR = 7,
+ BRW_OPCODE_SHR = 8,
+ BRW_OPCODE_SHL = 9,
+ BRW_OPCODE_DIM = 10, /**< Gen7.5 only */ /* Reused */
+ // BRW_OPCODE_SMOV = 10, /**< Gen8+ */ /* Reused */
+ /* Reserved - 11 */
+ BRW_OPCODE_ASR = 12,
+ /* Reserved - 13-15 */
+ BRW_OPCODE_CMP = 16,
+ BRW_OPCODE_CMPN = 17,
+ BRW_OPCODE_CSEL = 18, /**< Gen8+ */
+ BRW_OPCODE_F32TO16 = 19, /**< Gen7 only */
+ BRW_OPCODE_F16TO32 = 20, /**< Gen7 only */
+ /* Reserved - 21-22 */
+ BRW_OPCODE_BFREV = 23, /**< Gen7+ */
+ BRW_OPCODE_BFE = 24, /**< Gen7+ */
+ BRW_OPCODE_BFI1 = 25, /**< Gen7+ */
+ BRW_OPCODE_BFI2 = 26, /**< Gen7+ */
+ /* Reserved - 27-31 */
+ BRW_OPCODE_JMPI = 32,
+ // BRW_OPCODE_BRD = 33, /**< Gen7+ */
+ BRW_OPCODE_IF = 34,
+ BRW_OPCODE_IFF = 35, /**< Pre-Gen6 */ /* Reused */
+ // BRW_OPCODE_BRC = 35, /**< Gen7+ */ /* Reused */
+ BRW_OPCODE_ELSE = 36,
+ BRW_OPCODE_ENDIF = 37,
+ BRW_OPCODE_DO = 38, /**< Pre-Gen6 */ /* Reused */
+ // BRW_OPCODE_CASE = 38, /**< Gen6 only */ /* Reused */
+ BRW_OPCODE_WHILE = 39,
+ BRW_OPCODE_BREAK = 40,
+ BRW_OPCODE_CONTINUE = 41,
+ BRW_OPCODE_HALT = 42,
+ // BRW_OPCODE_CALLA = 43, /**< Gen7.5+ */
+ // BRW_OPCODE_MSAVE = 44, /**< Pre-Gen6 */ /* Reused */
+ // BRW_OPCODE_CALL = 44, /**< Gen6+ */ /* Reused */
+ // BRW_OPCODE_MREST = 45, /**< Pre-Gen6 */ /* Reused */
+ // BRW_OPCODE_RET = 45, /**< Gen6+ */ /* Reused */
+ // BRW_OPCODE_PUSH = 46, /**< Pre-Gen6 */ /* Reused */
+ // BRW_OPCODE_FORK = 46, /**< Gen6 only */ /* Reused */
+ // BRW_OPCODE_GOTO = 46, /**< Gen8+ */ /* Reused */
+ // BRW_OPCODE_POP = 47, /**< Pre-Gen6 */
+ BRW_OPCODE_WAIT = 48,
+ BRW_OPCODE_SEND = 49,
+ BRW_OPCODE_SENDC = 50,
+ BRW_OPCODE_SENDS = 51, /**< Gen9+ */
+ BRW_OPCODE_SENDSC = 52, /**< Gen9+ */
+ /* Reserved 53-55 */
+ BRW_OPCODE_MATH = 56, /**< Gen6+ */
+ /* Reserved 57-63 */
+ BRW_OPCODE_ADD = 64,
+ BRW_OPCODE_MUL = 65,
+ BRW_OPCODE_AVG = 66,
+ BRW_OPCODE_FRC = 67,
+ BRW_OPCODE_RNDU = 68,
+ BRW_OPCODE_RNDD = 69,
+ BRW_OPCODE_RNDE = 70,
+ BRW_OPCODE_RNDZ = 71,
+ BRW_OPCODE_MAC = 72,
+ BRW_OPCODE_MACH = 73,
+ BRW_OPCODE_LZD = 74,
+ BRW_OPCODE_FBH = 75, /**< Gen7+ */
+ BRW_OPCODE_FBL = 76, /**< Gen7+ */
+ BRW_OPCODE_CBIT = 77, /**< Gen7+ */
+ BRW_OPCODE_ADDC = 78, /**< Gen7+ */
+ BRW_OPCODE_SUBB = 79, /**< Gen7+ */
+ BRW_OPCODE_SAD2 = 80,
+ BRW_OPCODE_SADA2 = 81,
+ /* Reserved 82-83 */
+ BRW_OPCODE_DP4 = 84,
+ BRW_OPCODE_DPH = 85,
+ BRW_OPCODE_DP3 = 86,
+ BRW_OPCODE_DP2 = 87,
+ /* Reserved 88 */
+ BRW_OPCODE_LINE = 89,
+ BRW_OPCODE_PLN = 90, /**< G45+ */
+ BRW_OPCODE_MAD = 91, /**< Gen6+ */
+ BRW_OPCODE_LRP = 92, /**< Gen6+ */
+ // BRW_OPCODE_MADM = 93, /**< Gen8+ */
+ /* Reserved 94-124 */
+ BRW_OPCODE_NENOP = 125, /**< G45 only */
+ BRW_OPCODE_NOP = 126,
+ /* Reserved 127 */
+
+ /* These are compiler backend opcodes that get translated into other
+ * instructions.
+ */
+ FS_OPCODE_FB_WRITE = 128,
+
+ /**
+ * Same as FS_OPCODE_FB_WRITE but expects its arguments separately as
+ * individual sources instead of as a single payload blob. The
+ * position/ordering of the arguments are defined by the enum
+ * fb_write_logical_srcs.
+ */
+ FS_OPCODE_FB_WRITE_LOGICAL,
+
+ FS_OPCODE_REP_FB_WRITE,
+
+ FS_OPCODE_FB_READ,
+ FS_OPCODE_FB_READ_LOGICAL,
+
+ SHADER_OPCODE_RCP,
+ SHADER_OPCODE_RSQ,
+ SHADER_OPCODE_SQRT,
+ SHADER_OPCODE_EXP2,
+ SHADER_OPCODE_LOG2,
+ SHADER_OPCODE_POW,
+ SHADER_OPCODE_INT_QUOTIENT,
+ SHADER_OPCODE_INT_REMAINDER,
+ SHADER_OPCODE_SIN,
+ SHADER_OPCODE_COS,
+
+ /**
+ * Texture sampling opcodes.
+ *
+ * LOGICAL opcodes are eventually translated to the matching non-LOGICAL
+ * opcode but instead of taking a single payload blob they expect their
+ * arguments separately as individual sources. The position/ordering of the
+ * arguments are defined by the enum tex_logical_srcs.
+ */
+ SHADER_OPCODE_TEX,
+ SHADER_OPCODE_TEX_LOGICAL,
+ SHADER_OPCODE_TXD,
+ SHADER_OPCODE_TXD_LOGICAL,
+ SHADER_OPCODE_TXF,
+ SHADER_OPCODE_TXF_LOGICAL,
+ SHADER_OPCODE_TXF_LZ,
+ SHADER_OPCODE_TXL,
+ SHADER_OPCODE_TXL_LOGICAL,
+ SHADER_OPCODE_TXL_LZ,
+ SHADER_OPCODE_TXS,
+ SHADER_OPCODE_TXS_LOGICAL,
+ FS_OPCODE_TXB,
+ FS_OPCODE_TXB_LOGICAL,
+ SHADER_OPCODE_TXF_CMS,
+ SHADER_OPCODE_TXF_CMS_LOGICAL,
+ SHADER_OPCODE_TXF_CMS_W,
+ SHADER_OPCODE_TXF_CMS_W_LOGICAL,
+ SHADER_OPCODE_TXF_UMS,
+ SHADER_OPCODE_TXF_UMS_LOGICAL,
+ SHADER_OPCODE_TXF_MCS,
+ SHADER_OPCODE_TXF_MCS_LOGICAL,
+ SHADER_OPCODE_LOD,
+ SHADER_OPCODE_LOD_LOGICAL,
+ SHADER_OPCODE_TG4,
+ SHADER_OPCODE_TG4_LOGICAL,
+ SHADER_OPCODE_TG4_OFFSET,
+ SHADER_OPCODE_TG4_OFFSET_LOGICAL,
+ SHADER_OPCODE_SAMPLEINFO,
+ SHADER_OPCODE_SAMPLEINFO_LOGICAL,
+
+ /**
+ * Combines multiple sources of size 1 into a larger virtual GRF.
+ * For example, parameters for a send-from-GRF message. Or, updating
+ * channels of a size 4 VGRF used to store vec4s such as texturing results.
+ *
+ * This will be lowered into MOVs from each source to consecutive offsets
+ * of the destination VGRF.
+ *
+ * src[0] may be BAD_FILE. If so, the lowering pass skips emitting the MOV,
+ * but still reserves the first channel of the destination VGRF. This can be
+ * used to reserve space for, say, a message header set up by the generators.
+ */
+ SHADER_OPCODE_LOAD_PAYLOAD,
+
+ /**
+ * Packs a number of sources into a single value. Unlike LOAD_PAYLOAD, this
+ * acts intra-channel, obtaining the final value for each channel by
+ * combining the sources values for the same channel, the first source
+ * occupying the lowest bits and the last source occupying the highest
+ * bits.
+ */
+ FS_OPCODE_PACK,
+
+ SHADER_OPCODE_SHADER_TIME_ADD,
+
+ /**
+ * Typed and untyped surface access opcodes.
+ *
+ * LOGICAL opcodes are eventually translated to the matching non-LOGICAL
+ * opcode but instead of taking a single payload blob they expect their
+ * arguments separately as individual sources:
+ *
+ * Source 0: [required] Surface coordinates.
+ * Source 1: [optional] Operation source.
+ * Source 2: [required] Surface index.
+ * Source 3: [required] Number of coordinate components (as UD immediate).
+ * Source 4: [required] Opcode-specific control immediate, same as source 2
+ * of the matching non-LOGICAL opcode.
+ */
+ SHADER_OPCODE_UNTYPED_ATOMIC,
+ SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
+ SHADER_OPCODE_UNTYPED_SURFACE_READ,
+ SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
+ SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
+ SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
+
+ SHADER_OPCODE_TYPED_ATOMIC,
+ SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
+ SHADER_OPCODE_TYPED_SURFACE_READ,
+ SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
+ SHADER_OPCODE_TYPED_SURFACE_WRITE,
+ SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
+
+ SHADER_OPCODE_MEMORY_FENCE,
+
+ SHADER_OPCODE_GEN4_SCRATCH_READ,
+ SHADER_OPCODE_GEN4_SCRATCH_WRITE,
+ SHADER_OPCODE_GEN7_SCRATCH_READ,
+
+ /**
+ * Gen8+ SIMD8 URB Read messages.
+ */
+ SHADER_OPCODE_URB_READ_SIMD8,
+ SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT,
+
+ SHADER_OPCODE_URB_WRITE_SIMD8,
+ SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT,
+ SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
+ SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT,
+
+ /**
+ * Return the index of an arbitrary live channel (i.e. one of the channels
+ * enabled in the current execution mask) and assign it to the first
+ * component of the destination. Expected to be used as input for the
+ * BROADCAST pseudo-opcode.
+ */
+ SHADER_OPCODE_FIND_LIVE_CHANNEL,
+
+ /**
+ * Pick the channel from its first source register given by the index
+ * specified as second source. Useful for variable indexing of surfaces.
+ *
+ * Note that because the result of this instruction is by definition
+ * uniform and it can always be splatted to multiple channels using a
+ * scalar regioning mode, only the first channel of the destination region
+ * is guaranteed to be updated, which implies that BROADCAST instructions
+ * should usually be marked force_writemask_all.
+ */
+ SHADER_OPCODE_BROADCAST,
+
+ VEC4_OPCODE_MOV_BYTES,
+ VEC4_OPCODE_PACK_BYTES,
+ VEC4_OPCODE_UNPACK_UNIFORM,
+ VEC4_OPCODE_FROM_DOUBLE,
+ VEC4_OPCODE_TO_DOUBLE,
+ VEC4_OPCODE_PICK_LOW_32BIT,
+ VEC4_OPCODE_PICK_HIGH_32BIT,
+ VEC4_OPCODE_SET_LOW_32BIT,
+ VEC4_OPCODE_SET_HIGH_32BIT,
+
+ FS_OPCODE_DDX_COARSE,
+ FS_OPCODE_DDX_FINE,
+ /**
+ * Compute dFdy(), dFdyCoarse(), or dFdyFine().
+ */
+ FS_OPCODE_DDY_COARSE,
+ FS_OPCODE_DDY_FINE,
+ FS_OPCODE_CINTERP,
+ FS_OPCODE_LINTERP,
+ FS_OPCODE_PIXEL_X,
+ FS_OPCODE_PIXEL_Y,
+ FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+ FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7,
+ FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4,
+ FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
+ FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
+ FS_OPCODE_GET_BUFFER_SIZE,
+ FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
+ FS_OPCODE_DISCARD_JUMP,
+ FS_OPCODE_SET_SAMPLE_ID,
+ FS_OPCODE_PACK_HALF_2x16_SPLIT,
+ FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X,
+ FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y,
+ FS_OPCODE_PLACEHOLDER_HALT,
+ FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+ FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
+ FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET,
+
+ VS_OPCODE_URB_WRITE,
+ VS_OPCODE_PULL_CONSTANT_LOAD,
+ VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
+ VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
+
+ VS_OPCODE_GET_BUFFER_SIZE,
+
+ VS_OPCODE_UNPACK_FLAGS_SIMD4X2,
+
+ /**
+ * Write geometry shader output data to the URB.
+ *
+ * Unlike VS_OPCODE_URB_WRITE, this opcode doesn't do an implied move from
+ * R0 to the first MRF. This allows the geometry shader to override the
+ * "Slot {0,1} Offset" fields in the message header.
+ */
+ GS_OPCODE_URB_WRITE,
+
+ /**
+ * Write geometry shader output data to the URB and request a new URB
+ * handle (gen6).
+ *
+ * This opcode doesn't do an implied move from R0 to the first MRF.
+ */
+ GS_OPCODE_URB_WRITE_ALLOCATE,
+
+ /**
+ * Terminate the geometry shader thread by doing an empty URB write.
+ *
+ * This opcode doesn't do an implied move from R0 to the first MRF. This
+ * allows the geometry shader to override the "GS Number of Output Vertices
+ * for Slot {0,1}" fields in the message header.
+ */
+ GS_OPCODE_THREAD_END,
+
+ /**
+ * Set the "Slot {0,1} Offset" fields of a URB_WRITE message header.
+ *
+ * - dst is the MRF containing the message header.
+ *
+ * - src0.x indicates which portion of the URB should be written to (e.g. a
+ * vertex number)
+ *
+ * - src1 is an immediate multiplier which will be applied to src0
+ * (e.g. the size of a single vertex in the URB).
+ *
+ * Note: the hardware will apply this offset *in addition to* the offset in
+ * vec4_instruction::offset.
+ */
+ GS_OPCODE_SET_WRITE_OFFSET,
+
+ /**
+ * Set the "GS Number of Output Vertices for Slot {0,1}" fields of a
+ * URB_WRITE message header.
+ *
+ * - dst is the MRF containing the message header.
+ *
+ * - src0.x is the vertex count. The upper 16 bits will be ignored.
+ */
+ GS_OPCODE_SET_VERTEX_COUNT,
+
+ /**
+ * Set DWORD 2 of dst to the value in src.
+ */
+ GS_OPCODE_SET_DWORD_2,
+
+ /**
+ * Prepare the dst register for storage in the "Channel Mask" fields of a
+ * URB_WRITE message header.
+ *
+ * DWORD 4 of dst is shifted left by 4 bits, so that later,
+ * GS_OPCODE_SET_CHANNEL_MASKS can OR DWORDs 0 and 4 together to form the
+ * final channel mask.
+ *
+ * Note: since GS_OPCODE_SET_CHANNEL_MASKS ORs DWORDs 0 and 4 together to
+ * form the final channel mask, DWORDs 0 and 4 of the dst register must not
+ * have any extraneous bits set prior to execution of this opcode (that is,
+ * they should be in the range 0x0 to 0xf).
+ */
+ GS_OPCODE_PREPARE_CHANNEL_MASKS,
+
+ /**
+ * Set the "Channel Mask" fields of a URB_WRITE message header.
+ *
+ * - dst is the MRF containing the message header.
+ *
+ * - src.x is the channel mask, as prepared by
+ * GS_OPCODE_PREPARE_CHANNEL_MASKS. DWORDs 0 and 4 are OR'ed together to
+ * form the final channel mask.
+ */
+ GS_OPCODE_SET_CHANNEL_MASKS,
+
+ /**
+ * Get the "Instance ID" fields from the payload.
+ *
+ * - dst is the GRF for gl_InvocationID.
+ */
+ GS_OPCODE_GET_INSTANCE_ID,
+
+ /**
+ * Send a FF_SYNC message to allocate initial URB handles (gen6).
+ *
+ * - dst will be used as the writeback register for the FF_SYNC operation.
+ *
+ * - src0 is the number of primitives written.
+ *
+ * - src1 is the value to hold in M0.0: number of SO vertices to write
+ * and number of SO primitives needed. Its value will be overwritten
+ * with the SVBI values if transform feedback is enabled.
+ *
+ * Note: This opcode uses an implicit MRF register for the ff_sync message
+ * header, so the caller is expected to set inst->base_mrf and initialize
+ * that MRF register to r0. This opcode will also write to this MRF register
+ * to include the allocated URB handle so it can then be reused directly as
+ * the header in the URB write operation we are allocating the handle for.
+ */
+ GS_OPCODE_FF_SYNC,
+
+ /**
+ * Move r0.1 (which holds PrimitiveID information in gen6) to a separate
+ * register.
+ *
+ * - dst is the GRF where PrimitiveID information will be moved.
+ */
+ GS_OPCODE_SET_PRIMITIVE_ID,
+
+ /**
+ * Write transform feedback data to the SVB by sending a SVB WRITE message.
+ * Used in gen6.
+ *
+ * - dst is the MRF register containing the message header.
+ *
+ * - src0 is the register where the vertex data is going to be copied from.
+ *
+ * - src1 is the destination register when write commit occurs.
+ */
+ GS_OPCODE_SVB_WRITE,
+
+ /**
+ * Set destination index in the SVB write message payload (M0.5). Used
+ * in gen6 for transform feedback.
+ *
+ * - dst is the header to save the destination indices for SVB WRITE.
+ * - src is the register that holds the destination indices value.
+ */
+ GS_OPCODE_SVB_SET_DST_INDEX,
+
+ /**
+ * Prepare Mx.0 subregister for being used in the FF_SYNC message header.
+ * Used in gen6 for transform feedback.
+ *
+ * - dst will hold the register with the final Mx.0 value.
+ *
+ * - src0 has the number of vertices emitted in SO (NumSOVertsToWrite)
+ *
+ * - src1 has the number of needed primitives for SO (NumSOPrimsNeeded)
+ *
+ * - src2 is the value to hold in M0: number of SO vertices to write
+ * and number of SO primitives needed.
+ */
+ GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
+
+ /**
+ * Terminate the compute shader.
+ */
+ CS_OPCODE_CS_TERMINATE,
+
+ /**
+ * GLSL barrier()
+ */
+ SHADER_OPCODE_BARRIER,
+
+ /**
+ * Calculate the high 32-bits of a 32x32 multiply.
+ */
+ SHADER_OPCODE_MULH,
+
+ /**
+ * A MOV that uses VxH indirect addressing.
+ *
+ * Source 0: A register to start from (HW_REG).
+ * Source 1: An indirect offset (in bytes, UD GRF).
+ * Source 2: The length of the region that could be accessed (in bytes,
+ * UD immediate).
+ */
+ SHADER_OPCODE_MOV_INDIRECT,
+
+ VEC4_OPCODE_URB_READ,
+ TCS_OPCODE_GET_INSTANCE_ID,
+ TCS_OPCODE_URB_WRITE,
+ TCS_OPCODE_SET_INPUT_URB_OFFSETS,
+ TCS_OPCODE_SET_OUTPUT_URB_OFFSETS,
+ TCS_OPCODE_GET_PRIMITIVE_ID,
+ TCS_OPCODE_CREATE_BARRIER_HEADER,
+ TCS_OPCODE_SRC0_010_IS_ZERO,
+ TCS_OPCODE_RELEASE_INPUT,
+ TCS_OPCODE_THREAD_END,
+
+ TES_OPCODE_GET_PRIMITIVE_ID,
+ TES_OPCODE_CREATE_INPUT_READ_HEADER,
+ TES_OPCODE_ADD_INDIRECT_URB_OFFSET,
+};
+
+enum brw_urb_write_flags { /* bit flags: each primitive flag below is a distinct bit, so they can be ORed */
+ BRW_URB_WRITE_NO_FLAGS = 0,
+
+ /**
+ * Causes a new URB entry to be allocated, and its address stored in the
+ * destination register (gen < 7).
+ */
+ BRW_URB_WRITE_ALLOCATE = 0x1, /* bit 0 */
+
+ /**
+ * Causes the current URB entry to be deallocated (gen < 7).
+ */
+ BRW_URB_WRITE_UNUSED = 0x2, /* bit 1 */
+
+ /**
+ * Causes the thread to terminate.
+ */
+ BRW_URB_WRITE_EOT = 0x4, /* bit 2 */
+
+ /**
+ * Indicates that the given URB entry is complete, and may be sent further
+ * down the 3D pipeline (gen < 7).
+ */
+ BRW_URB_WRITE_COMPLETE = 0x8, /* bit 3 */
+
+ /**
+ * Indicates that an additional offset (which may be different for the two
+ * vec4 slots) is stored in the message header (gen == 7).
+ */
+ BRW_URB_WRITE_PER_SLOT_OFFSET = 0x10, /* bit 4 */
+
+ /**
+ * Indicates that the channel masks in the URB_WRITE message header should
+ * not be overridden to 0xff (gen == 7).
+ */
+ BRW_URB_WRITE_USE_CHANNEL_MASKS = 0x20, /* bit 5 */
+
+ /**
+ * Indicates that the data should be sent to the URB using the
+ * URB_WRITE_OWORD message rather than URB_WRITE_HWORD (gen == 7). This
+ * causes offsets to be interpreted as multiples of an OWORD instead of an
+ * HWORD, and only allows one OWORD to be written.
+ */
+ BRW_URB_WRITE_OWORD = 0x40, /* bit 6 */
+
+ /**
+ * Convenient combination of flags: end the thread while simultaneously
+ * marking the given URB entry as complete.
+ */
+ BRW_URB_WRITE_EOT_COMPLETE = BRW_URB_WRITE_EOT | BRW_URB_WRITE_COMPLETE, /* 0x4 | 0x8 = 0xc */
+
+ /**
+ * Convenient combination of flags: mark the given URB entry as complete
+ * and simultaneously allocate a new one.
+ */
+ BRW_URB_WRITE_ALLOCATE_COMPLETE =
+ BRW_URB_WRITE_ALLOCATE | BRW_URB_WRITE_COMPLETE, /* 0x1 | 0x8 = 0x9 */
+};
+
+enum fb_write_logical_srcs { /* source indices; enumerators are implicitly numbered from 0 */
+ FB_WRITE_LOGICAL_SRC_COLOR0, /* REQUIRED */
+ FB_WRITE_LOGICAL_SRC_COLOR1, /* for dual source blend messages */
+ FB_WRITE_LOGICAL_SRC_SRC0_ALPHA, /* NOTE(review): presumably the alpha of color 0 -- confirm */
+ FB_WRITE_LOGICAL_SRC_SRC_DEPTH, /* gl_FragDepth */
+ FB_WRITE_LOGICAL_SRC_DST_DEPTH, /* GEN4-5: passthrough from thread */
+ FB_WRITE_LOGICAL_SRC_SRC_STENCIL, /* gl_FragStencilRefARB */
+ FB_WRITE_LOGICAL_SRC_OMASK, /* Sample Mask (gl_SampleMask) */
+ FB_WRITE_LOGICAL_SRC_COMPONENTS, /* REQUIRED */
+ FB_WRITE_LOGICAL_NUM_SRCS /* number of sources listed above (8); keep last */
+};
+
+enum tex_logical_srcs { /* source indices; enumerators are implicitly numbered from 0 */
+ /** Texture coordinates */
+ TEX_LOGICAL_SRC_COORDINATE,
+ /** Shadow comparator */
+ TEX_LOGICAL_SRC_SHADOW_C,
+ /** dPdx if the operation takes explicit derivatives, otherwise LOD value */
+ TEX_LOGICAL_SRC_LOD,
+ /** dPdy if the operation takes explicit derivatives */
+ TEX_LOGICAL_SRC_LOD2,
+ /** Sample index */
+ TEX_LOGICAL_SRC_SAMPLE_INDEX,
+ /** MCS data */
+ TEX_LOGICAL_SRC_MCS,
+ /** REQUIRED: Texture surface index */
+ TEX_LOGICAL_SRC_SURFACE,
+ /** Texture sampler index */
+ TEX_LOGICAL_SRC_SAMPLER,
+ /** Texel offset for gathers */
+ TEX_LOGICAL_SRC_TG4_OFFSET,
+ /** REQUIRED: Number of coordinate components (as UD immediate) */
+ TEX_LOGICAL_SRC_COORD_COMPONENTS,
+ /** REQUIRED: Number of derivative components (as UD immediate) */
+ TEX_LOGICAL_SRC_GRAD_COMPONENTS,
+
+ TEX_LOGICAL_NUM_SRCS, /* number of sources listed above (11); keep last */
+};
+
+#ifdef __cplusplus
+/**
+ * C++ only: OR two brw_urb_write_flags values and return the result as a brw_urb_write_flags (not a plain int).
+ */
+inline brw_urb_write_flags
+operator|(brw_urb_write_flags x, brw_urb_write_flags y)
+{
+ return static_cast<brw_urb_write_flags>(static_cast<int>(x) | /* OR as ints, then cast back to the enum type */
+ static_cast<int>(y));
+}
+#endif /* __cplusplus */
+
+enum PACKED brw_predicate { /* ALIGN1_* and ALIGN16_* names below share the numeric range 2..7 */
+ BRW_PREDICATE_NONE = 0,
+ BRW_PREDICATE_NORMAL = 1,
+ BRW_PREDICATE_ALIGN1_ANYV = 2,
+ BRW_PREDICATE_ALIGN1_ALLV = 3,
+ BRW_PREDICATE_ALIGN1_ANY2H = 4,
+ BRW_PREDICATE_ALIGN1_ALL2H = 5,
+ BRW_PREDICATE_ALIGN1_ANY4H = 6,
+ BRW_PREDICATE_ALIGN1_ALL4H = 7,
+ BRW_PREDICATE_ALIGN1_ANY8H = 8,
+ BRW_PREDICATE_ALIGN1_ALL8H = 9,
+ BRW_PREDICATE_ALIGN1_ANY16H = 10,
+ BRW_PREDICATE_ALIGN1_ALL16H = 11,
+ BRW_PREDICATE_ALIGN1_ANY32H = 12,
+ BRW_PREDICATE_ALIGN1_ALL32H = 13,
+ BRW_PREDICATE_ALIGN16_REPLICATE_X = 2, /* from here on, values 2..7 are reused for the ALIGN16 names */
+ BRW_PREDICATE_ALIGN16_REPLICATE_Y = 3,
+ BRW_PREDICATE_ALIGN16_REPLICATE_Z = 4,
+ BRW_PREDICATE_ALIGN16_REPLICATE_W = 5,
+ BRW_PREDICATE_ALIGN16_ANY4H = 6, /* same value as BRW_PREDICATE_ALIGN1_ANY4H */
+ BRW_PREDICATE_ALIGN16_ALL4H = 7, /* same value as BRW_PREDICATE_ALIGN1_ALL4H */
+};
+
+enum PACKED brw_reg_file { /* register files: four explicit values, short aliases for them, then extra non-hardware files */
+ BRW_ARCHITECTURE_REGISTER_FILE = 0,
+ BRW_GENERAL_REGISTER_FILE = 1,
+ BRW_MESSAGE_REGISTER_FILE = 2,
+ BRW_IMMEDIATE_VALUE = 3,
+
+ ARF = BRW_ARCHITECTURE_REGISTER_FILE, /* alias, 0 */
+ FIXED_GRF = BRW_GENERAL_REGISTER_FILE, /* alias, 1 */
+ MRF = BRW_MESSAGE_REGISTER_FILE, /* alias, 2 */
+ IMM = BRW_IMMEDIATE_VALUE, /* alias, 3 */
+
+ /* These are not hardware values */
+ VGRF, /* implicitly 4: numbering continues from IMM */
+ ATTR, /* implicitly 5 */
+ UNIFORM, /* prog_data->params[reg] */
+ BAD_FILE, /* implicitly 7 */
+};
+
+#define BRW_HW_REG_TYPE_UD 0 /* NOTE(review): presumably these unqualified encodings apply to both immediate and non-immediate operands -- confirm */
+#define BRW_HW_REG_TYPE_D 1
+#define BRW_HW_REG_TYPE_UW 2
+#define BRW_HW_REG_TYPE_W 3
+#define BRW_HW_REG_TYPE_F 7
+#define GEN8_HW_REG_TYPE_UQ 8
+#define GEN8_HW_REG_TYPE_Q 9
+
+#define BRW_HW_REG_NON_IMM_TYPE_UB 4 /* values 4, 5, 6 and 10 are reused by the IMM_TYPE_* names below */
+#define BRW_HW_REG_NON_IMM_TYPE_B 5
+#define GEN7_HW_REG_NON_IMM_TYPE_DF 6
+#define GEN8_HW_REG_NON_IMM_TYPE_HF 10
+
+#define BRW_HW_REG_IMM_TYPE_UV 4 /* Gen6+ packed unsigned immediate vector */
+#define BRW_HW_REG_IMM_TYPE_VF 5 /* packed float immediate vector */
+#define BRW_HW_REG_IMM_TYPE_V 6 /* packed int imm. vector; uword dest only */
+#define GEN8_HW_REG_IMM_TYPE_DF 10 /* same value as GEN8_HW_REG_NON_IMM_TYPE_HF */
+#define GEN8_HW_REG_IMM_TYPE_HF 11
+
+/* SNB adds 3-src instructions (MAD and LRP) that only operate on floats, so
+ * the types were implied. IVB adds BFE and BFI2 that operate on doublewords
+ * and unsigned doublewords, so a new field is also available in the da3src
+ * struct (part of struct brw_instruction.bits1 in brw_structs.h) to select
+ * dst and shared-src types. The values are different from BRW_REGISTER_TYPE_*.
+ */
+#define BRW_3SRC_TYPE_F 0
+#define BRW_3SRC_TYPE_D 1
+#define BRW_3SRC_TYPE_UD 2
+#define BRW_3SRC_TYPE_DF 3
+
+#define BRW_ARF_NULL 0x00 /* every BRW_ARF_* value is a multiple of 0x10 (low nibble is zero) */
+#define BRW_ARF_ADDRESS 0x10
+#define BRW_ARF_ACCUMULATOR 0x20
+#define BRW_ARF_FLAG 0x30
+#define BRW_ARF_MASK 0x40
+#define BRW_ARF_MASK_STACK 0x50
+#define BRW_ARF_MASK_STACK_DEPTH 0x60
+#define BRW_ARF_STATE 0x70
+#define BRW_ARF_CONTROL 0x80
+#define BRW_ARF_NOTIFICATION_COUNT 0x90
+#define BRW_ARF_IP 0xA0
+#define BRW_ARF_TDR 0xB0
+#define BRW_ARF_TIMESTAMP 0xC0
+
+#define BRW_MRF_COMPR4 (1 << 7) /* bit 7 (0x80) */
+
+#define BRW_AMASK 0
+#define BRW_IMASK 1
+#define BRW_LMASK 2
+#define BRW_CMASK 3
+
+
+
+#define BRW_THREAD_NORMAL 0
+#define BRW_THREAD_ATOMIC 1
+#define BRW_THREAD_SWITCH 2
+
+enum PACKED brw_vertical_stride { /* for strides 1..32 the encoding is log2(stride) + 1; stride 0 encodes as 0 */
+ BRW_VERTICAL_STRIDE_0 = 0,
+ BRW_VERTICAL_STRIDE_1 = 1,
+ BRW_VERTICAL_STRIDE_2 = 2,
+ BRW_VERTICAL_STRIDE_4 = 3,
+ BRW_VERTICAL_STRIDE_8 = 4,
+ BRW_VERTICAL_STRIDE_16 = 5,
+ BRW_VERTICAL_STRIDE_32 = 6,
+ BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL = 0xF, /* special encoding outside the log2 sequence above */
+};
+
+enum PACKED brw_width { /* the encoding is log2 of the width named by each enumerator */
+ BRW_WIDTH_1 = 0,
+ BRW_WIDTH_2 = 1,
+ BRW_WIDTH_4 = 2,
+ BRW_WIDTH_8 = 3,
+ BRW_WIDTH_16 = 4,
+};
+
+/**
+ * Message target: Shared Function ID for where to SEND a message.
+ *
+ * These are enumerated in the ISA reference under "send - Send Message".
+ * In particular, see the following tables:
+ * - G45 PRM, Volume 4, Table 14-15 "Message Descriptor Definition"
+ * - Sandybridge PRM, Volume 4 Part 2, Table 8-16 "Extended Message Descriptor"
+ * - Ivybridge PRM, Volume 1 Part 1, section 3.2.7 "GPE Function IDs"
+ */
+enum brw_message_target { /* note: values 4 and 5 each have two names (BRW_* and GEN6_*) */
+ BRW_SFID_NULL = 0,
+ BRW_SFID_MATH = 1, /* Only valid on Gen4-5 */
+ BRW_SFID_SAMPLER = 2,
+ BRW_SFID_MESSAGE_GATEWAY = 3,
+ BRW_SFID_DATAPORT_READ = 4,
+ BRW_SFID_DATAPORT_WRITE = 5,
+ BRW_SFID_URB = 6,
+ BRW_SFID_THREAD_SPAWNER = 7,
+ BRW_SFID_VME = 8,
+
+ GEN6_SFID_DATAPORT_SAMPLER_CACHE = 4, /* same value as BRW_SFID_DATAPORT_READ */
+ GEN6_SFID_DATAPORT_RENDER_CACHE = 5, /* same value as BRW_SFID_DATAPORT_WRITE */
+ GEN6_SFID_DATAPORT_CONSTANT_CACHE = 9,
+
+ GEN7_SFID_DATAPORT_DATA_CACHE = 10,
+ GEN7_SFID_PIXEL_INTERPOLATOR = 11,
+ HSW_SFID_DATAPORT_DATA_CACHE_1 = 12,
+ HSW_SFID_CRE = 13,
+};
+
+#define GEN7_MESSAGE_TARGET_DP_DATA_CACHE 10 /* same value as GEN7_SFID_DATAPORT_DATA_CACHE */
+
+#define BRW_SAMPLER_RETURN_FORMAT_FLOAT32 0
+#define BRW_SAMPLER_RETURN_FORMAT_UINT32 2
+#define BRW_SAMPLER_RETURN_FORMAT_SINT32 3
+
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE 0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE 0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS 0
+#define BRW_SAMPLER_MESSAGE_SIMD8_KILLPIX 1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD 1
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD 1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS 2
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS 2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE 0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE 2
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE 0
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE 1
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE 1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO 2
+#define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO 2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_LD 3
+#define BRW_SAMPLER_MESSAGE_SIMD8_LD 3
+#define BRW_SAMPLER_MESSAGE_SIMD16_LD 3
+
+#define GEN5_SAMPLER_MESSAGE_SAMPLE 0
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS 1
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD 2
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE 3
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS 4
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE 5
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE 6
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_LD 7
+#define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4 8
+#define GEN5_SAMPLER_MESSAGE_LOD 9
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO 10
+#define GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO 11
+#define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C 16
+#define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO 17
+#define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C 18
+#define HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE 20
+#define GEN9_SAMPLER_MESSAGE_SAMPLE_LZ 24
+#define GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ 25
+#define GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ 26
+#define GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W 28
+#define GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS 29
+#define GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS 30
+#define GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS 31
+
+/* for GEN5 only */
+#define BRW_SAMPLER_SIMD_MODE_SIMD4X2 0
+#define BRW_SAMPLER_SIMD_MODE_SIMD8 1
+#define BRW_SAMPLER_SIMD_MODE_SIMD16 2
+#define BRW_SAMPLER_SIMD_MODE_SIMD32_64 3
+
+/* GEN9 changes SIMD mode 0 to mean SIMD8D, but lets us get the SIMD4x2
+ * behavior by setting bit 22 of dword 2 in the message header. */
+#define GEN9_SAMPLER_SIMD_MODE_SIMD8D 0
+#define GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2 (1 << 22)
+
+#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW 0
+#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH 1 /* never produced by BRW_DATAPORT_OWORD_BLOCK_DWORDS() below */
+#define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS 2
+#define BRW_DATAPORT_OWORD_BLOCK_4_OWORDS 3
+#define BRW_DATAPORT_OWORD_BLOCK_8_OWORDS 4 /* the macro below maps a dword count of 4, 8, 16 or 32 to one of these encodings */
+#define BRW_DATAPORT_OWORD_BLOCK_DWORDS(n) \
+ ((n) == 4 ? BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW : \
+ (n) == 8 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS : \
+ (n) == 16 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS : \
+ (n) == 32 ? BRW_DATAPORT_OWORD_BLOCK_8_OWORDS : \
+ (abort(), ~0)) /* any other count calls abort(); ~0 only gives the expression a value; abort() must be declared at the use site */
+
+#define BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD 0
+#define BRW_DATAPORT_OWORD_DUAL_BLOCK_4OWORDS 2
+
+#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS 2
+#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS 3
+
+/* This one stays the same across generations. */
+#define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ 0
+/* GEN4 */
+#define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 1
+#define BRW_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 2
+#define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 3
+/* G45, GEN5 */
+#define G45_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ 1
+#define G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 2
+#define G45_DATAPORT_READ_MESSAGE_AVC_LOOP_FILTER_READ 3
+#define G45_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 4
+#define G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 6
+/* GEN6 */
+#define GEN6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ 1
+#define GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 2
+#define GEN6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 4
+#define GEN6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ 5
+#define GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 6
+
+#define BRW_DATAPORT_READ_TARGET_DATA_CACHE 0
+#define BRW_DATAPORT_READ_TARGET_RENDER_CACHE 1
+#define BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE 2
+
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE 0
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED 1
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01 2
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23 3
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01 4
+
+#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE 0
+#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 1
+#define BRW_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE 2
+#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE 3
+#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE 4
+#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE 5
+#define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE 7
+
+/* GEN6 */
+#define GEN6_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE 7
+#define GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE 8
+#define GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 9
+#define GEN6_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE 10
+#define GEN6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE 11
+#define GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE 12
+#define GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE 13
+#define GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE 14
+
+/* GEN7 */
+#define GEN7_DATAPORT_RC_MEDIA_BLOCK_READ 4
+#define GEN7_DATAPORT_RC_TYPED_SURFACE_READ 5
+#define GEN7_DATAPORT_RC_TYPED_ATOMIC_OP 6
+#define GEN7_DATAPORT_RC_MEMORY_FENCE 7
+#define GEN7_DATAPORT_RC_MEDIA_BLOCK_WRITE 10
+#define GEN7_DATAPORT_RC_RENDER_TARGET_WRITE 12
+#define GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE 13
+#define GEN7_DATAPORT_DC_OWORD_BLOCK_READ 0
+#define GEN7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ 1
+#define GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_READ 2
+#define GEN7_DATAPORT_DC_DWORD_SCATTERED_READ 3
+#define GEN7_DATAPORT_DC_BYTE_SCATTERED_READ 4
+#define GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ 5
+#define GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP 6
+#define GEN7_DATAPORT_DC_MEMORY_FENCE 7
+#define GEN7_DATAPORT_DC_OWORD_BLOCK_WRITE 8
+#define GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE 10
+#define GEN7_DATAPORT_DC_DWORD_SCATTERED_WRITE 11
+#define GEN7_DATAPORT_DC_BYTE_SCATTERED_WRITE 12
+#define GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE 13
+
+#define GEN7_DATAPORT_SCRATCH_READ ((1 << 18) | \
+ (0 << 17)) /* bit 18 set, bit 17 clear: 0x40000 */
+#define GEN7_DATAPORT_SCRATCH_WRITE ((1 << 18) | \
+ (1 << 17)) /* bits 18 and 17 set: 0x60000; bit 17 is what differs from the READ value */
+#define GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT 12
+
+#define GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET 0
+#define GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE 1
+#define GEN7_PIXEL_INTERPOLATOR_LOC_CENTROID 2
+#define GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET 3
+
+/* HSW */
+#define HSW_DATAPORT_DC_PORT0_OWORD_BLOCK_READ 0
+#define HSW_DATAPORT_DC_PORT0_UNALIGNED_OWORD_BLOCK_READ 1
+#define HSW_DATAPORT_DC_PORT0_OWORD_DUAL_BLOCK_READ 2
+#define HSW_DATAPORT_DC_PORT0_DWORD_SCATTERED_READ 3
+#define HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ 4
+#define HSW_DATAPORT_DC_PORT0_MEMORY_FENCE 7
+#define HSW_DATAPORT_DC_PORT0_OWORD_BLOCK_WRITE 8
+#define HSW_DATAPORT_DC_PORT0_OWORD_DUAL_BLOCK_WRITE 10
+#define HSW_DATAPORT_DC_PORT0_DWORD_SCATTERED_WRITE 11
+#define HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE 12
+
+#define HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ 1
+#define HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP 2
+#define HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2 3
+#define HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_READ 4
+#define HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ 5
+#define HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP 6
+#define HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2 7
+#define HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE 9
+#define HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_WRITE 10
+#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP 11
+#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2 12
+#define HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE 13
+
+/* GEN9 */
+#define GEN9_DATAPORT_RC_RENDER_TARGET_WRITE 12
+#define GEN9_DATAPORT_RC_RENDER_TARGET_READ 13
+
+/* Dataport special binding table indices: */
+#define BRW_BTI_STATELESS 255
+#define GEN7_BTI_SLM 254
+/* Note that on Gen8+ BTI 255 was redefined to be IA-coherent according to the
+ * hardware spec, however because the DRM sets bit 4 of HDC_CHICKEN0 on BDW,
+ * CHV and at least some pre-production steppings of SKL due to
+ * WaForceEnableNonCoherent, HDC memory access may have been overridden by the
+ * kernel to be non-coherent (matching the behavior of the same BTI on
+ * pre-Gen8 hardware) and BTI 255 may actually be an alias for BTI 253.
+ */
+#define GEN8_BTI_STATELESS_IA_COHERENT 255 /* same index as BRW_BTI_STATELESS */
+#define GEN8_BTI_STATELESS_NON_COHERENT 253
+
+/* dataport atomic operations. */
+#define BRW_AOP_AND 1
+#define BRW_AOP_OR 2
+#define BRW_AOP_XOR 3
+#define BRW_AOP_MOV 4
+#define BRW_AOP_INC 5
+#define BRW_AOP_DEC 6
+#define BRW_AOP_ADD 7
+#define BRW_AOP_SUB 8
+#define BRW_AOP_REVSUB 9
+#define BRW_AOP_IMAX 10
+#define BRW_AOP_IMIN 11
+#define BRW_AOP_UMAX 12
+#define BRW_AOP_UMIN 13
+#define BRW_AOP_CMPWR 14
+#define BRW_AOP_PREDEC 15
+
+#define BRW_MATH_FUNCTION_INV 1
+#define BRW_MATH_FUNCTION_LOG 2
+#define BRW_MATH_FUNCTION_EXP 3
+#define BRW_MATH_FUNCTION_SQRT 4
+#define BRW_MATH_FUNCTION_RSQ 5
+#define BRW_MATH_FUNCTION_SIN 6
+#define BRW_MATH_FUNCTION_COS 7
+#define BRW_MATH_FUNCTION_SINCOS 8 /* gen4, gen5 */
+#define BRW_MATH_FUNCTION_FDIV 9 /* gen6+ */
+#define BRW_MATH_FUNCTION_POW 10
+#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER 11
+#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT 12
+#define BRW_MATH_FUNCTION_INT_DIV_REMAINDER 13
+#define GEN8_MATH_FUNCTION_INVM 14
+#define GEN8_MATH_FUNCTION_RSQRTM 15
+
+#define BRW_MATH_INTEGER_UNSIGNED 0
+#define BRW_MATH_INTEGER_SIGNED 1
+
+#define BRW_MATH_PRECISION_FULL 0
+#define BRW_MATH_PRECISION_PARTIAL 1
+
+#define BRW_MATH_SATURATE_NONE 0
+#define BRW_MATH_SATURATE_SATURATE 1
+
+#define BRW_MATH_DATA_VECTOR 0
+#define BRW_MATH_DATA_SCALAR 1
+
+#define BRW_URB_OPCODE_WRITE_HWORD 0
+#define BRW_URB_OPCODE_WRITE_OWORD 1
+#define BRW_URB_OPCODE_READ_HWORD 2
+#define BRW_URB_OPCODE_READ_OWORD 3
+#define GEN7_URB_OPCODE_ATOMIC_MOV 4
+#define GEN7_URB_OPCODE_ATOMIC_INC 5
+#define GEN8_URB_OPCODE_ATOMIC_ADD 6
+#define GEN8_URB_OPCODE_SIMD8_WRITE 7
+#define GEN8_URB_OPCODE_SIMD8_READ 8
+
+#define BRW_URB_SWIZZLE_NONE 0
+#define BRW_URB_SWIZZLE_INTERLEAVE 1
+#define BRW_URB_SWIZZLE_TRANSPOSE 2
+
+#define BRW_SCRATCH_SPACE_SIZE_1K 0 /* the encoding is log2 of the size in KiB: 1K -> 0 ... 2M -> 11 */
+#define BRW_SCRATCH_SPACE_SIZE_2K 1
+#define BRW_SCRATCH_SPACE_SIZE_4K 2
+#define BRW_SCRATCH_SPACE_SIZE_8K 3
+#define BRW_SCRATCH_SPACE_SIZE_16K 4
+#define BRW_SCRATCH_SPACE_SIZE_32K 5
+#define BRW_SCRATCH_SPACE_SIZE_64K 6
+#define BRW_SCRATCH_SPACE_SIZE_128K 7
+#define BRW_SCRATCH_SPACE_SIZE_256K 8
+#define BRW_SCRATCH_SPACE_SIZE_512K 9
+#define BRW_SCRATCH_SPACE_SIZE_1M 10
+#define BRW_SCRATCH_SPACE_SIZE_2M 11
+
+#define BRW_MESSAGE_GATEWAY_SFID_OPEN_GATEWAY 0
+#define BRW_MESSAGE_GATEWAY_SFID_CLOSE_GATEWAY 1
+#define BRW_MESSAGE_GATEWAY_SFID_FORWARD_MSG 2
+#define BRW_MESSAGE_GATEWAY_SFID_GET_TIMESTAMP 3
+#define BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG 4
+#define BRW_MESSAGE_GATEWAY_SFID_UPDATE_GATEWAY_STATE 5
+#define BRW_MESSAGE_GATEWAY_SFID_MMIO_READ_WRITE 6
+
+
+/* Gen7 "GS URB Entry Allocation Size" is a U9-1 field, so the maximum gs_size
+ * is 2^9, or 512. It's counted in multiples of 64 bytes.
+ *
+ * Identical for VS, DS, and HS.
+ */
+#define GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES (512*64) /* 32768 bytes */
+#define GEN7_MAX_DS_URB_ENTRY_SIZE_BYTES (512*64) /* 32768 bytes */
+#define GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES (512*64) /* 32768 bytes */
+#define GEN7_MAX_VS_URB_ENTRY_SIZE_BYTES (512*64) /* 32768 bytes */
+
+/* Gen6 "GS URB Entry Allocation Size" is defined as a number of 1024-bit
+ * (128 bytes) URB rows and the maximum allowed value is 5 rows.
+ */
+#define GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES (5*128) /* 640 bytes */
+
+/* GS Thread Payload
+ */
+/* R0 */
+# define GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT 27
+
+#endif /* BRW_EU_DEFINES_H */
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
new file mode 100644
index 00000000000..058742d4f6e
--- /dev/null
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -0,0 +1,3675 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <[email protected]>
+ */
+
+
+#include "brw_eu_defines.h"
+#include "brw_eu.h"
+
+#include "util/ralloc.h"
+
+/**
+ * Materialise the "operand -> message register" copy that SEND performed
+ * implicitly before Sandybridge.
+ *
+ * On gen6+ a source that is not already in the message register file is
+ * copied into message register \p msg_reg_nr with an explicit MOV (the MOV is
+ * skipped when the source is the null register), and \p src is then
+ * redirected to that message register.  On earlier generations, or when
+ * \p src is already a message register, the function does nothing.
+ *
+ * Call this before emitting the SEND instruction.
+ */
+void
+gen6_resolve_implied_move(struct brw_codegen *p,
+                          struct brw_reg *src,
+                          unsigned msg_reg_nr)
+{
+   /* Nothing to resolve before gen6 or for operands already in an MRF. */
+   if (p->devinfo->gen < 6 || src->file == BRW_MESSAGE_REGISTER_FILE)
+      return;
+
+   const bool src_is_null = src->file == BRW_ARCHITECTURE_REGISTER_FILE &&
+                            src->nr == BRW_ARF_NULL;
+
+   if (!src_is_null) {
+      /* Emit the copy with a fixed state (8-wide, mask disabled,
+       * uncompressed) and restore the caller's defaults afterwards.
+       */
+      brw_push_insn_state(p);
+      brw_set_default_exec_size(p, BRW_EXECUTE_8);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_MOV(p,
+              retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
+              retype(*src, BRW_REGISTER_TYPE_UD));
+      brw_pop_insn_state(p);
+   }
+
+   /* From here on the caller reads the payload from the message register. */
+   *src = brw_message_reg(msg_reg_nr);
+}
+
+/**
+ * On gen7+ rewrite a message-register operand in place so that it refers to
+ * the general register file, offset by GEN7_MRF_HACK_START.  Operands in any
+ * other file, and all operands on earlier generations, are left untouched.
+ */
+static void
+gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
+{
+   if (p->devinfo->gen < 7)
+      return;
+
+   if (reg->file != BRW_MESSAGE_REGISTER_FILE)
+      return;
+
+   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
+    * "The send with EOT should use register space R112-R127 for <src>. This is
+    *  to enable loading of a new thread into the same slot while the message
+    *  with EOT for current thread is pending dispatch."
+    *
+    * Since we're pretending to have 16 MRFs anyway, we may as well use the
+    * registers required for messages with EOT.
+    */
+   reg->nr += GEN7_MRF_HACK_START;
+   reg->file = BRW_GENERAL_REGISTER_FILE;
+}
+
+/**
+ * Translate a brw_reg_type enumerant into the value stored in an
+ * instruction's register-type field.
+ *
+ * Immediates and register operands use separate tables, so the result
+ * depends on \p file as well as \p type.  Types that have no encoding for the
+ * requested kind of operand, and types not available on the generation
+ * described by \p devinfo, are rejected with assertions.
+ */
+unsigned
+brw_reg_type_to_hw_type(const struct gen_device_info *devinfo,
+                        enum brw_reg_type type, enum brw_reg_file file)
+{
+   /* Encodings for immediate operands; -1 means "no immediate form". */
+   static const int imm_encoding[] = {
+      [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
+      [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
+      [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
+      [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
+      [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
+      [BRW_REGISTER_TYPE_UB] = -1,
+      [BRW_REGISTER_TYPE_B]  = -1,
+      [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
+      [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
+      [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
+      [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
+      [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
+      [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
+      [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
+   };
+
+   /* Encodings for register operands; -1 means "immediate-only type". */
+   static const int reg_encoding[] = {
+      [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
+      [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
+      [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
+      [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
+      [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
+      [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
+      [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
+      [BRW_REGISTER_TYPE_UV] = -1,
+      [BRW_REGISTER_TYPE_VF] = -1,
+      [BRW_REGISTER_TYPE_V]  = -1,
+      [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
+      [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
+      [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
+      [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
+   };
+
+   if (file != BRW_IMMEDIATE_VALUE) {
+      assert(type < ARRAY_SIZE(reg_encoding));
+      assert(reg_encoding[type] != -1);
+      /* DF and the types after it need gen7+; Q and later need gen8+. */
+      assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
+      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_Q);
+      return reg_encoding[type];
+   }
+
+   assert(type < ARRAY_SIZE(imm_encoding));
+   assert(imm_encoding[type] != -1);
+   /* Immediates of type DF and later are only accepted on gen8+. */
+   assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
+   return imm_encoding[type];
+}
+
+/**
+ * Look up the element size for a hardware register-type encoding.
+ *
+ * \p type is the value found in an instruction's type field, not a
+ * brw_reg_type.  The same encoding can denote different types for immediates
+ * and for registers, so \p file selects which table is consulted.  The
+ * returned sizes are 1, 2, 4 or 8 (presumably bytes).
+ */
+unsigned
+brw_hw_reg_type_to_size(const struct gen_device_info *devinfo,
+                        unsigned type, enum brw_reg_file file)
+{
+   /* Element sizes indexed by immediate type encoding. */
+   static const unsigned imm_size[] = {
+      [BRW_HW_REG_TYPE_UD]      = 4,
+      [BRW_HW_REG_TYPE_D]       = 4,
+      [BRW_HW_REG_TYPE_UW]      = 2,
+      [BRW_HW_REG_TYPE_W]       = 2,
+      [BRW_HW_REG_IMM_TYPE_UV]  = 2,
+      [BRW_HW_REG_IMM_TYPE_VF]  = 4,
+      [BRW_HW_REG_IMM_TYPE_V]   = 2,
+      [BRW_HW_REG_TYPE_F]       = 4,
+      [GEN8_HW_REG_TYPE_UQ]     = 8,
+      [GEN8_HW_REG_TYPE_Q]      = 8,
+      [GEN8_HW_REG_IMM_TYPE_DF] = 8,
+      [GEN8_HW_REG_IMM_TYPE_HF] = 2,
+   };
+
+   /* Element sizes indexed by register (non-immediate) type encoding. */
+   static const unsigned reg_size[] = {
+      [BRW_HW_REG_TYPE_UD]          = 4,
+      [BRW_HW_REG_TYPE_D]           = 4,
+      [BRW_HW_REG_TYPE_UW]          = 2,
+      [BRW_HW_REG_TYPE_W]           = 2,
+      [BRW_HW_REG_NON_IMM_TYPE_UB]  = 1,
+      [BRW_HW_REG_NON_IMM_TYPE_B]   = 1,
+      [GEN7_HW_REG_NON_IMM_TYPE_DF] = 8,
+      [BRW_HW_REG_TYPE_F]           = 4,
+      [GEN8_HW_REG_TYPE_UQ]         = 8,
+      [GEN8_HW_REG_TYPE_Q]          = 8,
+      [GEN8_HW_REG_NON_IMM_TYPE_HF] = 2,
+   };
+
+   if (file != BRW_IMMEDIATE_VALUE) {
+      assert(type < ARRAY_SIZE(reg_size));
+      assert(devinfo->gen >= 7 ||
+             (type < GEN7_HW_REG_NON_IMM_TYPE_DF || type == BRW_HW_REG_TYPE_F));
+      assert(devinfo->gen >= 8 || type <= BRW_HW_REG_TYPE_F);
+      return reg_size[type];
+   }
+
+   assert(type < ARRAY_SIZE(imm_size));
+   /* The UV immediate encoding is only accepted on gen6+. */
+   assert(devinfo->gen >= 6 || type != BRW_HW_REG_IMM_TYPE_UV);
+   assert(devinfo->gen >= 8 || type <= BRW_HW_REG_TYPE_F);
+   return imm_size[type];
+}
+
+/**
+ * Encode \p dest as the destination operand of \p inst.
+ *
+ * The register number is sanity-checked, gen7_convert_mrf_to_grf() is applied
+ * to the local copy of \p dest, and then the register file, type, address
+ * mode and the fields specific to the addressing mode (direct/indirect) and
+ * access mode (Align1/Align16) are written. Finally the instruction's exec
+ * size is lowered to dest.width when the destination is narrow.
+ */
+void
+brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+
+ /* MRF numbers are checked with the COMPR4 bit masked off; every other
+ * non-ARF register number must be below 128.
+ */
+ if (dest.file == BRW_MESSAGE_REGISTER_FILE)
+ assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
+ else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
+ assert(dest.nr < 128);
+
+ gen7_convert_mrf_to_grf(p, &dest);
+
+ brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
+ brw_inst_set_dst_reg_type(devinfo, inst,
+ brw_reg_type_to_hw_type(devinfo, dest.type,
+ dest.file));
+ brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);
+
+ if (dest.address_mode == BRW_ADDRESS_DIRECT) {
+ brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
+
+ if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+ brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
+ /* A destination stride of 0 is replaced by a stride of 1. */
+ if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
+ dest.hstride = BRW_HORIZONTAL_STRIDE_1;
+ brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
+ } else {
+ /* Align16 stores the subregister in units of 16. */
+ brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
+ brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
+ if (dest.file == BRW_GENERAL_REGISTER_FILE ||
+ dest.file == BRW_MESSAGE_REGISTER_FILE) {
+ assert(dest.writemask != 0);
+ }
+ /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
+ * Although Dst.HorzStride is a don't care for Align16, HW needs
+ * this to be programmed as "01".
+ */
+ brw_inst_set_dst_hstride(devinfo, inst, 1);
+ }
+ } else {
+ brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);
+
+ /* These are different sizes in align1 vs align16:
+ */
+ if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+ brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
+ dest.indirect_offset);
+ if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
+ dest.hstride = BRW_HORIZONTAL_STRIDE_1;
+ brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
+ } else {
+ brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
+ dest.indirect_offset);
+ /* even ignored in da16, still need to set as '01' */
+ brw_inst_set_dst_hstride(devinfo, inst, 1);
+ }
+ }
+
+ /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
+ * or 16 (SIMD16), as that's normally correct. However, when dealing with
+ * small registers, we automatically reduce it to match the register size.
+ *
+ * In platforms that support fp64 we can emit instructions with a width of
+ * 4 that need two SIMD8 registers and an exec_size of 8 or 16. In these
+ * cases we need to make sure that these instructions have their exec sizes
+ * set properly when they are emitted and we can't rely on this code to fix
+ * it.
+ */
+ bool fix_exec_size;
+ /* gen6+ only shrinks below width 4; older generations shrink below 8. */
+ if (devinfo->gen >= 6)
+ fix_exec_size = dest.width < BRW_EXECUTE_4;
+ else
+ fix_exec_size = dest.width < BRW_EXECUTE_8;
+
+ if (fix_exec_size)
+ brw_inst_set_exec_size(devinfo, inst, dest.width);
+}
+
+/**
+ * Debug-only sanity check of a source operand's region against the
+ * instruction it is about to be encoded into.
+ *
+ * All checks are assert()s; the function has no other effect. Immediates
+ * only get the vector-immediate destination check and then return. Other
+ * operands have their encoded hstride/vstride/width and the instruction's
+ * exec size decoded through the lookup tables below and are checked against
+ * the numbered "Register Region Restrictions" rules.
+ */
+static void
+validate_reg(const struct gen_device_info *devinfo,
+ brw_inst *inst, struct brw_reg reg)
+{
+ /* Lookup tables from encoded field value to the count it represents. */
+ const int hstride_for_reg[] = {0, 1, 2, 4};
+ const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
+ const int width_for_reg[] = {1, 2, 4, 8, 16};
+ const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
+ int width, hstride, vstride, execsize;
+
+ if (reg.file == BRW_IMMEDIATE_VALUE) {
+ /* 3.3.6: Region Parameters. Restriction: Immediate vectors
+ * mean the destination has to be 128-bit aligned and the
+ * destination horiz stride has to be a word.
+ */
+ if (reg.type == BRW_REGISTER_TYPE_V) {
+ unsigned UNUSED elem_size = brw_element_size(devinfo, inst, dst);
+ assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] *
+ elem_size == 2);
+ }
+
+ return;
+ }
+
+ /* NOTE(review): the second comparison tests reg.file against
+ * BRW_ARF_NULL, a register-number constant that is compared against .nr
+ * elsewhere in this file (see gen6_resolve_implied_move). Depending on
+ * the numeric values of the two constants this condition is either never
+ * true or true for every ARF operand; in the latter case the accumulator
+ * swizzle check below is unreachable. `reg.nr == BRW_ARF_NULL` was
+ * probably intended, but switching would start applying the region
+ * asserts to ARF operands that currently bypass them -- audit the callers
+ * before changing it.
+ */
+ if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+ reg.file == BRW_ARF_NULL)
+ return;
+
+ /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
+ *
+ * "Swizzling is not allowed when an accumulator is used as an implicit
+ * source or an explicit source in an instruction."
+ */
+ if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+ reg.nr == BRW_ARF_ACCUMULATOR)
+ assert(reg.swizzle == BRW_SWIZZLE_XYZW);
+
+ assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
+ hstride = hstride_for_reg[reg.hstride];
+
+ /* An encoded vstride of 0xf is carried as -1 and exempted from rule 4. */
+ if (reg.vstride == 0xf) {
+ vstride = -1;
+ } else {
+ assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
+ vstride = vstride_for_reg[reg.vstride];
+ }
+
+ assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
+ width = width_for_reg[reg.width];
+
+ assert(brw_inst_exec_size(devinfo, inst) >= 0 &&
+ brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg));
+ execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];
+
+ /* Restrictions from 3.3.10: Register Region Restrictions. */
+ /* 3. */
+ assert(execsize >= width);
+
+ /* 4. */
+ if (execsize == width && hstride != 0) {
+ assert(vstride == -1 || vstride == width * hstride);
+ }
+
+ /* 5. */
+ if (execsize == width && hstride == 0) {
+ /* no restriction on vstride. */
+ }
+
+ /* 6. */
+ if (width == 1) {
+ assert(hstride == 0);
+ }
+
+ /* 7. */
+ if (execsize == 1 && width == 1) {
+ assert(hstride == 0);
+ assert(vstride == 0);
+ }
+
+ /* 8. */
+ if (vstride == 0 && hstride == 0) {
+ assert(width == 1);
+ }
+
+ /* 10. Check destination issues. */
+}
+
+/**
+ * Report whether a 32-bit immediate can be held by a compacted instruction:
+ * the low 12 bits are kept as-is and the upper 20 bits must all be copies of
+ * one bit, i.e. all zeros or all ones.
+ */
+static bool
+is_compactable_immediate(unsigned imm)
+{
+   /* Everything above the 12 bits that are stored verbatim. */
+   const unsigned upper = imm & ~0xfff;
+
+   if (upper == 0)
+      return true;
+
+   return upper == 0xfffff000;
+}
+
+/**
+ * Encode \p reg as source operand 0 of \p inst.
+ *
+ * After range-checking the register number, applying
+ * gen7_convert_mrf_to_grf() to the local copy and running validate_reg(),
+ * the register file, type, abs/negate modifiers and address mode are
+ * written. Immediates are then stored through the 64-bit or 32-bit immediate
+ * setter and may have their type fields rewritten so the instruction can be
+ * compacted; register operands get their register number, subregister and
+ * either region (Align1) or swizzle (Align16) fields.
+ */
+void
+brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+
+ if (reg.file == BRW_MESSAGE_REGISTER_FILE)
+ assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
+ else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
+ assert(reg.nr < 128);
+
+ gen7_convert_mrf_to_grf(p, &reg);
+
+ if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
+ brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
+ /* Any source modifiers or regions will be ignored, since this just
+ * identifies the MRF/GRF to start reading the message contents from.
+ * Check for some likely failures.
+ */
+ assert(!reg.negate);
+ assert(!reg.abs);
+ assert(reg.address_mode == BRW_ADDRESS_DIRECT);
+ }
+
+ validate_reg(devinfo, inst, reg);
+
+ brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
+ brw_inst_set_src0_reg_type(devinfo, inst,
+ brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
+ brw_inst_set_src0_abs(devinfo, inst, reg.abs);
+ brw_inst_set_src0_negate(devinfo, inst, reg.negate);
+ brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);
+
+ if (reg.file == BRW_IMMEDIATE_VALUE) {
+ /* DF immediates (and any immediate of a DIM instruction) and UQ/Q
+ * immediates go through the 64-bit setters; everything else is stored
+ * as a 32-bit value.
+ */
+ if (reg.type == BRW_REGISTER_TYPE_DF ||
+ brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
+ brw_inst_set_imm_df(devinfo, inst, reg.df);
+ else if (reg.type == BRW_REGISTER_TYPE_UQ ||
+ reg.type == BRW_REGISTER_TYPE_Q)
+ brw_inst_set_imm_uq(devinfo, inst, reg.u64);
+ else
+ brw_inst_set_imm_ud(devinfo, inst, reg.ud);
+
+ /* The Bspec's section titled "Non-present Operands" claims that if src0
+ * is an immediate that src1's type must be the same as that of src0.
+ *
+ * The SNB+ DataTypeIndex instruction compaction tables contain mappings
+ * that do not follow this rule. E.g., from the IVB/HSW table:
+ *
+ * DataTypeIndex 18-Bit Mapping Mapped Meaning
+ * 3 001000001011111101 r:f | i:vf | a:ud | <1> | dir |
+ *
+ * And from the SNB table:
+ *
+ * DataTypeIndex 18-Bit Mapping Mapped Meaning
+ * 8 001000000111101100 a:w | i:w | a:ud | <1> | dir |
+ *
+ * Neither of these cause warnings from the simulator when used,
+ * compacted or otherwise. In fact, all compaction mappings that have an
+ * immediate in src0 use a:ud for src1.
+ *
+ * The GM45 instruction compaction tables do not contain mapped meanings
+ * so it's not clear whether it has the restriction. We'll assume it was
+ * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
+ *
+ * Don't do any of this for 64-bit immediates, since the src1 fields
+ * overlap with the immediate and setting them would overwrite the
+ * immediate we set.
+ */
+ if (type_sz(reg.type) < 8) {
+ brw_inst_set_src1_reg_file(devinfo, inst,
+ BRW_ARCHITECTURE_REGISTER_FILE);
+ if (devinfo->gen < 6) {
+ brw_inst_set_src1_reg_type(devinfo, inst,
+ brw_inst_src0_reg_type(devinfo, inst));
+ } else {
+ brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
+ }
+ }
+
+ /* Compacted instructions only have 12-bits (plus 1 for the other 20)
+ * for immediate values. Presumably the hardware engineers realized
+ * that the only useful floating-point value that could be represented
+ * in this format is 0.0, which can also be represented as a VF-typed
+ * immediate, so they gave us the previously mentioned mapping on IVB+.
+ *
+ * Strangely, we do have a mapping for imm:f in src1, so we don't need
+ * to do this there.
+ *
+ * If we see a 0.0:F, change the type to VF so that it can be compacted.
+ */
+ if (brw_inst_imm_ud(devinfo, inst) == 0x0 &&
+ brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F &&
+ brw_inst_dst_reg_type(devinfo, inst) != GEN7_HW_REG_NON_IMM_TYPE_DF) {
+ brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF);
+ }
+
+ /* There are no mappings for dst:d | i:d, so if the immediate is suitable
+ * set the types to :UD so the instruction can be compacted.
+ */
+ if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) &&
+ brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE &&
+ brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D &&
+ brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) {
+ brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
+ brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
+ }
+ } else {
+ if (reg.address_mode == BRW_ADDRESS_DIRECT) {
+ brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
+ if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+ brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
+ } else {
+ brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
+ }
+ } else {
+ brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);
+
+ if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+ brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
+ } else {
+ brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
+ }
+ }
+
+ if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+ /* A width-1 operand of an exec-size-1 instruction is always encoded
+ * with all-zero strides and width 1, whatever the region says.
+ */
+ if (reg.width == BRW_WIDTH_1 &&
+ brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
+ brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
+ brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
+ brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
+ } else {
+ brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
+ brw_inst_set_src0_width(devinfo, inst, reg.width);
+ brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
+ }
+ } else {
+ brw_inst_set_src0_da16_swiz_x(devinfo, inst,
+ BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
+ brw_inst_set_src0_da16_swiz_y(devinfo, inst,
+ BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
+ brw_inst_set_src0_da16_swiz_z(devinfo, inst,
+ BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
+ brw_inst_set_src0_da16_swiz_w(devinfo, inst,
+ BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
+
+ /* This is an oddity of the fact we're using the same
+ * descriptions for registers in align_16 as align_1:
+ */
+ if (reg.vstride == BRW_VERTICAL_STRIDE_8)
+ brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
+ else
+ brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
+ }
+ }
+}
+
+
+/**
+ * Encode \p reg as source operand 1 of \p inst.
+ *
+ * Asserts that the operand is neither an accumulator nor (after
+ * gen7_convert_mrf_to_grf()) a message register, and that src0 of \p inst is
+ * not an immediate. Immediates must be narrower than 8 bytes and are stored
+ * as a 32-bit value; register operands must use direct addressing and get
+ * their register number, subregister and region (Align1) or swizzle
+ * (Align16) fields written.
+ */
+void
+brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+
+ if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
+ assert(reg.nr < 128);
+
+ /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
+ *
+ * "Accumulator registers may be accessed explicitly as src0
+ * operands only."
+ */
+ assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
+ reg.nr != BRW_ARF_ACCUMULATOR);
+
+ gen7_convert_mrf_to_grf(p, &reg);
+ /* Whatever is still an MRF after the conversion is rejected. */
+ assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
+
+ validate_reg(devinfo, inst, reg);
+
+ brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
+ brw_inst_set_src1_reg_type(devinfo, inst,
+ brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
+ brw_inst_set_src1_abs(devinfo, inst, reg.abs);
+ brw_inst_set_src1_negate(devinfo, inst, reg.negate);
+
+ /* Only src1 can be immediate in two-argument instructions.
+ */
+ assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);
+
+ if (reg.file == BRW_IMMEDIATE_VALUE) {
+ /* two-argument instructions can only use 32-bit immediates */
+ assert(type_sz(reg.type) < 8);
+ brw_inst_set_imm_ud(devinfo, inst, reg.ud);
+ } else {
+ /* This is a hardware restriction, which may or may not be lifted
+ * in the future:
+ */
+ assert (reg.address_mode == BRW_ADDRESS_DIRECT);
+ /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
+
+ brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
+ if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+ brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
+ } else {
+ brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
+ }
+
+ if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+ /* A width-1 operand of an exec-size-1 instruction is always encoded
+ * with all-zero strides and width 1, whatever the region says.
+ */
+ if (reg.width == BRW_WIDTH_1 &&
+ brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
+ brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
+ brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
+ brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
+ } else {
+ brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
+ brw_inst_set_src1_width(devinfo, inst, reg.width);
+ brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
+ }
+ } else {
+ brw_inst_set_src1_da16_swiz_x(devinfo, inst,
+ BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
+ brw_inst_set_src1_da16_swiz_y(devinfo, inst,
+ BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
+ brw_inst_set_src1_da16_swiz_z(devinfo, inst,
+ BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
+ brw_inst_set_src1_da16_swiz_w(devinfo, inst,
+ BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
+
+ /* This is an oddity of the fact we're using the same
+ * descriptions for registers in align_16 as align_1:
+ */
+ if (reg.vstride == BRW_VERTICAL_STRIDE_8)
+ brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
+ else
+ brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
+ }
+ }
+}
+
+/**
+ * Fill in the generic part of a SEND message descriptor (and, for real
+ * SEND/SENDC instructions, the shared function ID).
+ *
+ * \note The descriptor is first reset by encoding src1 as the immediate 0,
+ *       which also clears the Function Control bits.  Call this \b before
+ *       writing any message-specific fields; bits a caller leaves alone stay
+ *       zero.
+ */
+void
+brw_set_message_descriptor(struct brw_codegen *p,
+                           brw_inst *inst,
+                           enum brw_message_target sfid,
+                           unsigned msg_length,
+                           unsigned response_length,
+                           bool header_present,
+                           bool end_of_thread)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   brw_set_src1(p, inst, brw_imm_d(0));
+
+   switch (brw_inst_opcode(devinfo, inst)) {
+   case BRW_OPCODE_SEND:
+   case BRW_OPCODE_SENDC:
+      brw_inst_set_sfid(devinfo, inst, sfid);
+      break;
+   default:
+      /* For indirect sends, `inst` is not the SEND/SENDC itself but a MOV/OR
+       * into the address register.  The extended message descriptor bits
+       * belong on the later SEND/SENDC; setting them here would instead
+       * clobber the conditionalmod bits, so they are left alone.
+       */
+      break;
+   }
+
+   brw_inst_set_mlen(devinfo, inst, msg_length);
+   brw_inst_set_rlen(devinfo, inst, response_length);
+   brw_inst_set_eot(devinfo, inst, end_of_thread);
+
+   /* The header-present bit is only written on gen5+. */
+   if (devinfo->gen >= 5)
+      brw_inst_set_header_present(devinfo, inst, header_present);
+}
+
+/**
+ * Fill in the descriptor of a math-unit SEND.
+ *
+ * Message and response lengths are derived from \p function.  The
+ * instruction's saturate bit is copied into the message's saturate field and
+ * then cleared on the instruction itself.
+ */
+static void brw_set_math_message( struct brw_codegen *p,
+                                  brw_inst *inst,
+                                  unsigned function,
+                                  unsigned integer_type,
+                                  bool low_precision,
+                                  unsigned dataType )
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   /* POW and the integer divisions send two registers; the rest send one. */
+   const bool two_operands =
+      function == BRW_MATH_FUNCTION_POW ||
+      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
+      function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
+      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER;
+
+   /* SINCOS and quotient+remainder return two registers; the rest one. */
+   const bool two_results =
+      function == BRW_MATH_FUNCTION_SINCOS ||
+      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER;
+
+   const unsigned msg_length = two_operands ? 2 : 1;
+   const unsigned response_length = two_results ? 2 : 1;
+
+   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
+                              msg_length, response_length, false, false);
+
+   brw_inst_set_math_msg_function(devinfo, inst, function);
+   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
+   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
+   /* Move saturation from the instruction into the message. */
+   brw_inst_set_math_msg_saturate(devinfo, inst,
+                                  brw_inst_saturate(devinfo, inst));
+   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
+   brw_inst_set_saturate(devinfo, inst, 0);
+}
+
+
+/**
+ * Fill in the descriptor of an FF_SYNC message to the URB shared function:
+ * a one-register message with a header, URB opcode 1, and every URB field
+ * that FF_SYNC does not use set to zero.
+ */
+static void brw_set_ff_sync_message(struct brw_codegen *p,
+                                    brw_inst *insn,
+                                    bool allocate,
+                                    unsigned response_length,
+                                    bool end_of_thread)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const unsigned ff_sync_opcode = 1;
+   const unsigned msg_length = 1;
+   const bool header_present = true;
+
+   brw_set_message_descriptor(p, insn, BRW_SFID_URB, msg_length,
+                              response_length, header_present, end_of_thread);
+
+   brw_inst_set_urb_opcode(devinfo, insn, ff_sync_opcode);
+   brw_inst_set_urb_allocate(devinfo, insn, allocate);
+
+   /* Not used by FF_SYNC; written as zero. */
+   brw_inst_set_urb_global_offset(devinfo, insn, 0);
+   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
+   brw_inst_set_urb_used(devinfo, insn, 0);
+   brw_inst_set_urb_complete(devinfo, insn, 0);
+}
+
+/**
+ * Fill in the descriptor of a URB write message.
+ *
+ * \p flags selects OWORD vs. HWORD writes, end-of-thread, and the
+ * generation-specific complete/allocate/used/per-slot-offset bits.  Flag and
+ * swizzle combinations that are not accepted on the current generation are
+ * rejected with assertions.
+ */
+static void brw_set_urb_message( struct brw_codegen *p,
+                                 brw_inst *insn,
+                                 enum brw_urb_write_flags flags,
+                                 unsigned msg_length,
+                                 unsigned response_length,
+                                 unsigned offset,
+                                 unsigned swizzle_control )
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool gen7_plus = devinfo->gen >= 7;
+
+   /* Transpose swizzling and the ALLOCATE flag are pre-gen7 only;
+    * per-slot offsets are gen7+ only.
+    */
+   assert(!gen7_plus || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
+   assert(!gen7_plus || !(flags & BRW_URB_WRITE_ALLOCATE));
+   assert(gen7_plus || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
+
+   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
+                              msg_length, response_length, true,
+                              flags & BRW_URB_WRITE_EOT);
+
+   if (!(flags & BRW_URB_WRITE_OWORD)) {
+      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
+   } else {
+      /* header + one OWORD of data */
+      assert(msg_length == 2);
+      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
+   }
+
+   brw_inst_set_urb_global_offset(devinfo, insn, offset);
+   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
+
+   /* The "complete" bit is only written before gen8. */
+   if (devinfo->gen < 8)
+      brw_inst_set_urb_complete(devinfo, insn,
+                                !!(flags & BRW_URB_WRITE_COMPLETE));
+
+   if (gen7_plus) {
+      brw_inst_set_urb_per_slot_offset(devinfo, insn,
+         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
+   } else {
+      brw_inst_set_urb_allocate(devinfo, insn,
+                                !!(flags & BRW_URB_WRITE_ALLOCATE));
+      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
+   }
+}
+
+/**
+ * Fill in the descriptor of a data-port write message.
+ *
+ * On gen6+ the message is sent to \p target_cache; earlier generations
+ * always use BRW_SFID_DATAPORT_WRITE.  The commit bit is only written
+ * before gen7.
+ */
+void
+brw_set_dp_write_message(struct brw_codegen *p,
+                         brw_inst *insn,
+                         unsigned binding_table_index,
+                         unsigned msg_control,
+                         unsigned msg_type,
+                         unsigned target_cache,
+                         unsigned msg_length,
+                         bool header_present,
+                         unsigned last_render_target,
+                         unsigned response_length,
+                         unsigned end_of_thread,
+                         unsigned send_commit_msg)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   unsigned sfid = BRW_SFID_DATAPORT_WRITE;
+   if (devinfo->gen >= 6)
+      sfid = target_cache;
+
+   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
+                              header_present, end_of_thread);
+
+   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
+   brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
+   brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
+   brw_inst_set_rt_last(devinfo, insn, last_render_target);
+
+   if (devinfo->gen >= 7)
+      return;
+
+   brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
+}
+
+/**
+ * Fill in the descriptor of a data-port read message.
+ *
+ * On gen6+ \p target_cache selects the shared function the message is sent
+ * to; on earlier generations the message goes to BRW_SFID_DATAPORT_READ and
+ * \p target_cache is written into the descriptor's target-cache field
+ * instead.
+ */
+void
+brw_set_dp_read_message(struct brw_codegen *p,
+                        brw_inst *insn,
+                        unsigned binding_table_index,
+                        unsigned msg_control,
+                        unsigned msg_type,
+                        unsigned target_cache,
+                        unsigned msg_length,
+                        bool header_present,
+                        unsigned response_length)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool pre_gen6 = devinfo->gen < 6;
+
+   unsigned sfid = target_cache;
+   if (pre_gen6)
+      sfid = BRW_SFID_DATAPORT_READ;
+
+   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
+                              header_present, false);
+
+   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
+   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
+   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
+
+   if (pre_gen6)
+      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
+}
+
+/**
+ * Fill in the descriptor of a sampler message.
+ *
+ * \p simd_mode is only written on gen5+, and \p return_format only on gen4
+ * parts that are not G4X; on G4X neither of the two is written.
+ */
+void
+brw_set_sampler_message(struct brw_codegen *p,
+                        brw_inst *inst,
+                        unsigned binding_table_index,
+                        unsigned sampler,
+                        unsigned msg_type,
+                        unsigned response_length,
+                        unsigned msg_length,
+                        unsigned header_present,
+                        unsigned simd_mode,
+                        unsigned return_format)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
+                              response_length, header_present, false);
+
+   brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
+   brw_inst_set_sampler(devinfo, inst, sampler);
+   brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
+
+   if (devinfo->gen >= 5) {
+      brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
+      return;
+   }
+
+   const bool original_gen4 = devinfo->gen == 4 && !devinfo->is_g4x;
+   if (original_gen4)
+      brw_inst_set_sampler_return_format(devinfo, inst, return_format);
+}
+
+/**
+ * Fill in the descriptor of a scratch block read/write message to the
+ * GEN7_SFID_DATAPORT_DATA_CACHE shared function.
+ *
+ * \p num_regs must be 1, 2 or 4 (8 is additionally allowed on gen8+).  The
+ * block-size field is written as log2(num_regs) on gen8+ and as num_regs - 1
+ * on earlier generations.
+ */
+static void
+gen7_set_dp_scratch_message(struct brw_codegen *p,
+                            brw_inst *inst,
+                            bool write,
+                            bool dword,
+                            bool invalidate_after_read,
+                            unsigned num_regs,
+                            unsigned addr_offset,
+                            unsigned mlen,
+                            unsigned rlen,
+                            bool header_present)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
+          (devinfo->gen >= 8 && num_regs == 8));
+
+   unsigned block_size;
+   if (devinfo->gen >= 8)
+      block_size = _mesa_logbase2(num_regs);
+   else
+      block_size = num_regs - 1;
+
+   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
+                              mlen, rlen, header_present, false);
+
+   /* Category 1: Scratch Block Read/Write msgs */
+   brw_inst_set_dp_category(devinfo, inst, 1);
+   brw_inst_set_scratch_read_write(devinfo, inst, write);
+   brw_inst_set_scratch_type(devinfo, inst, dword);
+   brw_inst_set_scratch_invalidate_after_read(devinfo, inst,
+                                              invalidate_after_read);
+   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
+   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
+}
+
+/* Short local alias used by the emitters in this file. */
+#define next_insn brw_next_insn
+
+/**
+ * Append a new instruction to the program store and return it.
+ *
+ * The store is doubled when the new instruction would not fit.  The new
+ * slot starts as a copy of p->current (the default instruction state) with
+ * its opcode set to \p opcode.  p->next_insn_offset advances by 16 per
+ * instruction (presumably the byte size of one uncompacted instruction).
+ *
+ * Because the store may be reallocated, pointers to instructions obtained
+ * earlier from p->store can be invalidated by this call.
+ */
+brw_inst *
+brw_next_insn(struct brw_codegen *p, unsigned opcode)
+{
+   if (p->store_size < p->nr_insn + 1) {
+      p->store_size *= 2;
+      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
+   }
+
+   brw_inst *const slot = &p->store[p->nr_insn];
+   p->nr_insn++;
+   p->next_insn_offset += 16;
+
+   /* Seed the slot with the current default state, then stamp the opcode. */
+   memcpy(slot, p->current, sizeof(*slot));
+   brw_inst_set_opcode(p->devinfo, slot, opcode);
+
+   return slot;
+}
+
+/**
+ * Emit a one-source ALU instruction: allocate it, then encode the
+ * destination followed by the single source operand.
+ */
+static brw_inst *
+brw_alu1(struct brw_codegen *p, unsigned opcode,
+         struct brw_reg dest, struct brw_reg src)
+{
+   brw_inst *const inst = next_insn(p, opcode);
+
+   brw_set_dest(p, inst, dest);
+   brw_set_src0(p, inst, src);
+
+   return inst;
+}
+
+/**
+ * Emit a two-source ALU instruction: allocate it, then encode the
+ * destination, src0 and src1 in that order.
+ *
+ * Immediate sources wider than 4 bytes are rejected, since 64-bit
+ * immediates are only supported on one-source instructions.
+ */
+static brw_inst *
+brw_alu2(struct brw_codegen *p, unsigned opcode,
+         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
+{
+   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
+   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
+
+   brw_inst *const inst = next_insn(p, opcode);
+
+   brw_set_dest(p, inst, dest);
+   brw_set_src0(p, inst, src0);
+   brw_set_src1(p, inst, src1);
+
+   return inst;
+}
+
+/**
+ * Convert a register's subregister number into the form used by
+ * three-source instructions.
+ *
+ * SubRegNum is normally in bytes (0..31), whereas three-source instructions
+ * count in 32-bit units (components 0..7), so the value is divided by 4.
+ */
+static int
+get_3src_subreg_nr(struct brw_reg reg)
+{
+   const int dword_index = reg.subnr / 4;
+
+   return dword_index;
+}
+
+/**
+ * Emit a three-source ALU instruction.
+ *
+ * The instruction must be in Align16 mode.  The destination has to be a
+ * directly addressed GRF (or MRF) register of type F, DF, D or UD, and all
+ * three sources have to be directly addressed GRF registers.  Returns the
+ * new instruction.
+ */
+static brw_inst *
+brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
+         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *insn = next_insn(p, opcode);
+
+   gen7_convert_mrf_to_grf(p, &dest);
+
+   assert(brw_inst_access_mode(devinfo, insn) == BRW_ALIGN_16);
+
+   /* --- Destination --- */
+   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
+          dest.file == BRW_MESSAGE_REGISTER_FILE);
+   assert(dest.nr < 128);
+   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
+   assert(dest.type == BRW_REGISTER_TYPE_F ||
+          dest.type == BRW_REGISTER_TYPE_DF ||
+          dest.type == BRW_REGISTER_TYPE_D ||
+          dest.type == BRW_REGISTER_TYPE_UD);
+
+   /* Only on gen6 is the destination register file field written: it is
+    * set when the destination is a message register.
+    */
+   if (devinfo->gen == 6) {
+      brw_inst_set_3src_dst_reg_file(devinfo, insn,
+                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
+   }
+   brw_inst_set_3src_dst_reg_nr(devinfo, insn, dest.nr);
+   brw_inst_set_3src_dst_subreg_nr(devinfo, insn, dest.subnr / 16);
+   brw_inst_set_3src_dst_writemask(devinfo, insn, dest.writemask);
+
+   /* --- Source 0 --- */
+   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
+   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
+   assert(src0.nr < 128);
+   brw_inst_set_3src_src0_swizzle(devinfo, insn, src0.swizzle);
+   brw_inst_set_3src_src0_subreg_nr(devinfo, insn, get_3src_subreg_nr(src0));
+   brw_inst_set_3src_src0_reg_nr(devinfo, insn, src0.nr);
+   brw_inst_set_3src_src0_abs(devinfo, insn, src0.abs);
+   brw_inst_set_3src_src0_negate(devinfo, insn, src0.negate);
+   /* A zero vertical stride is expressed through the rep_ctrl field. */
+   brw_inst_set_3src_src0_rep_ctrl(devinfo, insn,
+                                   src0.vstride == BRW_VERTICAL_STRIDE_0);
+
+   /* --- Source 1 --- */
+   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
+   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
+   assert(src1.nr < 128);
+   brw_inst_set_3src_src1_swizzle(devinfo, insn, src1.swizzle);
+   brw_inst_set_3src_src1_subreg_nr(devinfo, insn, get_3src_subreg_nr(src1));
+   brw_inst_set_3src_src1_reg_nr(devinfo, insn, src1.nr);
+   brw_inst_set_3src_src1_abs(devinfo, insn, src1.abs);
+   brw_inst_set_3src_src1_negate(devinfo, insn, src1.negate);
+   brw_inst_set_3src_src1_rep_ctrl(devinfo, insn,
+                                   src1.vstride == BRW_VERTICAL_STRIDE_0);
+
+   /* --- Source 2 --- */
+   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
+   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
+   assert(src2.nr < 128);
+   brw_inst_set_3src_src2_swizzle(devinfo, insn, src2.swizzle);
+   brw_inst_set_3src_src2_subreg_nr(devinfo, insn, get_3src_subreg_nr(src2));
+   brw_inst_set_3src_src2_reg_nr(devinfo, insn, src2.nr);
+   brw_inst_set_3src_src2_abs(devinfo, insn, src2.abs);
+   brw_inst_set_3src_src2_negate(devinfo, insn, src2.negate);
+   brw_inst_set_3src_src2_rep_ctrl(devinfo, insn,
+                                   src2.vstride == BRW_VERTICAL_STRIDE_0);
+
+   if (devinfo->gen >= 7) {
+      /* The source and destination type fields are both derived from
+       * dest.type; the types of the source registers are ignored.  The MAD
+       * and LRP emitters already guarantee that all four types are float.
+       * The BFE and BFI2 emitters may hand us a mix of D and UD and expect
+       * the destination type to win.
+       */
+      unsigned hw_type;
+
+      switch (dest.type) {
+      case BRW_REGISTER_TYPE_F:
+         hw_type = BRW_3SRC_TYPE_F;
+         break;
+      case BRW_REGISTER_TYPE_DF:
+         hw_type = BRW_3SRC_TYPE_DF;
+         break;
+      case BRW_REGISTER_TYPE_D:
+         hw_type = BRW_3SRC_TYPE_D;
+         break;
+      case BRW_REGISTER_TYPE_UD:
+         hw_type = BRW_3SRC_TYPE_UD;
+         break;
+      default:
+         unreachable("not reached");
+      }
+
+      brw_inst_set_3src_src_type(devinfo, insn, hw_type);
+      brw_inst_set_3src_dst_type(devinfo, insn, hw_type);
+   }
+
+   return insn;
+}
+
+
+/***********************************************************************
+ * Convenience routines.
+ */
+/* Defines brw_<OP>(): a public one-source emitter forwarding to brw_alu1(). */
+#define ALU1(OP)                                                \
+brw_inst *brw_##OP(struct brw_codegen *p,                       \
+                   struct brw_reg dest,                         \
+                   struct brw_reg src0)                         \
+{                                                               \
+   brw_inst *inst = brw_alu1(p, BRW_OPCODE_##OP, dest, src0);   \
+   return inst;                                                 \
+}
+
+/* Defines brw_<OP>(): a public two-source emitter forwarding to brw_alu2(). */
+#define ALU2(OP)                                                      \
+brw_inst *brw_##OP(struct brw_codegen *p,                             \
+                   struct brw_reg dest,                               \
+                   struct brw_reg src0,                               \
+                   struct brw_reg src1)                               \
+{                                                                     \
+   brw_inst *inst = brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
+   return inst;                                                       \
+}
+
+/* Defines brw_<OP>(): a public three-source emitter forwarding to
+ * brw_alu3() without any additional type checking.
+ */
+#define ALU3(OP)                                                            \
+brw_inst *brw_##OP(struct brw_codegen *p,                                   \
+                   struct brw_reg dest,                                     \
+                   struct brw_reg src0,                                     \
+                   struct brw_reg src1,                                     \
+                   struct brw_reg src2)                                     \
+{                                                                           \
+   brw_inst *inst = brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
+   return inst;                                                             \
+}
+
+/* Defines brw_<OP>(): a three-source emitter for floating-point only
+ * operations.  The destination must be F or DF and every source must have
+ * the same type as the destination; the work is then done by brw_alu3().
+ */
+#define ALU3F(OP)                                               \
+brw_inst *brw_##OP(struct brw_codegen *p,                       \
+                   struct brw_reg dest,                         \
+                   struct brw_reg src0,                         \
+                   struct brw_reg src1,                         \
+                   struct brw_reg src2)                         \
+{                                                               \
+   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
+          dest.type == BRW_REGISTER_TYPE_DF);                   \
+   switch (dest.type) {                                         \
+   case BRW_REGISTER_TYPE_F:                                    \
+      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
+      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
+      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
+      break;                                                    \
+   case BRW_REGISTER_TYPE_DF:                                   \
+      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
+      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
+      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
+      break;                                                    \
+   default:                                                     \
+      break;                                                    \
+   }                                                            \
+   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
+}
+
+/* Defines brw_<OP>() for rounding operations other than RNDD.
+ *
+ * These need two instructions: the first writes a rounded value (possibly
+ * rounded the wrong way) to the destination and also sets a per-channel
+ * "increment bit" in the flag register.  A predicated ADD of 1.0 then fixes
+ * up the destination so that it holds the desired result.
+ *
+ * Sandybridge and later appear to round correctly without the ADD, so the
+ * fix-up is only emitted before gen6.
+ */
+#define ROUND(OP)                                                       \
+void brw_##OP(struct brw_codegen *p,                                    \
+              struct brw_reg dest,                                      \
+              struct brw_reg src)                                       \
+{                                                                       \
+   const struct gen_device_info *devinfo = p->devinfo;                  \
+   brw_inst *round_inst = next_insn(p, BRW_OPCODE_##OP);                \
+                                                                        \
+   brw_set_dest(p, round_inst, dest);                                   \
+   brw_set_src0(p, round_inst, src);                                    \
+                                                                        \
+   if (devinfo->gen < 6) {                                              \
+      brw_inst *fixup;                                                  \
+                                                                        \
+      /* turn on round-increments */                                    \
+      brw_inst_set_cond_modifier(devinfo, round_inst,                   \
+                                 BRW_CONDITIONAL_R);                    \
+      fixup = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                  \
+      brw_inst_set_pred_control(devinfo, fixup, BRW_PREDICATE_NORMAL);  \
+   }                                                                    \
+}
+
+
+/* One-source emitters. */
+ALU1(MOV)
+ALU1(NOT)
+ALU1(DIM)
+ALU1(FRC)
+ALU1(RNDD)
+ALU1(LZD)
+ALU1(BFREV)
+ALU1(FBH)
+ALU1(FBL)
+ALU1(CBIT)
+
+/* Two-source emitters. */
+ALU2(SEL)
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(SHR)
+ALU2(SHL)
+ALU2(ASR)
+ALU2(MAC)
+ALU2(MACH)
+ALU2(DP4)
+ALU2(DPH)
+ALU2(DP3)
+ALU2(DP2)
+ALU2(BFI1)
+ALU2(ADDC)
+ALU2(SUBB)
+
+/* Three-source emitters restricted to F/DF operands. */
+ALU3F(MAD)
+ALU3F(LRP)
+
+/* Three-source emitters without operand type checks. */
+ALU3(BFE)
+ALU3(BFI2)
+
+/* Rounding emitters (with the pre-gen6 increment fix-up). */
+ROUND(RNDZ)
+ROUND(RNDE)
+
+
+/**
+ * Emit an ADD instruction.
+ *
+ * Checks that a float (or immediate vector-float) operand is never paired
+ * with a D/UD operand before handing off to brw_alu2().
+ */
+brw_inst *
+brw_ADD(struct brw_codegen *p, struct brw_reg dest,
+        struct brw_reg src0, struct brw_reg src1)
+{
+   /* 6.2.2: add */
+   const bool src0_is_float =
+      src0.type == BRW_REGISTER_TYPE_F ||
+      (src0.file == BRW_IMMEDIATE_VALUE &&
+       src0.type == BRW_REGISTER_TYPE_VF);
+   const bool src1_is_float =
+      src1.type == BRW_REGISTER_TYPE_F ||
+      (src1.file == BRW_IMMEDIATE_VALUE &&
+       src1.type == BRW_REGISTER_TYPE_VF);
+
+   if (src0_is_float) {
+      assert(src1.type != BRW_REGISTER_TYPE_UD);
+      assert(src1.type != BRW_REGISTER_TYPE_D);
+   }
+
+   if (src1_is_float) {
+      assert(src0.type != BRW_REGISTER_TYPE_UD);
+      assert(src0.type != BRW_REGISTER_TYPE_D);
+   }
+
+   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
+}
+
+/**
+ * Emit an AVG instruction.
+ *
+ * The destination and both sources must share one type, and that type must
+ * be one of B, UB, W, UW, D or UD.
+ */
+brw_inst *
+brw_AVG(struct brw_codegen *p, struct brw_reg dest,
+        struct brw_reg src0, struct brw_reg src1)
+{
+   assert(dest.type == src0.type);
+   assert(src0.type == src1.type);
+
+   const bool type_ok = src0.type == BRW_REGISTER_TYPE_B ||
+                        src0.type == BRW_REGISTER_TYPE_UB ||
+                        src0.type == BRW_REGISTER_TYPE_W ||
+                        src0.type == BRW_REGISTER_TYPE_UW ||
+                        src0.type == BRW_REGISTER_TYPE_D ||
+                        src0.type == BRW_REGISTER_TYPE_UD;
+
+   if (!type_ok)
+      unreachable("Bad type for brw_AVG");
+
+   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
+}
+
+/**
+ * Emit a MUL instruction.
+ *
+ * Asserts the operand restrictions checked here: a D/UD source cannot feed
+ * a float destination, a float (or immediate vector-float) source cannot be
+ * combined with a D/UD source, and neither source may be the accumulator.
+ */
+brw_inst *
+brw_MUL(struct brw_codegen *p, struct brw_reg dest,
+        struct brw_reg src0, struct brw_reg src1)
+{
+   /* 6.32.38: mul */
+   const bool src0_is_dword = src0.type == BRW_REGISTER_TYPE_D ||
+                              src0.type == BRW_REGISTER_TYPE_UD;
+   const bool src1_is_dword = src1.type == BRW_REGISTER_TYPE_D ||
+                              src1.type == BRW_REGISTER_TYPE_UD;
+   const bool src0_is_float =
+      src0.type == BRW_REGISTER_TYPE_F ||
+      (src0.file == BRW_IMMEDIATE_VALUE &&
+       src0.type == BRW_REGISTER_TYPE_VF);
+   const bool src1_is_float =
+      src1.type == BRW_REGISTER_TYPE_F ||
+      (src1.file == BRW_IMMEDIATE_VALUE &&
+       src1.type == BRW_REGISTER_TYPE_VF);
+
+   if (src0_is_dword || src1_is_dword) {
+      assert(dest.type != BRW_REGISTER_TYPE_F);
+   }
+
+   if (src0_is_float) {
+      assert(src1.type != BRW_REGISTER_TYPE_UD);
+      assert(src1.type != BRW_REGISTER_TYPE_D);
+   }
+
+   if (src1_is_float) {
+      assert(src0.type != BRW_REGISTER_TYPE_UD);
+      assert(src0.type != BRW_REGISTER_TYPE_D);
+   }
+
+   /* The accumulator is not accepted as an explicit source. */
+   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
+          src0.nr != BRW_ARF_ACCUMULATOR);
+   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
+          src1.nr != BRW_ARF_ACCUMULATOR);
+
+   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
+}
+
+/**
+ * Emit a LINE instruction.
+ *
+ * src0's region is overridden to <0;1,0> before the instruction is emitted.
+ */
+brw_inst *
+brw_LINE(struct brw_codegen *p, struct brw_reg dest,
+         struct brw_reg src0, struct brw_reg src1)
+{
+   /* Force a <0;1,0> region on the first source. */
+   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
+   src0.width = BRW_WIDTH_1;
+   src0.vstride = BRW_VERTICAL_STRIDE_0;
+
+   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
+}
+
+/**
+ * Emit a PLN instruction.
+ *
+ * The source regions are overridden before emission: src0 becomes <0;1,0>
+ * and src1 becomes <8;8,1>.
+ */
+brw_inst *
+brw_PLN(struct brw_codegen *p, struct brw_reg dest,
+        struct brw_reg src0, struct brw_reg src1)
+{
+   /* First source: <0;1,0>. */
+   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
+   src0.width = BRW_WIDTH_1;
+   src0.vstride = BRW_VERTICAL_STRIDE_0;
+
+   /* Second source: <8;8,1>. */
+   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
+   src1.width = BRW_WIDTH_8;
+   src1.vstride = BRW_VERTICAL_STRIDE_8;
+
+   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
+}
+
+/**
+ * Emit a float32 -> float16 conversion.
+ *
+ * On gen8+ this is a MOV to an HF-typed destination; on gen7 it is the
+ * F32TO16 instruction.  When the destination is UD and the hardware will
+ * not clear the upper 16 bits by itself, a second MOV writes zero to the
+ * high word.  Returns the last instruction emitted.
+ */
+brw_inst *
+brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool align16 =
+      brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
+   /* F32TO16 does not support 32-bit destination types in Align1 mode, and
+    * neither does the gen8 implementation built on a converting MOV.  Gen7
+    * zeroes the high 16 bits in Align16 mode as an undocumented feature.
+    */
+   const bool zero_fill_high_word =
+      dst.type == BRW_REGISTER_TYPE_UD && (devinfo->gen >= 8 || !align16);
+   brw_inst *last;
+
+   if (!align16) {
+      assert(dst.type == BRW_REGISTER_TYPE_UD ||
+             dst.type == BRW_REGISTER_TYPE_W ||
+             dst.type == BRW_REGISTER_TYPE_UW ||
+             dst.type == BRW_REGISTER_TYPE_HF);
+   } else {
+      assert(dst.type == BRW_REGISTER_TYPE_UD);
+   }
+
+   brw_push_insn_state(p);
+
+   if (zero_fill_high_word) {
+      /* View the UD destination as words with a stride of two, so the
+       * conversion writes the low word of every dword.
+       */
+      brw_set_default_access_mode(p, BRW_ALIGN_1);
+      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
+   }
+
+   if (devinfo->gen < 8) {
+      assert(devinfo->gen == 7);
+      last = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
+   } else {
+      last = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
+   }
+
+   if (zero_fill_high_word) {
+      /* Write zero to the word following each converted value. */
+      brw_inst_set_no_dd_clear(devinfo, last, true);
+      last = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
+      brw_inst_set_no_dd_check(devinfo, last, true);
+   }
+
+   brw_pop_insn_state(p);
+
+   return last;
+}
+
+/**
+ * Emit a float16 -> float32 conversion.
+ *
+ * On gen8+ this is a MOV from an HF-typed source; on gen7 it is the
+ * F16TO32 instruction.  Returns the emitted instruction.
+ */
+brw_inst *
+brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool align16 =
+      brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
+
+   if (!align16) {
+      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
+       *
+       *   Because this instruction does not have a 16-bit floating-point
+       *   type, the source data type must be Word (W). The destination type
+       *   must be F (Float).
+       *
+       * A UD source is therefore reinterpreted as words with a stride of
+       * two.
+       */
+      if (src.type == BRW_REGISTER_TYPE_UD)
+         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
+
+      assert(src.type == BRW_REGISTER_TYPE_W ||
+             src.type == BRW_REGISTER_TYPE_UW ||
+             src.type == BRW_REGISTER_TYPE_HF);
+   } else {
+      assert(src.type == BRW_REGISTER_TYPE_UD);
+   }
+
+   if (devinfo->gen < 8) {
+      assert(devinfo->gen == 7);
+      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
+   }
+
+   return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
+}
+
+
+/**
+ * Emit a NOP: the new instruction is cleared to all zeroes and then only
+ * its opcode field is written.
+ */
+void brw_NOP(struct brw_codegen *p)
+{
+   brw_inst *nop = next_insn(p, BRW_OPCODE_NOP);
+
+   /* Discard whatever next_insn() put into the slot. */
+   memset(nop, 0, sizeof(*nop));
+   brw_inst_set_opcode(p->devinfo, nop, BRW_OPCODE_NOP);
+}
+
+
+
+
+
+/***********************************************************************
+ * Comparisons, if/else/endif
+ */
+
+/**
+ * Emit a JMPI instruction: "ip = ip + index", with exec size 2, no
+ * compression, the execution mask disabled and the given predicate control.
+ */
+brw_inst *
+brw_JMPI(struct brw_codegen *p, struct brw_reg index,
+         unsigned predicate_control)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const struct brw_reg ip_reg = brw_ip_reg();
+   brw_inst *jmp = brw_alu2(p, BRW_OPCODE_JMPI, ip_reg, ip_reg, index);
+
+   brw_inst_set_exec_size(devinfo, jmp, BRW_EXECUTE_2);
+   brw_inst_set_qtr_control(devinfo, jmp, BRW_COMPRESSION_NONE);
+   brw_inst_set_mask_control(devinfo, jmp, BRW_MASK_DISABLE);
+   brw_inst_set_pred_control(devinfo, jmp, predicate_control);
+
+   return jmp;
+}
+
+/**
+ * Record \p inst on the IF stack.
+ *
+ * The stack stores the instruction's index into p->store rather than its
+ * address.  The backing array is doubled once the new depth reaches its
+ * current size.
+ */
+static void
+push_if_stack(struct brw_codegen *p, brw_inst *inst)
+{
+   p->if_stack[p->if_stack_depth++] = inst - p->store;
+
+   if (p->if_stack_depth >= p->if_stack_array_size) {
+      p->if_stack_array_size *= 2;
+      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
+                             p->if_stack_array_size);
+   }
+}
+
+/**
+ * Pop the top entry of the IF stack and return a pointer to the matching
+ * instruction inside the current p->store.
+ */
+static brw_inst *
+pop_if_stack(struct brw_codegen *p)
+{
+   --p->if_stack_depth;
+
+   return p->store + p->if_stack[p->if_stack_depth];
+}
+
+/**
+ * Record \p inst as the start of a new innermost loop.
+ *
+ * The loop stack stores the instruction's index into p->store.  Both the
+ * loop stack and the parallel if_depth_in_loop array are grown together,
+ * and the IF depth for the new loop level is reset to zero.
+ */
+static void
+push_loop_stack(struct brw_codegen *p, brw_inst *inst)
+{
+   /* if_depth_in_loop is indexed with the depth *after* the push, so room
+    * is needed for one slot beyond the new top of stack.
+    */
+   if ((p->loop_stack_depth + 1) >= p->loop_stack_array_size) {
+      p->loop_stack_array_size *= 2;
+      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
+                               p->loop_stack_array_size);
+      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
+                                     p->loop_stack_array_size);
+   }
+
+   p->loop_stack[p->loop_stack_depth++] = inst - p->store;
+   p->if_depth_in_loop[p->loop_stack_depth] = 0;
+}
+
+/**
+ * Return the instruction recorded for the innermost open loop, i.e. the
+ * top entry of the loop stack, without popping it.
+ */
+static brw_inst *
+get_inner_do_insn(struct brw_codegen *p)
+{
+   return p->store + p->loop_stack[p->loop_stack_depth - 1];
+}
+
+/* Emit an IF instruction.
+ *
+ * The EU takes the value from the flag register and pushes it onto some
+ * sort of a stack (presumably merging it with any flag value that is
+ * already there).  Inside an if block the flags at the top of the stack
+ * control execution on each channel of the unit, e.g. on each of the 16
+ * pixel values in our wm programs.
+ *
+ * When the matching 'else' instruction is reached (presumably by countdown
+ * of the instruction count patched in by our ELSE/ENDIF functions), the
+ * relevant flags are inverted.
+ *
+ * When the matching 'endif' instruction is reached, the flags are popped
+ * off.  If the stack is now empty, normal execution resumes.
+ *
+ * The jump targets are left at zero here.  The instruction is pushed on
+ * the IF stack and the IF depth of the current loop level is incremented.
+ */
+brw_inst *
+brw_IF(struct brw_codegen *p, unsigned execute_size)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const struct brw_reg null_d =
+      vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_inst *if_inst = next_insn(p, BRW_OPCODE_IF);
+
+   /* Override the defaults for this instruction; the operand layout
+    * depends on the hardware generation.
+    */
+   if (devinfo->gen >= 8) {
+      brw_set_dest(p, if_inst, null_d);
+      brw_set_src0(p, if_inst, brw_imm_d(0));
+      brw_inst_set_jip(devinfo, if_inst, 0);
+      brw_inst_set_uip(devinfo, if_inst, 0);
+   } else if (devinfo->gen == 7) {
+      brw_set_dest(p, if_inst, null_d);
+      brw_set_src0(p, if_inst, null_d);
+      brw_set_src1(p, if_inst, brw_imm_w(0));
+      brw_inst_set_jip(devinfo, if_inst, 0);
+      brw_inst_set_uip(devinfo, if_inst, 0);
+   } else if (devinfo->gen == 6) {
+      brw_set_dest(p, if_inst, brw_imm_w(0));
+      brw_inst_set_gen6_jump_count(devinfo, if_inst, 0);
+      brw_set_src0(p, if_inst, null_d);
+      brw_set_src1(p, if_inst, null_d);
+   } else {
+      brw_set_dest(p, if_inst, brw_ip_reg());
+      brw_set_src0(p, if_inst, brw_ip_reg());
+      brw_set_src1(p, if_inst, brw_imm_d(0x0));
+   }
+
+   brw_inst_set_exec_size(devinfo, if_inst, execute_size);
+   brw_inst_set_qtr_control(devinfo, if_inst, BRW_COMPRESSION_NONE);
+   brw_inst_set_pred_control(devinfo, if_inst, BRW_PREDICATE_NORMAL);
+   brw_inst_set_mask_control(devinfo, if_inst, BRW_MASK_ENABLE);
+   if (devinfo->gen < 6 && !p->single_program_flow)
+      brw_inst_set_thread_control(devinfo, if_inst, BRW_THREAD_SWITCH);
+
+   push_if_stack(p, if_inst);
+   p->if_depth_in_loop[p->loop_stack_depth]++;
+
+   return if_inst;
+}
+
+/* Emit a gen6-style IF instruction with an embedded comparison
+ * (conditional modifier) of src0 and src1.  It is not used on gen7.
+ *
+ * The execution size is copied from the current default instruction and
+ * the jump count is left at zero.  The instruction is pushed on the IF
+ * stack.
+ *
+ * NOTE(review): unlike brw_IF(), this does not increment
+ * p->if_depth_in_loop[], although brw_ENDIF() decrements it
+ * unconditionally.  In the code visible here the counter is only read on
+ * gen < 6 (brw_BREAK()/brw_CONT()), so this is presumably harmless --
+ * confirm.
+ */
+brw_inst *
+gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
+        struct brw_reg src0, struct brw_reg src1)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *if_inst = next_insn(p, BRW_OPCODE_IF);
+
+   brw_set_dest(p, if_inst, brw_imm_w(0));
+   brw_inst_set_exec_size(devinfo, if_inst,
+                          brw_inst_exec_size(devinfo, p->current));
+   brw_inst_set_gen6_jump_count(devinfo, if_inst, 0);
+   brw_set_src0(p, if_inst, src0);
+   brw_set_src1(p, if_inst, src1);
+
+   /* The defaults must not request compression or predication. */
+   assert(brw_inst_qtr_control(devinfo, if_inst) == BRW_COMPRESSION_NONE);
+   assert(brw_inst_pred_control(devinfo, if_inst) == BRW_PREDICATE_NONE);
+   brw_inst_set_cond_modifier(devinfo, if_inst, conditional);
+
+   push_if_stack(p, if_inst);
+
+   return if_inst;
+}
+
+/**
+ * In single-program-flow (SPF) mode, rewrite an IF (and optional ELSE)
+ * into ADD instructions.
+ *
+ * \p else_inst may be NULL when the block has no ELSE.  The ADD immediates
+ * are distances in bytes, 16 per instruction.
+ */
+static void
+convert_IF_ELSE_to_ADD(struct brw_codegen *p,
+                       brw_inst *if_inst, brw_inst *else_inst)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   /* Where the ENDIF would be, if one were emitted: the next free slot. */
+   brw_inst *endif_slot = &p->store[p->nr_insn];
+
+   assert(p->single_program_flow);
+   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
+   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
+   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);
+
+   /* The IF becomes an ADD that moves the instruction pointer to the first
+    * instruction of the ELSE block or, without an ELSE, to where the ENDIF
+    * would be.  Its predicate is reversed.
+    *
+    * No ENDIF needs to be executed: there are no stack operations to
+    * perform, and if we are currently executing we simply want to carry
+    * on.
+    */
+   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
+   brw_inst_set_pred_inv(devinfo, if_inst, true);
+
+   if (else_inst == NULL) {
+      brw_inst_set_imm_ud(devinfo, if_inst, (endif_slot - if_inst) * 16);
+      return;
+   }
+
+   /* The ELSE becomes an ADD that jumps to where the ENDIF would be. */
+   brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);
+
+   brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
+   brw_inst_set_imm_ud(devinfo, else_inst, (endif_slot - else_inst) * 16);
+}
+
+/**
+ * Patch IF and ELSE instructions with appropriate jump targets.
+ *
+ * \p else_inst may be NULL when the block has no ELSE.  The ENDIF (and the
+ * ELSE, if present) also inherit the IF's execution size.
+ */
+static void
+patch_IF_ELSE(struct brw_codegen *p,
+              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   /* IF and ELSE must not be patched in single program flow mode when
+    * gen < 6: on those platforms SPF control flow is converted to
+    * conditional ADDs on IP instead (see brw_ENDIF).
+    *
+    * On Gen6, writing to IP doesn't work in single program flow mode (see
+    * the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
+    * not be updated by non-flow control instructions.").  On later
+    * platforms there is no significant benefit to converting control flow
+    * instructions to conditional ADDs.  So IF and ELSE do get patched in
+    * SPF mode on those platforms.
+    */
+   assert(devinfo->gen >= 6 || !p->single_program_flow);
+
+   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
+   assert(endif_inst != NULL);
+   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
+
+   unsigned br = brw_jump_scale(devinfo);
+
+   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
+   brw_inst_set_exec_size(devinfo, endif_inst,
+                          brw_inst_exec_size(devinfo, if_inst));
+
+   if (else_inst == NULL) {
+      /* No ELSE: patch IF -> ENDIF. */
+      if (devinfo->gen >= 7) {
+         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
+         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
+      } else if (devinfo->gen == 6) {
+         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
+         brw_inst_set_gen6_jump_count(devinfo, if_inst,
+                                      br * (endif_inst - if_inst));
+      } else {
+         /* Turn it into an IFF, which means no mask stack operations for
+          * all-false and jumping past the ENDIF.
+          */
+         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
+         brw_inst_set_gen4_jump_count(devinfo, if_inst,
+                                      br * (endif_inst - if_inst + 1));
+         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
+      }
+      return;
+   }
+
+   brw_inst_set_exec_size(devinfo, else_inst,
+                          brw_inst_exec_size(devinfo, if_inst));
+
+   if (devinfo->gen < 6) {
+      /* IF -> ELSE */
+      brw_inst_set_gen4_jump_count(devinfo, if_inst,
+                                   br * (else_inst - if_inst));
+      brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
+
+      /* ELSE -> ENDIF: pre-gen6, BRW_OPCODE_ELSE should point just past
+       * the matching ENDIF.
+       */
+      brw_inst_set_gen4_jump_count(devinfo, else_inst,
+                                   br * (endif_inst - else_inst + 1));
+      brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
+   } else if (devinfo->gen == 6) {
+      /* IF -> just past the ELSE */
+      brw_inst_set_gen6_jump_count(devinfo, if_inst,
+                                   br * (else_inst - if_inst + 1));
+
+      /* ELSE -> ENDIF: on gen6, BRW_OPCODE_ELSE should point to the
+       * matching ENDIF.
+       */
+      brw_inst_set_gen6_jump_count(devinfo, else_inst,
+                                   br * (endif_inst - else_inst));
+   } else {
+      /* The IF instruction's JIP should point just past the ELSE */
+      brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
+      /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
+      brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
+      brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
+      if (devinfo->gen >= 8) {
+         /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
+          * should point to ENDIF.
+          */
+         brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
+      }
+   }
+}
+
+/**
+ * Emit an ELSE instruction with zeroed jump targets and push it on the IF
+ * stack, where brw_ENDIF() later finds it.
+ */
+void
+brw_ELSE(struct brw_codegen *p)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const struct brw_reg null_d = retype(brw_null_reg(), BRW_REGISTER_TYPE_D);
+   brw_inst *else_inst = next_insn(p, BRW_OPCODE_ELSE);
+
+   /* The operand layout depends on the hardware generation. */
+   if (devinfo->gen >= 8) {
+      brw_set_dest(p, else_inst, null_d);
+      brw_set_src0(p, else_inst, brw_imm_d(0));
+      brw_inst_set_jip(devinfo, else_inst, 0);
+      brw_inst_set_uip(devinfo, else_inst, 0);
+   } else if (devinfo->gen == 7) {
+      brw_set_dest(p, else_inst, null_d);
+      brw_set_src0(p, else_inst, null_d);
+      brw_set_src1(p, else_inst, brw_imm_w(0));
+      brw_inst_set_jip(devinfo, else_inst, 0);
+      brw_inst_set_uip(devinfo, else_inst, 0);
+   } else if (devinfo->gen == 6) {
+      brw_set_dest(p, else_inst, brw_imm_w(0));
+      brw_inst_set_gen6_jump_count(devinfo, else_inst, 0);
+      brw_set_src0(p, else_inst, null_d);
+      brw_set_src1(p, else_inst, null_d);
+   } else {
+      brw_set_dest(p, else_inst, brw_ip_reg());
+      brw_set_src0(p, else_inst, brw_ip_reg());
+      brw_set_src1(p, else_inst, brw_imm_d(0x0));
+   }
+
+   brw_inst_set_qtr_control(devinfo, else_inst, BRW_COMPRESSION_NONE);
+   brw_inst_set_mask_control(devinfo, else_inst, BRW_MASK_ENABLE);
+   if (devinfo->gen < 6 && !p->single_program_flow)
+      brw_inst_set_thread_control(devinfo, else_inst, BRW_THREAD_SWITCH);
+
+   push_if_stack(p, else_inst);
+}
+
+/**
+ * Close the innermost IF block.
+ *
+ * Pops the IF (and optional ELSE) from the IF stack.  Normally an ENDIF is
+ * emitted and the IF/ELSE jump targets are patched.  In single program
+ * flow mode before gen6, no ENDIF is emitted and the IF/ELSE are converted
+ * to ADDs on IP instead.
+ */
+void
+brw_ENDIF(struct brw_codegen *p)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const struct brw_reg null_d = retype(brw_null_reg(), BRW_REGISTER_TYPE_D);
+
+   /* In single program flow mode, IF and ELSE can be expressed equivalently
+    * as ADD instructions that operate on IP.  On platforms prior to Gen6,
+    * flow control instructions cause an implied thread switch, so this is
+    * a significant savings.
+    *
+    * However, on Gen6, writing to IP doesn't work in single program flow
+    * mode (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON,
+    * IP may not be updated by non-flow control instructions.").  And on
+    * later platforms, there is no significant benefit to converting control
+    * flow instructions to conditional ADDs.  So the trick is only used on
+    * Gen4 and Gen5.
+    */
+   const bool emit_endif = !(devinfo->gen < 6 && p->single_program_flow);
+
+   brw_inst *endif_inst = NULL;
+   brw_inst *else_inst = NULL;
+   brw_inst *if_inst = NULL;
+   brw_inst *top;
+
+   /* A single next_insn() may change the base address of the instruction
+    * store (p->store), so it has to be called before any instruction
+    * pointer is derived from a stack index.
+    */
+   if (emit_endif)
+      endif_inst = next_insn(p, BRW_OPCODE_ENDIF);
+
+   /* Pop the IF and (optional) ELSE instructions from the stack */
+   p->if_depth_in_loop[p->loop_stack_depth]--;
+   top = pop_if_stack(p);
+   if (brw_inst_opcode(devinfo, top) == BRW_OPCODE_ELSE) {
+      else_inst = top;
+      top = pop_if_stack(p);
+   }
+   if_inst = top;
+
+   if (!emit_endif) {
+      /* ENDIF is useless; don't bother emitting it. */
+      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
+      return;
+   }
+
+   /* Operands of the ENDIF itself, by hardware generation. */
+   if (devinfo->gen >= 8) {
+      brw_set_src0(p, endif_inst, brw_imm_d(0));
+   } else if (devinfo->gen == 7) {
+      brw_set_dest(p, endif_inst, null_d);
+      brw_set_src0(p, endif_inst, null_d);
+      brw_set_src1(p, endif_inst, brw_imm_w(0));
+   } else if (devinfo->gen == 6) {
+      brw_set_dest(p, endif_inst, brw_imm_w(0));
+      brw_set_src0(p, endif_inst, null_d);
+      brw_set_src1(p, endif_inst, null_d);
+   } else {
+      brw_set_dest(p, endif_inst, null_d);
+      brw_set_src0(p, endif_inst, null_d);
+      brw_set_src1(p, endif_inst, brw_imm_d(0x0));
+   }
+
+   brw_inst_set_qtr_control(devinfo, endif_inst, BRW_COMPRESSION_NONE);
+   brw_inst_set_mask_control(devinfo, endif_inst, BRW_MASK_ENABLE);
+   if (devinfo->gen < 6)
+      brw_inst_set_thread_control(devinfo, endif_inst, BRW_THREAD_SWITCH);
+
+   /* Also pop item off the stack in the endif instruction: */
+   if (devinfo->gen >= 7) {
+      brw_inst_set_jip(devinfo, endif_inst, 2);
+   } else if (devinfo->gen == 6) {
+      brw_inst_set_gen6_jump_count(devinfo, endif_inst, 2);
+   } else {
+      brw_inst_set_gen4_jump_count(devinfo, endif_inst, 0);
+      brw_inst_set_gen4_pop_count(devinfo, endif_inst, 1);
+   }
+
+   patch_IF_ELSE(p, if_inst, else_inst, endif_inst);
+}
+
+/**
+ * Emit a BREAK instruction with a zero jump target.
+ *
+ * Before gen6 the pop count is set to the number of IF blocks currently
+ * open inside the innermost loop.  The execution size is copied from the
+ * current default instruction.
+ */
+brw_inst *
+brw_BREAK(struct brw_codegen *p)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const struct brw_reg null_d = retype(brw_null_reg(), BRW_REGISTER_TYPE_D);
+   brw_inst *brk = next_insn(p, BRW_OPCODE_BREAK);
+
+   if (devinfo->gen < 6) {
+      brw_set_dest(p, brk, brw_ip_reg());
+      brw_set_src0(p, brk, brw_ip_reg());
+      brw_set_src1(p, brk, brw_imm_d(0x0));
+      brw_inst_set_gen4_pop_count(devinfo, brk,
+                                  p->if_depth_in_loop[p->loop_stack_depth]);
+   } else if (devinfo->gen < 8) {
+      brw_set_dest(p, brk, null_d);
+      brw_set_src0(p, brk, null_d);
+      brw_set_src1(p, brk, brw_imm_d(0x0));
+   } else {
+      brw_set_dest(p, brk, null_d);
+      brw_set_src0(p, brk, brw_imm_d(0x0));
+   }
+
+   brw_inst_set_qtr_control(devinfo, brk, BRW_COMPRESSION_NONE);
+   brw_inst_set_exec_size(devinfo, brk,
+                          brw_inst_exec_size(devinfo, p->current));
+
+   return brk;
+}
+
+/**
+ * Emit a CONTINUE instruction with a zero jump target.
+ *
+ * Before gen6 the pop count is set to the number of IF blocks currently
+ * open inside the innermost loop.  The execution size is copied from the
+ * current default instruction.
+ */
+brw_inst *
+brw_CONT(struct brw_codegen *p)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *cont = next_insn(p, BRW_OPCODE_CONTINUE);
+
+   brw_set_dest(p, cont, brw_ip_reg());
+   if (devinfo->gen < 8) {
+      brw_set_src0(p, cont, brw_ip_reg());
+      brw_set_src1(p, cont, brw_imm_d(0x0));
+   } else {
+      brw_set_src0(p, cont, brw_imm_d(0x0));
+   }
+
+   if (devinfo->gen < 6) {
+      brw_inst_set_gen4_pop_count(devinfo, cont,
+                                  p->if_depth_in_loop[p->loop_stack_depth]);
+   }
+
+   brw_inst_set_qtr_control(devinfo, cont, BRW_COMPRESSION_NONE);
+   brw_inst_set_exec_size(devinfo, cont,
+                          brw_inst_exec_size(devinfo, p->current));
+
+   return cont;
+}
+
+/**
+ * Emit a HALT instruction with zeroed jump targets (UIP and JIP are
+ * updated later).  The execution size is copied from the current default
+ * instruction.
+ */
+brw_inst *
+gen6_HALT(struct brw_codegen *p)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const struct brw_reg null_d = retype(brw_null_reg(), BRW_REGISTER_TYPE_D);
+   brw_inst *halt = next_insn(p, BRW_OPCODE_HALT);
+
+   brw_set_dest(p, halt, null_d);
+   if (devinfo->gen < 8) {
+      brw_set_src0(p, halt, null_d);
+      brw_set_src1(p, halt, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
+   } else {
+      brw_set_src0(p, halt, brw_imm_d(0x0));
+   }
+
+   brw_inst_set_qtr_control(devinfo, halt, BRW_COMPRESSION_NONE);
+   brw_inst_set_exec_size(devinfo, halt,
+                          brw_inst_exec_size(devinfo, p->current));
+
+   return halt;
+}
+
+/* DO/WHILE loop:
+ *
+ * The DO/WHILE is just an unterminated loop -- break or continue are used
+ * for control within the loop.  There are a few ways they can be done.
+ *
+ * For uniform control flow, the WHILE is just a jump, so ADD ip, ip, jip
+ * and no DO instruction.
+ *
+ * For non-uniform control flow pre-gen6, there's a DO instruction to push
+ * the mask, a WHILE to jump back, and BREAK to get out and pop the mask.
+ *
+ * For gen6, there's no more mask stack, so no need for DO.  WHILE just
+ * points back to the first instruction of the loop.
+ *
+ * In both cases where no DO is emitted, the position of the next
+ * instruction is pushed on the loop stack and returned instead.
+ */
+brw_inst *
+brw_DO(struct brw_codegen *p, unsigned execute_size)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   if (devinfo->gen >= 6 || p->single_program_flow) {
+      /* No DO instruction: the loop starts at the next slot. */
+      brw_inst *loop_start = &p->store[p->nr_insn];
+
+      push_loop_stack(p, loop_start);
+      return loop_start;
+   }
+
+   brw_inst *do_inst = next_insn(p, BRW_OPCODE_DO);
+
+   push_loop_stack(p, do_inst);
+
+   /* Override the defaults for this instruction: */
+   brw_set_dest(p, do_inst, brw_null_reg());
+   brw_set_src0(p, do_inst, brw_null_reg());
+   brw_set_src1(p, do_inst, brw_null_reg());
+
+   brw_inst_set_qtr_control(devinfo, do_inst, BRW_COMPRESSION_NONE);
+   brw_inst_set_exec_size(devinfo, do_inst, execute_size);
+   brw_inst_set_pred_control(devinfo, do_inst, BRW_PREDICATE_NONE);
+
+   return do_inst;
+}
+
+/**
+ * For pre-gen6, patch the BREAK/CONT instructions of the innermost loop so
+ * that they are relative to the given WHILE instruction.
+ *
+ * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the
+ * loop nesting, since it can always just point to the end of the
+ * block/current loop.
+ */
+static void
+brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *loop_start = get_inner_do_insn(p);
+   brw_inst *cur;
+   unsigned br = brw_jump_scale(devinfo);
+
+   assert(devinfo->gen < 6);
+
+   /* Walk backwards from the instruction before the WHILE up to, but not
+    * including, the loop's DO.
+    */
+   for (cur = while_inst - 1; cur != loop_start; cur--) {
+      switch (brw_inst_opcode(devinfo, cur)) {
+      case BRW_OPCODE_BREAK:
+         /* A non-zero jump count means the instruction was already patched
+          * because it belongs to a loop nested inside the one being
+          * patched.
+          */
+         if (brw_inst_gen4_jump_count(devinfo, cur) == 0) {
+            brw_inst_set_gen4_jump_count(devinfo, cur,
+                                         br*((while_inst - cur) + 1));
+         }
+         break;
+      case BRW_OPCODE_CONTINUE:
+         if (brw_inst_gen4_jump_count(devinfo, cur) == 0) {
+            brw_inst_set_gen4_jump_count(devinfo, cur,
+                                         br * (while_inst - cur));
+         }
+         break;
+      default:
+         break;
+      }
+   }
+}
+
+/**
+ * Close the innermost loop.
+ *
+ * On gen6+ a WHILE jumping back to the recorded loop start is emitted.
+ * Before gen6, single program flow mode emits an ADD on IP instead.
+ * Otherwise a WHILE paired with the loop's DO is emitted and the loop's
+ * BREAK/CONT instructions are patched.  The loop stack is popped and the
+ * emitted instruction returned.
+ */
+brw_inst *
+brw_WHILE(struct brw_codegen *p)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *while_inst, *loop_start;
+   unsigned br = brw_jump_scale(devinfo);
+
+   /* In every branch the new instruction is allocated before the loop start
+    * is looked up, mirroring brw_ENDIF() where next_insn() may move
+    * p->store.
+    */
+   if (devinfo->gen >= 6) {
+      const struct brw_reg null_d =
+         retype(brw_null_reg(), BRW_REGISTER_TYPE_D);
+
+      while_inst = next_insn(p, BRW_OPCODE_WHILE);
+      loop_start = get_inner_do_insn(p);
+
+      if (devinfo->gen >= 8) {
+         brw_set_dest(p, while_inst, null_d);
+         brw_set_src0(p, while_inst, brw_imm_d(0));
+         brw_inst_set_jip(devinfo, while_inst, br * (loop_start - while_inst));
+      } else if (devinfo->gen == 7) {
+         brw_set_dest(p, while_inst, null_d);
+         brw_set_src0(p, while_inst, null_d);
+         brw_set_src1(p, while_inst, brw_imm_w(0));
+         brw_inst_set_jip(devinfo, while_inst, br * (loop_start - while_inst));
+      } else {
+         brw_set_dest(p, while_inst, brw_imm_w(0));
+         brw_inst_set_gen6_jump_count(devinfo, while_inst,
+                                      br * (loop_start - while_inst));
+         brw_set_src0(p, while_inst, null_d);
+         brw_set_src1(p, while_inst, null_d);
+      }
+
+      brw_inst_set_exec_size(devinfo, while_inst,
+                             brw_inst_exec_size(devinfo, p->current));
+   } else if (p->single_program_flow) {
+      /* Uniform control flow: jump back with a plain ADD on IP.  The
+       * immediate is a byte distance, 16 per instruction.
+       */
+      while_inst = next_insn(p, BRW_OPCODE_ADD);
+      loop_start = get_inner_do_insn(p);
+
+      brw_set_dest(p, while_inst, brw_ip_reg());
+      brw_set_src0(p, while_inst, brw_ip_reg());
+      brw_set_src1(p, while_inst, brw_imm_d((loop_start - while_inst) * 16));
+      brw_inst_set_exec_size(devinfo, while_inst, BRW_EXECUTE_1);
+   } else {
+      while_inst = next_insn(p, BRW_OPCODE_WHILE);
+      loop_start = get_inner_do_insn(p);
+
+      assert(brw_inst_opcode(devinfo, loop_start) == BRW_OPCODE_DO);
+
+      brw_set_dest(p, while_inst, brw_ip_reg());
+      brw_set_src0(p, while_inst, brw_ip_reg());
+      brw_set_src1(p, while_inst, brw_imm_d(0));
+
+      /* The WHILE runs with the same execution size as its DO. */
+      brw_inst_set_exec_size(devinfo, while_inst,
+                             brw_inst_exec_size(devinfo, loop_start));
+      brw_inst_set_gen4_jump_count(devinfo, while_inst,
+                                   br * (loop_start - while_inst + 1));
+      brw_inst_set_gen4_pop_count(devinfo, while_inst, 0);
+
+      brw_patch_break_cont(p, while_inst);
+   }
+
+   brw_inst_set_qtr_control(devinfo, while_inst, BRW_COMPRESSION_NONE);
+
+   p->loop_stack_depth--;
+
+   return while_inst;
+}
+
+/* FORWARD JUMPS:
+ *
+ * Make the JMPI at index jmp_insn_idx land on the next instruction to be
+ * emitted.  The JMPI must have an immediate src1.  The jump distance is
+ * the number of instructions between the JMPI and the landing point,
+ * multiplied by 2 on gen5+ and by 1 on earlier generations.
+ */
+void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *jump = &p->store[jmp_insn_idx];
+   const unsigned jmpi = devinfo->gen >= 5 ? 2 : 1;
+
+   assert(brw_inst_opcode(devinfo, jump) == BRW_OPCODE_JMPI);
+   assert(brw_inst_src1_reg_file(devinfo, jump) == BRW_IMMEDIATE_VALUE);
+
+   brw_inst_set_gen4_jump_count(devinfo, jump,
+                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
+}
+
+/* Emit a CMP of src0 against src1 with the given conditional modifier and
+ * destination.
+ *
+ * To integrate with the above, it makes sense that the comparison
+ * instruction should populate the flag register.  It might be simpler
+ * just to use the flag reg for most WM tasks?
+ */
+void brw_CMP(struct brw_codegen *p,
+             struct brw_reg dest,
+             unsigned conditional,
+             struct brw_reg src0,
+             struct brw_reg src1)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *cmp = next_insn(p, BRW_OPCODE_CMP);
+   const bool null_dest = dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+                          dest.nr == BRW_ARF_NULL;
+
+   brw_inst_set_cond_modifier(devinfo, cmp, conditional);
+   brw_set_dest(p, cmp, dest);
+   brw_set_src0(p, cmp, src0);
+   brw_set_src1(p, cmp, src1);
+
+   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
+    * page says:
+    *    "Any CMP instruction with a null destination must use a {switch}."
+    *
+    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
+    * mentioned on their work-arounds pages.
+    */
+   if (devinfo->gen == 7 && null_dest)
+      brw_inst_set_thread_control(devinfo, cmp, BRW_THREAD_SWITCH);
+}
+
+/***********************************************************************
+ * Helpers for the various SEND message types:
+ */
+
+/** Extended math function, float[8].
+ *
+ * Emitted as a SEND whose base MRF is msg_reg_nr.  Only valid before gen6
+ * (asserted).
+ */
+void gen4_math(struct brw_codegen *p,
+               struct brw_reg dest,
+               unsigned function,
+               unsigned msg_reg_nr,
+               struct brw_reg src,
+               unsigned precision )
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *send = next_insn(p, BRW_OPCODE_SEND);
+
+   /* A source with a scalar region selects the scalar data type of the math
+    * message; anything else is sent as a vector.
+    */
+   const unsigned data_type = has_scalar_region(src) ? BRW_MATH_DATA_SCALAR
+                                                      : BRW_MATH_DATA_VECTOR;
+   const bool src_is_d = src.type == BRW_REGISTER_TYPE_D;
+
+   assert(devinfo->gen < 6);
+
+   /* Example code doesn't set predicate_control for send
+    * instructions.
+    */
+   brw_inst_set_pred_control(devinfo, send, 0);
+   brw_inst_set_base_mrf(devinfo, send, msg_reg_nr);
+
+   brw_set_dest(p, send, dest);
+   brw_set_src0(p, send, src);
+   brw_set_math_message(p, send, function, src_is_d, precision, data_type);
+}
+
+/**
+ * Emit a MATH instruction applying `function` to src0 and src1 with result
+ * dest.  Gen6+ only; the operand restrictions are checked with assertions.
+ */
+void gen6_math(struct brw_codegen *p,
+               struct brw_reg dest,
+               unsigned function,
+               struct brw_reg src0,
+               struct brw_reg src1)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *math = next_insn(p, BRW_OPCODE_MATH);
+   const bool is_int_div =
+      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
+      function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
+      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER;
+
+   assert(devinfo->gen >= 6);
+
+   /* The destination must be a GRF, or from gen7 on also an MRF. */
+   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
+          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
+
+   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
+   if (devinfo->gen == 6) {
+      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
+      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
+   }
+
+   if (is_int_div) {
+      /* Integer division: no float operands, and src1 must be a GRF (or an
+       * immediate from gen8 on).
+       */
+      assert(src0.type != BRW_REGISTER_TYPE_F);
+      assert(src1.type != BRW_REGISTER_TYPE_F);
+      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
+             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
+   } else {
+      /* Every other function takes float operands. */
+      assert(src0.type == BRW_REGISTER_TYPE_F);
+      assert(src1.type == BRW_REGISTER_TYPE_F);
+   }
+
+   /* Source modifiers are ignored for extended math instructions on Gen6. */
+   if (devinfo->gen == 6) {
+      assert(!src0.negate);
+      assert(!src0.abs);
+      assert(!src1.negate);
+      assert(!src1.abs);
+   }
+
+   brw_inst_set_math_function(devinfo, math, function);
+
+   brw_set_dest(p, math, dest);
+   brw_set_src0(p, math, src0);
+   brw_set_src1(p, math, src1);
+}
+
+/**
+ * Return the right surface index to access the thread scratch space using
+ * stateless dataport messages.
+ */
+unsigned
+brw_scratch_surface_idx(const struct brw_codegen *p)
+{
+   /* Scratch space is thread-local, so IA coherency is unnecessary and gen8+
+    * can use the non-coherent stateless index.
+    */
+   return p->devinfo->gen >= 8 ? GEN8_BTI_STATELESS_NON_COHERENT
+                               : BRW_BTI_STATELESS;
+}
+
+/**
+ * Write a block of OWORDs (half a GRF each) to the scratch buffer,
+ * using a constant offset per channel.
+ *
+ * The offset must be aligned to oword size (16 bytes). Used for
+ * register spilling.
+ *
+ * Emits two MOVs that build the message header in mrf (a copy of g0 with
+ * element 2 replaced by the offset) followed by the SEND itself. The
+ * message length passed on is 1 + num_regs. On gen6+ the byte offset is
+ * divided by 16 before being stored in the header.
+ *
+ * NOTE(review): the "exec_size >= 16" test in the body compares the raw
+ * execution-size field against the literal 16, whereas other code in this
+ * file compares that field against BRW_EXECUTE_* or uses it as a shift
+ * count (1 << exec_size). It looks like the vec16() branch can never be
+ * taken -- confirm whether BRW_EXECUTE_16 was intended.
+ */
+void brw_oword_block_write_scratch(struct brw_codegen *p,
+ struct brw_reg mrf,
+ int num_regs,
+ unsigned offset)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ const unsigned target_cache =
+ (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
+ devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
+ BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
+ uint32_t msg_type;
+
+ if (devinfo->gen >= 6)
+ offset /= 16; /* bytes -> owords */
+
+ mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
+
+ const unsigned mlen = 1 + num_regs;
+
+ /* Set up the message header. This is g0, with g0.2 filled with
+ * the offset. We don't want to leave our offset around in g0 or
+ * it'll screw up texture samples, so set it up inside the message
+ * reg.
+ */
+ {
+ brw_push_insn_state(p);
+ brw_set_default_exec_size(p, BRW_EXECUTE_8);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+
+ brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+ /* set message header global offset field (reg 0, element 2) */
+ brw_MOV(p,
+ retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+ mrf.nr,
+ 2), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(offset));
+
+ brw_pop_insn_state(p);
+ }
+
+ {
+ struct brw_reg dest;
+ brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
+ int send_commit_msg;
+ struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
+ BRW_REGISTER_TYPE_UW);
+
+ brw_inst_set_compression(devinfo, insn, false);
+
+ if (brw_inst_exec_size(devinfo, insn) >= 16) /* see NOTE(review) above */
+ src_header = vec16(src_header);
+
+ assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
+ if (devinfo->gen < 6)
+ brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
+
+ /* Until gen6, writes followed by reads from the same location
+ * are not guaranteed to be ordered unless write_commit is set.
+ * If set, then a no-op write is issued to the destination
+ * register to set a dependency, and a read from the destination
+ * can be used to ensure the ordering.
+ *
+ * For gen6, only writes between different threads need ordering
+ * protection. Our use of DP writes is all about register
+ * spilling within a thread.
+ */
+ if (devinfo->gen >= 6) {
+ dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+ send_commit_msg = 0;
+ } else {
+ dest = src_header;
+ send_commit_msg = 1;
+ }
+
+ brw_set_dest(p, insn, dest);
+ if (devinfo->gen >= 6) {
+ brw_set_src0(p, insn, mrf);
+ } else {
+ brw_set_src0(p, insn, brw_null_reg());
+ }
+
+ if (devinfo->gen >= 6)
+ msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
+ else
+ msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
+
+ brw_set_dp_write_message(p,
+ insn,
+ brw_scratch_surface_idx(p),
+ BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
+ msg_type,
+ target_cache,
+ mlen,
+ true, /* header_present */
+ 0, /* not a render target */
+ send_commit_msg, /* response_length */
+ 0, /* eot */
+ send_commit_msg);
+ }
+}
+
+
+/**
+ * Read a block of owords (half a GRF each) from the scratch buffer
+ * using a constant index per channel.
+ *
+ * Offset must be aligned to oword size (16 bytes). Used for register
+ * spilling.
+ *
+ * On gen7+ the mrf argument is ignored and dest itself is used as the
+ * message payload (see the comment in the body); on older hardware the
+ * header is built in mrf. The response length is num_regs. On gen6+ the
+ * byte offset is divided by 16 before being stored in the header.
+ */
+void
+brw_oword_block_read_scratch(struct brw_codegen *p,
+ struct brw_reg dest,
+ struct brw_reg mrf,
+ int num_regs,
+ unsigned offset)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+
+ if (devinfo->gen >= 6)
+ offset /= 16; /* bytes -> owords */
+
+ if (p->devinfo->gen >= 7) {
+ /* On gen 7 and above, we no longer have message registers and we can
+ * send from any register we want. By using the destination register
+ * for the message, we guarantee that the implied message write won't
+ * accidentally overwrite anything. This has been a problem because
+ * the MRF registers and source for the final FB write are both fixed
+ * and may overlap.
+ */
+ mrf = retype(dest, BRW_REGISTER_TYPE_UD);
+ } else {
+ mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
+ }
+ dest = retype(dest, BRW_REGISTER_TYPE_UW);
+
+ const unsigned rlen = num_regs;
+ const unsigned target_cache =
+ (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
+ devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
+ BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
+
+ { /* build the message header: a copy of g0 with the offset in element 2 */
+ brw_push_insn_state(p);
+ brw_set_default_exec_size(p, BRW_EXECUTE_8);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+ brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+ /* set message header global offset field (reg 0, element 2) */
+ brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
+
+ brw_pop_insn_state(p);
+ }
+
+ { /* the read itself */
+ brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
+
+ assert(brw_inst_pred_control(devinfo, insn) == 0);
+ brw_inst_set_compression(devinfo, insn, false);
+
+ brw_set_dest(p, insn, dest); /* UW? */
+ if (devinfo->gen >= 6) {
+ brw_set_src0(p, insn, mrf);
+ } else {
+ brw_set_src0(p, insn, brw_null_reg());
+ brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
+ }
+
+ brw_set_dp_read_message(p,
+ insn,
+ brw_scratch_surface_idx(p),
+ BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
+ BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+ target_cache,
+ 1, /* msg_length */
+ true, /* header_present */
+ rlen);
+ }
+}
+/**
+ * Read num_regs registers of scratch space into dest with a single SEND whose
+ * one-register payload is g0 itself; no header MOVs are emitted.
+ *
+ * offset is divided by REG_SIZE before being passed to
+ * gen7_set_dp_scratch_message(), and the result must fit in 12 bits
+ * (asserted). dest is retyped to UW.
+ */
+void
+gen7_block_read_scratch(struct brw_codegen *p,
+ struct brw_reg dest,
+ int num_regs,
+ unsigned offset)
+{
+ brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
+ assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
+
+ brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
+
+ /* The HW requires that the header is present; this is to get the g0.5
+ * scratch offset.
+ */
+ brw_set_src0(p, insn, brw_vec8_grf(0, 0));
+
+ /* According to the docs, offset is "A 12-bit HWord offset into the memory
+ * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
+ * is 32 bytes, which happens to be the size of a register.
+ */
+ offset /= REG_SIZE;
+ assert(offset < (1 << 12));
+
+ gen7_set_dp_scratch_message(p, insn,
+ false, /* scratch read */
+ false, /* OWords */
+ false, /* invalidate after read */
+ num_regs,
+ offset,
+ 1, /* mlen: just g0 */
+ num_regs, /* rlen */
+ true); /* header present */
+}
+
+/**
+ * Read float[4] vectors from the data port constant cache.
+ * Location (in buffer) should be a multiple of 16.
+ * Used for fetching shader constants.
+ *
+ * The header is built in mrf as a copy of g0 with element 2 replaced by the
+ * offset (divided by 16 on gen6+). The amount read follows the current
+ * default execution size: the block size is
+ * BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size) and the response length is
+ * DIV_ROUND_UP(exec_size, 8). All emitted instructions use
+ * BRW_PREDICATE_NONE, BRW_COMPRESSION_NONE and BRW_MASK_DISABLE; the caller's
+ * default state is restored before returning.
+ */
+void brw_oword_block_read(struct brw_codegen *p,
+ struct brw_reg dest,
+ struct brw_reg mrf,
+ uint32_t offset,
+ uint32_t bind_table_index)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ const unsigned target_cache =
+ (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
+ BRW_DATAPORT_READ_TARGET_DATA_CACHE);
+ const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current); /* channel count, decoded from the default state */
+
+ /* On newer hardware, offset is in units of owords. */
+ if (devinfo->gen >= 6)
+ offset /= 16;
+
+ mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
+
+ brw_push_insn_state(p);
+ brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+ brw_push_insn_state(p); /* inner push: only the header MOVs are forced to SIMD8 */
+ brw_set_default_exec_size(p, BRW_EXECUTE_8);
+ brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+ /* set message header global offset field (reg 0, element 2) */
+ brw_MOV(p,
+ retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+ mrf.nr,
+ 2), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(offset));
+ brw_pop_insn_state(p);
+
+ brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
+
+ /* cast dest to a uword[8] vector */
+ dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
+
+ brw_set_dest(p, insn, dest);
+ if (devinfo->gen >= 6) {
+ brw_set_src0(p, insn, mrf);
+ } else {
+ brw_set_src0(p, insn, brw_null_reg());
+ brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
+ }
+
+ brw_set_dp_read_message(p, insn, bind_table_index,
+ BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
+ BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
+ target_cache,
+ 1, /* msg_length */
+ true, /* header_present */
+ DIV_ROUND_UP(exec_size, 8)); /* response_length */
+
+ brw_pop_insn_state(p);
+}
+
+/**
+ * Emit a render target write message.
+ *
+ * On gen6+ the instruction is a SENDC whose src0 is payload; implied_header
+ * is not used. Before gen6 it is a SEND whose src0 is implied_header, and
+ * payload must be a message register whose number becomes the base MRF.
+ * The destination is always the null register, 16 wide when the current
+ * default execution size is at least BRW_EXECUTE_16 and 8 wide otherwise.
+ * The remaining arguments are forwarded to brw_set_dp_write_message(), with
+ * send_commit_msg fixed to 0.
+ */
+void brw_fb_WRITE(struct brw_codegen *p,
+ struct brw_reg payload,
+ struct brw_reg implied_header,
+ unsigned msg_control,
+ unsigned binding_table_index,
+ unsigned msg_length,
+ unsigned response_length,
+ bool eot,
+ bool last_render_target,
+ bool header_present)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ const unsigned target_cache =
+ (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
+ BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
+ brw_inst *insn;
+ unsigned msg_type;
+ struct brw_reg dest, src0;
+
+ if (brw_inst_exec_size(devinfo, p->current) >= BRW_EXECUTE_16)
+ dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+ else
+ dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+
+ if (devinfo->gen >= 6) {
+ insn = next_insn(p, BRW_OPCODE_SENDC);
+ } else {
+ insn = next_insn(p, BRW_OPCODE_SEND);
+ }
+ brw_inst_set_compression(devinfo, insn, false);
+
+ if (devinfo->gen >= 6) {
+ /* headerless version, just submit color payload */
+ src0 = payload;
+
+ msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
+ } else {
+ assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
+ brw_inst_set_base_mrf(devinfo, insn, payload.nr);
+ src0 = implied_header;
+
+ msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
+ }
+
+ brw_set_dest(p, insn, dest);
+ brw_set_src0(p, insn, src0);
+ brw_set_dp_write_message(p,
+ insn,
+ binding_table_index,
+ msg_control,
+ msg_type,
+ target_cache,
+ msg_length,
+ header_present,
+ last_render_target,
+ response_length,
+ eot,
+ 0 /* send_commit_msg */);
+}
+/**
+ * Emit a gen9+ render target read (a SENDC) of payload into dst and return
+ * the new instruction.
+ *
+ * The control value passed to brw_set_dp_read_message() is per_sample in
+ * bit 5 OR'ed with a subtype that is 0 when the current default execution
+ * size is BRW_EXECUTE_16 and 1 otherwise. The render-target slot group is
+ * set to half of the current default quarter control.
+ */
+brw_inst *
+gen9_fb_READ(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg payload,
+ unsigned binding_table_index,
+ unsigned msg_length,
+ unsigned response_length,
+ bool per_sample)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ assert(devinfo->gen >= 9);
+ const unsigned msg_subtype =
+ brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16 ? 0 : 1;
+ brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
+
+ brw_set_dest(p, insn, dst);
+ brw_set_src0(p, insn, payload);
+ brw_set_dp_read_message(p, insn, binding_table_index,
+ per_sample << 5 | msg_subtype,
+ GEN9_DATAPORT_RC_RENDER_TARGET_READ,
+ GEN6_SFID_DATAPORT_RENDER_CACHE,
+ msg_length, true /* header_present */,
+ response_length);
+ brw_inst_set_rt_slot_group(devinfo, insn,
+ brw_inst_qtr_control(devinfo, p->current) / 2);
+
+ return insn;
+}
+
+/**
+ * Texture sample instruction.
+ * Note: the msg_type plus msg_length values determine exactly what kind
+ * of sampling operation is performed. See volume 4, page 161 of docs.
+ *
+ * When msg_reg_nr is not -1 (that is, not the all-ones value, since the
+ * parameter is unsigned), gen6_resolve_implied_move() is applied to src0
+ * before the SEND is emitted. Before gen6 msg_reg_nr is also written as
+ * the instruction's base MRF. The SEND is emitted unpredicated and
+ * uncompressed; the remaining arguments are forwarded to
+ * brw_set_sampler_message().
+ */
+void brw_SAMPLE(struct brw_codegen *p,
+ struct brw_reg dest,
+ unsigned msg_reg_nr,
+ struct brw_reg src0,
+ unsigned binding_table_index,
+ unsigned sampler,
+ unsigned msg_type,
+ unsigned response_length,
+ unsigned msg_length,
+ unsigned header_present,
+ unsigned simd_mode,
+ unsigned return_format)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ brw_inst *insn;
+
+ if (msg_reg_nr != -1) /* -1 converts to the unsigned all-ones value here */
+ gen6_resolve_implied_move(p, &src0, msg_reg_nr);
+
+ insn = next_insn(p, BRW_OPCODE_SEND);
+ brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
+
+ /* From the 965 PRM (volume 4, part 1, section 14.2.41):
+ *
+ * "Instruction compression is not allowed for this instruction (that
+ * is, send). The hardware behavior is undefined if this instruction is
+ * set as compressed. However, compress control can be set to "SecHalf"
+ * to affect the EMask generation."
+ *
+ * No similar wording is found in later PRMs, but there are examples
+ * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
+ * are allowed in SIMD16 mode and they could not work without SecHalf. For
+ * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
+ */
+ brw_inst_set_compression(devinfo, insn, false);
+
+ if (devinfo->gen < 6)
+ brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
+
+ brw_set_dest(p, insn, dest);
+ brw_set_src0(p, insn, src0);
+ brw_set_sampler_message(p, insn,
+ binding_table_index,
+ sampler,
+ msg_type,
+ response_length,
+ msg_length,
+ header_present,
+ simd_mode,
+ return_format);
+}
+
+/* Adjust the message header's sampler state pointer to
+ * select the correct group of 16 samplers.
+ *
+ * For an immediate sampler_index below 16 nothing is emitted. For a
+ * non-immediate index nothing is emitted unless the device is Haswell or
+ * gen8+. Otherwise element 3 of header is written with element 3 of g0
+ * plus a byte offset derived from the index: 16 sampler states of 16 bytes
+ * for every full group of 16 in the index.
+ */
+void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
+ struct brw_reg header,
+ struct brw_reg sampler_index)
+{
+ /* The "Sampler Index" field can only store values between 0 and 15.
+ * However, we can add an offset to the "Sampler State Pointer"
+ * field, effectively selecting a different set of 16 samplers.
+ *
+ * The "Sampler State Pointer" needs to be aligned to a 32-byte
+ * offset, and each sampler state is only 16-bytes, so we can't
+ * exclusively use the offset - we have to use both.
+ */
+
+ const struct gen_device_info *devinfo = p->devinfo;
+
+ if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
+ const int sampler_state_size = 16; /* 16 bytes */
+ uint32_t sampler = sampler_index.ud;
+
+ if (sampler >= 16) {
+ assert(devinfo->is_haswell || devinfo->gen >= 8);
+ brw_ADD(p,
+ get_element_ud(header, 3),
+ get_element_ud(brw_vec8_grf(0, 0), 3),
+ brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
+ }
+ } else {
+ /* Non-const sampler array indexing case */
+ if (devinfo->gen < 8 && !devinfo->is_haswell) {
+ return;
+ }
+
+ struct brw_reg temp = get_element_ud(header, 3); /* header.3 doubles as scratch for the offset */
+
+ brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0)); /* keep bits 4..7 of the index */
+ brw_SHL(p, temp, temp, brw_imm_ud(4)); /* multiply by 16 */
+ brw_ADD(p,
+ get_element_ud(header, 3),
+ get_element_ud(brw_vec8_grf(0, 0), 3),
+ temp);
+ }
+}
+
+/* All these variables are pretty confusing - we might be better off
+ * using bitmasks and macros for this, in the old style. Or perhaps
+ * just having the caller instantiate the fields in dword3 itself.
+ *
+ * Emits a URB write SEND. gen6_resolve_implied_move() is applied to src0
+ * first. On gen7+, unless flags contains BRW_URB_WRITE_USE_CHANNEL_MASKS,
+ * an OR is emitted beforehand that writes element 5 of g0 OR'ed with
+ * 0xff00 to element 5 of message register msg_reg_nr. Before gen6
+ * msg_reg_nr is also written as the base MRF. flags, msg_length,
+ * response_length, offset and swizzle are forwarded to
+ * brw_set_urb_message(); msg_length must be below
+ * BRW_MAX_MRF(devinfo->gen) (asserted).
+ */
+void brw_urb_WRITE(struct brw_codegen *p,
+ struct brw_reg dest,
+ unsigned msg_reg_nr,
+ struct brw_reg src0,
+ enum brw_urb_write_flags flags,
+ unsigned msg_length,
+ unsigned response_length,
+ unsigned offset,
+ unsigned swizzle)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ brw_inst *insn;
+
+ gen6_resolve_implied_move(p, &src0, msg_reg_nr);
+
+ if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
+ /* Enable Channel Masks in the URB_WRITE_HWORD message header */
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
+ BRW_REGISTER_TYPE_UD),
+ retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(0xff00));
+ brw_pop_insn_state(p);
+ }
+
+ insn = next_insn(p, BRW_OPCODE_SEND);
+
+ assert(msg_length < BRW_MAX_MRF(devinfo->gen));
+
+ brw_set_dest(p, insn, dest);
+ brw_set_src0(p, insn, src0);
+ brw_set_src1(p, insn, brw_imm_d(0));
+
+ if (devinfo->gen < 6)
+ brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
+
+ brw_set_urb_message(p,
+ insn,
+ flags,
+ msg_length,
+ response_length,
+ offset,
+ swizzle);
+}
+/**
+ * Emit a SEND to sfid whose descriptor (src1) is desc, which may be either
+ * an immediate or a register of type UD (asserted).
+ *
+ * For a register descriptor, an OR of desc with 0 into
+ * brw_address_reg(0) is emitted first and the SEND reads that address
+ * register instead. The returned pointer is the "setup" instruction: the
+ * SEND itself in the immediate case, the OR in the register case, so that
+ * callers can set further descriptor bits on it. dst is retyped to UW and
+ * payload to UD; when dst is narrower than BRW_EXECUTE_8 the SEND's
+ * execution size is set to dst.width.
+ */
+struct brw_inst *
+brw_send_indirect_message(struct brw_codegen *p,
+ unsigned sfid,
+ struct brw_reg dst,
+ struct brw_reg payload,
+ struct brw_reg desc)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ struct brw_inst *send;
+ int setup;
+
+ dst = retype(dst, BRW_REGISTER_TYPE_UW);
+
+ assert(desc.type == BRW_REGISTER_TYPE_UD);
+
+ /* We hold on to the setup instruction (the SEND in the direct case, the OR
+ * in the indirect case) by its index in the instruction store. The
+ * pointer returned by next_insn() may become invalid if emitting the SEND
+ * in the indirect case reallocs the store.
+ */
+
+ if (desc.file == BRW_IMMEDIATE_VALUE) {
+ setup = p->nr_insn; /* index the SEND is about to occupy */
+ send = next_insn(p, BRW_OPCODE_SEND);
+ brw_set_src1(p, send, desc);
+
+ } else {
+ struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
+
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+
+ /* Load the indirect descriptor to an address register using OR so the
+ * caller can specify additional descriptor bits with the usual
+ * brw_set_*_message() helper functions.
+ */
+ setup = p->nr_insn; /* index the OR is about to occupy */
+ brw_OR(p, addr, desc, brw_imm_ud(0));
+
+ brw_pop_insn_state(p);
+
+ send = next_insn(p, BRW_OPCODE_SEND);
+ brw_set_src1(p, send, addr);
+ }
+
+ if (dst.width < BRW_EXECUTE_8)
+ brw_inst_set_exec_size(devinfo, send, dst.width);
+
+ brw_set_dest(p, send, dst);
+ brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
+ brw_inst_set_sfid(devinfo, send, sfid);
+
+ return &p->store[setup]; /* re-derived from the index; see comment above */
+}
+
+/**
+ * Emit a SEND to sfid that addresses the surface given by `surface`, which
+ * may be an immediate or a register.
+ *
+ * For a register surface index, an AND of its first swizzled component with
+ * 0xff into brw_address_reg(0) is emitted first, and that address register
+ * is used as the descriptor instead.  message_len, response_len and
+ * header_present are then set on the instruction returned by
+ * brw_send_indirect_message(), which is also what this function returns so
+ * that callers can fill in the remaining descriptor bits.
+ */
+static struct brw_inst *
+brw_send_indirect_surface_message(struct brw_codegen *p,
+                                  unsigned sfid,
+                                  struct brw_reg dst,
+                                  struct brw_reg payload,
+                                  struct brw_reg surface,
+                                  unsigned message_len,
+                                  unsigned response_len,
+                                  bool header_present)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   struct brw_inst *insn;
+
+   if (surface.file != BRW_IMMEDIATE_VALUE) {
+      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
+
+      brw_push_insn_state(p);
+      brw_set_default_access_mode(p, BRW_ALIGN_1);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+
+      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
+       * some surface array is accessed out of bounds.
+       *
+       * The instruction returned by brw_AND() is not needed: the pointer
+       * used below comes from brw_send_indirect_message().
+       */
+      brw_AND(p, addr,
+              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
+                        BRW_GET_SWZ(surface.swizzle, 0)),
+              brw_imm_ud(0xff));
+
+      brw_pop_insn_state(p);
+
+      surface = addr;
+   }
+
+   insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
+   brw_inst_set_mlen(devinfo, insn, message_len);
+   brw_inst_set_rlen(devinfo, insn, response_len);
+   brw_inst_set_header_present(devinfo, insn, header_present);
+
+   return insn;
+}
+
+/**
+ * Return true when the WHILE instruction insn, located at while_offset,
+ * jumps back to start_offset or to an earlier offset.
+ */
+static bool
+while_jumps_before_offset(const struct gen_device_info *devinfo,
+                          brw_inst *insn, int while_offset, int start_offset)
+{
+   /* Factor that converts a jump count into the units of the offsets. */
+   const int bytes_per_unit = 16 / brw_jump_scale(devinfo);
+   int jump;
+
+   /* Gen6 keeps the backward jump in the gen6 jump count field; the other
+    * generations handled here keep it in JIP.
+    */
+   if (devinfo->gen == 6)
+      jump = brw_inst_gen6_jump_count(devinfo, insn);
+   else
+      jump = brw_inst_jip(devinfo, insn);
+
+   assert(jump < 0);
+
+   const int target_offset = while_offset + jump * bytes_per_unit;
+
+   return target_offset <= start_offset;
+}
+
+/**
+ * Scan forward from the instruction after start_offset for the instruction
+ * that ends the block containing start_offset and return its offset in the
+ * store: an ENDIF, ELSE or HALT at nesting depth 0, or a WHILE at depth 0
+ * that jumps back to or before start_offset. IF/ENDIF pairs met on the way
+ * raise and lower the depth. Returns 0 when the end of the program is
+ * reached without finding one.
+ */
+static int
+brw_find_next_block_end(struct brw_codegen *p, int start_offset)
+{
+ int offset;
+ void *store = p->store;
+ const struct gen_device_info *devinfo = p->devinfo;
+
+ int depth = 0; /* number of IFs opened since start_offset and not yet closed */
+
+ for (offset = next_offset(devinfo, store, start_offset);
+ offset < p->next_insn_offset;
+ offset = next_offset(devinfo, store, offset)) {
+ brw_inst *insn = store + offset;
+
+ switch (brw_inst_opcode(devinfo, insn)) {
+ case BRW_OPCODE_IF:
+ depth++;
+ break;
+ case BRW_OPCODE_ENDIF:
+ if (depth == 0)
+ return offset;
+ depth--;
+ break;
+ case BRW_OPCODE_WHILE:
+ /* If the while doesn't jump before our instruction, it's the end
+ * of a sibling do...while loop. Ignore it.
+ */
+ if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
+ continue; /* continues the for loop, not the switch */
+ /* fallthrough */
+ case BRW_OPCODE_ELSE:
+ case BRW_OPCODE_HALT:
+ if (depth == 0)
+ return offset;
+ }
+ }
+
+ return 0;
+}
+
+/* There is no DO instruction on gen6, so to find the end of the loop
+ * we have to see if the loop is jumping back before our start
+ * instruction.
+ *
+ * Returns the offset of the first WHILE after start_offset that jumps back
+ * to start_offset or earlier.
+ */
+static int
+brw_find_loop_end(struct brw_codegen *p, int start_offset)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   void *store = p->store;
+   int pos;
+
+   assert(devinfo->gen >= 6);
+
+   /* Always start after the instruction (such as a WHILE) we're trying to fix
+    * up.
+    */
+   pos = next_offset(devinfo, store, start_offset);
+   while (pos < p->next_insn_offset) {
+      brw_inst *candidate = store + pos;
+
+      if (brw_inst_opcode(devinfo, candidate) == BRW_OPCODE_WHILE &&
+          while_jumps_before_offset(devinfo, candidate, pos, start_offset))
+         return pos;
+
+      pos = next_offset(devinfo, store, pos);
+   }
+
+   assert(!"not reached");
+   return start_offset;
+}
+
+/* After program generation, go back and update the UIP and JIP of
+ * BREAK, CONT, and HALT instructions to their correct locations.
+ *
+ * The jump field of ENDIF is filled in as well. Does nothing before gen6.
+ * The walk advances 16 bytes at a time from start_offset and asserts that
+ * no instruction on the way is compacted.
+ *
+ * NOTE(review): brw_find_next_block_end() is called for every instruction,
+ * including those whose opcode the switch below ignores -- presumably it
+ * could be limited to the handled opcodes; confirm before changing.
+ */
+void
+brw_set_uip_jip(struct brw_codegen *p, int start_offset)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ int offset;
+ int br = brw_jump_scale(devinfo);
+ int scale = 16 / br; /* divisor turning offset differences into jump counts */
+ void *store = p->store;
+
+ if (devinfo->gen < 6)
+ return;
+
+ for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
+ brw_inst *insn = store + offset;
+ assert(brw_inst_cmpt_control(devinfo, insn) == 0);
+
+ int block_end_offset = brw_find_next_block_end(p, offset); /* 0 when no enclosing block end was found */
+ switch (brw_inst_opcode(devinfo, insn)) {
+ case BRW_OPCODE_BREAK:
+ assert(block_end_offset != 0);
+ brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
+ /* Gen7 UIP points to WHILE; Gen6 points just after it */
+ brw_inst_set_uip(devinfo, insn,
+ (brw_find_loop_end(p, offset) - offset +
+ (devinfo->gen == 6 ? 16 : 0)) / scale);
+ break;
+ case BRW_OPCODE_CONTINUE:
+ assert(block_end_offset != 0);
+ brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
+ brw_inst_set_uip(devinfo, insn,
+ (brw_find_loop_end(p, offset) - offset) / scale);
+
+ assert(brw_inst_uip(devinfo, insn) != 0);
+ assert(brw_inst_jip(devinfo, insn) != 0);
+ break;
+
+ case BRW_OPCODE_ENDIF: {
+ int32_t jump = (block_end_offset == 0) ?
+ 1 * br : (block_end_offset - offset) / scale; /* no enclosing block: jump one instruction ahead */
+ if (devinfo->gen >= 7)
+ brw_inst_set_jip(devinfo, insn, jump);
+ else
+ brw_inst_set_gen6_jump_count(devinfo, insn, jump);
+ break;
+ }
+
+ case BRW_OPCODE_HALT:
+ /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
+ *
+ * "In case of the halt instruction not inside any conditional
+ * code block, the value of <JIP> and <UIP> should be the
+ * same. In case of the halt instruction inside conditional code
+ * block, the <UIP> should be the end of the program, and the
+ * <JIP> should be end of the most inner conditional code block."
+ *
+ * The uip will have already been set by whoever set up the
+ * instruction.
+ */
+ if (block_end_offset == 0) {
+ brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
+ } else {
+ brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
+ }
+ assert(brw_inst_uip(devinfo, insn) != 0);
+ assert(brw_inst_jip(devinfo, insn) != 0);
+ break;
+ }
+ }
+}
+/**
+ * Emit a SEND with the given dest and src0, configured by
+ * brw_set_ff_sync_message() from allocate, response_length and eot.
+ * gen6_resolve_implied_move() is applied to src0 first, and before gen6
+ * msg_reg_nr is also written as the instruction's base MRF.
+ */
+void brw_ff_sync(struct brw_codegen *p,
+ struct brw_reg dest,
+ unsigned msg_reg_nr,
+ struct brw_reg src0,
+ bool allocate,
+ unsigned response_length,
+ bool eot)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ brw_inst *insn;
+
+ gen6_resolve_implied_move(p, &src0, msg_reg_nr);
+
+ insn = next_insn(p, BRW_OPCODE_SEND);
+ brw_set_dest(p, insn, dest);
+ brw_set_src0(p, insn, src0);
+ brw_set_src1(p, insn, brw_imm_d(0));
+
+ if (devinfo->gen < 6)
+ brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
+
+ brw_set_ff_sync_message(p,
+ insn,
+ allocate,
+ response_length,
+ eot);
+}
+
+/**
+ * Emit the SEND instruction necessary to generate stream output data on Gen6
+ * (for transform feedback).
+ *
+ * If send_commit_msg is true, this is the last piece of stream output data
+ * from this thread, so send the data as a committed write. According to the
+ * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
+ *
+ * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
+ * writes are complete by sending the final write as a committed write."
+ *
+ * send_commit_msg is passed both as the response length and as the commit
+ * flag, so a committed write uses a response length of 1 and an
+ * uncommitted one a response length of 0. The message length is fixed
+ * at 1 with a header present. gen6_resolve_implied_move() is applied to
+ * src0 before the SEND is emitted.
+ */
+void
+brw_svb_write(struct brw_codegen *p,
+ struct brw_reg dest,
+ unsigned msg_reg_nr,
+ struct brw_reg src0,
+ unsigned binding_table_index,
+ bool send_commit_msg)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ const unsigned target_cache =
+ (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
+ devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
+ BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
+ brw_inst *insn;
+
+ gen6_resolve_implied_move(p, &src0, msg_reg_nr);
+
+ insn = next_insn(p, BRW_OPCODE_SEND);
+ brw_set_dest(p, insn, dest);
+ brw_set_src0(p, insn, src0);
+ brw_set_src1(p, insn, brw_imm_d(0));
+ brw_set_dp_write_message(p, insn,
+ binding_table_index,
+ 0, /* msg_control: ignored */
+ GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
+ target_cache,
+ 1, /* msg_length */
+ true, /* header_present */
+ 0, /* last_render_target: ignored */
+ send_commit_msg, /* response_length */
+ 0, /* end_of_thread */
+ send_commit_msg); /* send_commit_msg */
+}
+
+/**
+ * Size of a surface message payload carrying num_channels channels under
+ * the current default instruction state:
+ *
+ *  - 1 when has_simd4x2 is set and the default access mode is Align16,
+ *  - 2 * num_channels when has_simd16 is set and the default execution size
+ *    is BRW_EXECUTE_16,
+ *  - num_channels otherwise.
+ */
+static unsigned
+brw_surface_payload_size(struct brw_codegen *p,
+                         unsigned num_channels,
+                         bool has_simd4x2,
+                         bool has_simd16)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   if (has_simd4x2 &&
+       brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16)
+      return 1;
+
+   if (has_simd16 &&
+       brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
+      return 2 * num_channels;
+
+   return num_channels;
+}
+
+/**
+ * Fill in the data port message type and message control of insn for an
+ * untyped atomic operation, based on the device generation and the current
+ * default access mode and execution size.
+ */
+static void
+brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
+                                  brw_inst *insn,
+                                  unsigned atomic_op,
+                                  bool response_expected)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   unsigned msg_type;
+   bool uses_simd_bit;
+
+   /* Atomic Operation Type: BRW_AOP_* */
+   unsigned msg_control = atomic_op;
+
+   /* Return data expected */
+   if (response_expected)
+      msg_control |= 1 << 5;
+
+   if (!(devinfo->gen >= 8 || devinfo->is_haswell)) {
+      msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
+      uses_simd_bit = true;
+   } else if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
+      msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
+      uses_simd_bit = true;
+   } else {
+      /* Haswell and gen8+ have a dedicated message for Align16 mode, which
+       * does not take the SIMD mode bit.
+       */
+      msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2;
+      uses_simd_bit = false;
+   }
+
+   /* SIMD8 mode */
+   if (uses_simd_bit &&
+       brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
+      msg_control |= 1 << 4;
+
+   brw_inst_set_dp_msg_type(devinfo, insn, msg_type);
+   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+}
+
+/**
+ * Emit an untyped atomic operation atomic_op on the given surface, taking
+ * its arguments from payload (msg_length registers) and writing dst.  When
+ * response_expected is false the response length is zero.
+ */
+void
+brw_untyped_atomic(struct brw_codegen *p,
+                   struct brw_reg dst,
+                   struct brw_reg payload,
+                   struct brw_reg surface,
+                   unsigned atomic_op,
+                   unsigned msg_length,
+                   bool response_expected)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const bool align1 =
+      brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
+   unsigned sfid;
+   unsigned mask;
+
+   if (hsw_or_later)
+      sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
+   else
+      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+   /* Mask out unused components -- This is especially important in Align16
+    * mode on generations that don't have native support for SIMD4x2 atomics,
+    * because unused but enabled components will cause the dataport to perform
+    * additional atomic operations on the addresses that happen to be in the
+    * uninitialized Y, Z and W coordinates of the payload.
+    */
+   if (align1)
+      mask = WRITEMASK_XYZW;
+   else
+      mask = WRITEMASK_X;
+
+   /* One channel of return data when a response is expected, none
+    * otherwise.
+    */
+   const unsigned response_length =
+      brw_surface_payload_size(p, response_expected, hsw_or_later, true);
+
+   struct brw_inst *send = brw_send_indirect_surface_message(
+      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
+      response_length, align1);
+
+   brw_set_dp_untyped_atomic_message(p, send, atomic_op, response_expected);
+}
+
+/**
+ * Fill in the data port message type and message control of insn for an
+ * untyped surface read of num_channels channels.
+ */
+static void
+brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
+                                        struct brw_inst *insn,
+                                        unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   unsigned msg_type;
+
+   /* Set mask of 32-bit channels to drop. */
+   unsigned msg_control = 0xf & (0xf << num_channels);
+
+   /* Only Align1 mode adds a SIMD mode on top of the channel mask. */
+   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
+      const bool simd16 =
+         brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16;
+
+      /* 1 selects SIMD16 mode, 2 selects SIMD8 mode. */
+      msg_control |= (simd16 ? 1 : 2) << 4;
+   }
+
+   if (devinfo->gen >= 8 || devinfo->is_haswell)
+      msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ;
+   else
+      msg_type = GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ;
+
+   brw_inst_set_dp_msg_type(devinfo, insn, msg_type);
+   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+}
+
+/**
+ * Emit an untyped surface read of num_channels channels from the given
+ * surface into dst, addressed by payload (msg_length registers).  The
+ * message is sent without a header.
+ */
+void
+brw_untyped_surface_read(struct brw_codegen *p,
+                         struct brw_reg dst,
+                         struct brw_reg payload,
+                         struct brw_reg surface,
+                         unsigned msg_length,
+                         unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   unsigned sfid;
+
+   if (devinfo->gen >= 8 || devinfo->is_haswell)
+      sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
+   else
+      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+   const unsigned response_length =
+      brw_surface_payload_size(p, num_channels, true, true);
+
+   struct brw_inst *send = brw_send_indirect_surface_message(
+      p, sfid, dst, payload, surface, msg_length, response_length,
+      false /* header_present */);
+
+   brw_set_dp_untyped_surface_read_message(p, send, num_channels);
+}
+
+/**
+ * Program the data-port message type and message control bits of \p insn
+ * for an untyped surface write of \p num_channels 32-bit channels.
+ */
+static void
+brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
+                                         struct brw_inst *insn,
+                                         unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   unsigned simd_mode;
+
+   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
+      const bool simd16 =
+         brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16;
+
+      simd_mode = simd16 ? 1 /* SIMD16 mode */ : 2 /* SIMD8 mode */;
+   } else {
+      simd_mode = hsw_or_later ? 0 /* SIMD4x2 mode */ : 2 /* SIMD8 mode */;
+   }
+
+   /* The low four bits are the mask of 32-bit channels to drop, the SIMD
+    * mode selector is ORed in starting at bit 4.
+    */
+   const unsigned msg_control =
+      (0xf & (0xf << num_channels)) | (simd_mode << 4);
+
+   if (hsw_or_later) {
+      brw_inst_set_dp_msg_type(devinfo, insn,
+                               HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE);
+   } else {
+      brw_inst_set_dp_msg_type(devinfo, insn,
+                               GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
+   }
+
+   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+}
+
+/**
+ * Emit an untyped surface write of \p num_channels channels to \p surface,
+ * sending \p msg_length registers starting at \p payload.  The destination
+ * of the send is the null register.
+ */
+void
+brw_untyped_surface_write(struct brw_codegen *p,
+                          struct brw_reg payload,
+                          struct brw_reg surface,
+                          unsigned msg_length,
+                          unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const bool align1 =
+      brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
+   const unsigned sfid = hsw_or_later ? HSW_SFID_DATAPORT_DATA_CACHE_1 :
+                                        GEN7_SFID_DATAPORT_DATA_CACHE;
+   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
+   unsigned mask = WRITEMASK_XYZW;
+
+   if (devinfo->gen == 7 && !devinfo->is_haswell && !align1)
+      mask = WRITEMASK_X;
+
+   const struct brw_reg null_dst = brw_writemask(brw_null_reg(), mask);
+   struct brw_inst *insn =
+      brw_send_indirect_surface_message(p, sfid, null_dst, payload, surface,
+                                        msg_length, 0, align1);
+
+   brw_set_dp_untyped_surface_write_message(p, insn, num_channels);
+}
+
+/**
+ * Program the data-port message type and message control bits of \p insn
+ * for a typed atomic operation \p atomic_op (one of BRW_AOP_*).
+ */
+static void
+brw_set_dp_typed_atomic_message(struct brw_codegen *p,
+                                struct brw_inst *insn,
+                                unsigned atomic_op,
+                                bool response_expected)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   /* Atomic Operation Type: BRW_AOP_* */
+   unsigned msg_control = atomic_op;
+   unsigned msg_type;
+   bool selects_slot_group;
+
+   if (response_expected)
+      msg_control |= 1 << 5; /* Return data expected */
+
+   if (!hsw_or_later) {
+      msg_type = GEN7_DATAPORT_RC_TYPED_ATOMIC_OP;
+      selects_slot_group = true;
+   } else if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
+      msg_type = HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP;
+      selects_slot_group = true;
+   } else {
+      msg_type = HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2;
+      selects_slot_group = false;
+   }
+
+   if (selects_slot_group &&
+       brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
+      msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
+
+   brw_inst_set_dp_msg_type(devinfo, insn, msg_type);
+   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+}
+
+/**
+ * Emit a typed atomic operation \p atomic_op on \p surface, sending
+ * \p msg_length registers starting at \p payload and writing to \p dst.
+ */
+void
+brw_typed_atomic(struct brw_codegen *p,
+                 struct brw_reg dst,
+                 struct brw_reg payload,
+                 struct brw_reg surface,
+                 unsigned atomic_op,
+                 unsigned msg_length,
+                 bool response_expected)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const bool align1 =
+      brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
+   const unsigned sfid = hsw_or_later ? HSW_SFID_DATAPORT_DATA_CACHE_1 :
+                                        GEN6_SFID_DATAPORT_RENDER_CACHE;
+   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
+   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
+   const unsigned reply_size =
+      brw_surface_payload_size(p, response_expected, hsw_or_later, false);
+
+   struct brw_inst *insn =
+      brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
+                                        payload, surface, msg_length,
+                                        reply_size, true);
+
+   brw_set_dp_typed_atomic_message(p, insn, atomic_op, response_expected);
+}
+
+/**
+ * Program the data-port message type and message control bits of \p insn
+ * for a typed surface read of \p num_channels channels.
+ */
+static void
+brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
+                                      struct brw_inst *insn,
+                                      unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   /* The low four bits are the mask of unused channels. */
+   unsigned msg_control = (0xf << num_channels) & 0xf;
+
+   /* The sample mask half is only selected in Align1 mode. */
+   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
+      const bool high_half =
+         brw_inst_qtr_control(devinfo, p->current) % 2 == 1;
+
+      if (hsw_or_later) {
+         /* 2: use high 8 slots of the sample mask, 1: use low 8 slots. */
+         msg_control |= (high_half ? 2 : 1) << 4;
+      } else if (high_half) {
+         msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
+      }
+   }
+
+   if (hsw_or_later) {
+      brw_inst_set_dp_msg_type(devinfo, insn,
+                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
+   } else {
+      brw_inst_set_dp_msg_type(devinfo, insn,
+                               GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
+   }
+
+   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+}
+
+/**
+ * Emit a typed surface read of \p num_channels channels from \p surface,
+ * sending \p msg_length registers starting at \p payload and writing the
+ * result to \p dst.
+ */
+void
+brw_typed_surface_read(struct brw_codegen *p,
+                       struct brw_reg dst,
+                       struct brw_reg payload,
+                       struct brw_reg surface,
+                       unsigned msg_length,
+                       unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   unsigned sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+
+   /* Haswell and later use data cache port 1 instead of the render cache. */
+   if (hsw_or_later)
+      sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
+
+   const unsigned reply_size =
+      brw_surface_payload_size(p, num_channels, hsw_or_later, false);
+   struct brw_inst *insn =
+      brw_send_indirect_surface_message(p, sfid, dst, payload, surface,
+                                        msg_length, reply_size, true);
+
+   brw_set_dp_typed_surface_read_message(p, insn, num_channels);
+}
+
+/**
+ * Program the data-port message type and message control bits of \p insn
+ * for a typed surface write of \p num_channels channels.
+ */
+static void
+brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
+                                       struct brw_inst *insn,
+                                       unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   /* The low four bits are the mask of unused channels. */
+   unsigned msg_control = (0xf << num_channels) & 0xf;
+
+   /* The sample mask half is only selected in Align1 mode. */
+   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
+      const bool high_half =
+         brw_inst_qtr_control(devinfo, p->current) % 2 == 1;
+
+      if (hsw_or_later) {
+         /* 2: use high 8 slots of the sample mask, 1: use low 8 slots. */
+         msg_control |= (high_half ? 2 : 1) << 4;
+      } else if (high_half) {
+         msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
+      }
+   }
+
+   if (hsw_or_later) {
+      brw_inst_set_dp_msg_type(devinfo, insn,
+                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
+   } else {
+      brw_inst_set_dp_msg_type(devinfo, insn,
+                               GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
+   }
+
+   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+}
+
+/**
+ * Emit a typed surface write of \p num_channels channels to \p surface,
+ * sending \p msg_length registers starting at \p payload.  The destination
+ * of the send is the null register.
+ */
+void
+brw_typed_surface_write(struct brw_codegen *p,
+                        struct brw_reg payload,
+                        struct brw_reg surface,
+                        unsigned msg_length,
+                        unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const bool align1 =
+      brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
+   const unsigned sfid = hsw_or_later ? HSW_SFID_DATAPORT_DATA_CACHE_1 :
+                                        GEN6_SFID_DATAPORT_RENDER_CACHE;
+   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
+   unsigned mask = WRITEMASK_XYZW;
+
+   if (devinfo->gen == 7 && !devinfo->is_haswell && !align1)
+      mask = WRITEMASK_X;
+
+   const struct brw_reg null_dst = brw_writemask(brw_null_reg(), mask);
+   struct brw_inst *insn =
+      brw_send_indirect_surface_message(p, sfid, null_dst, payload, surface,
+                                        msg_length, 0, true);
+
+   brw_set_dp_typed_surface_write_message(p, insn, num_channels);
+}
+
+/**
+ * Set up the message descriptor of \p insn as a MEMORY_FENCE message for
+ * the shared function \p sfid, which must be either the render cache or the
+ * data cache data port.
+ */
+static void
+brw_set_memory_fence_message(struct brw_codegen *p,
+                             struct brw_inst *insn,
+                             enum brw_message_target sfid,
+                             bool commit_enable)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   /* A response is only returned when the commit is requested. */
+   const unsigned response_length = commit_enable ? 1 : 0;
+
+   brw_set_message_descriptor(p, insn, sfid,
+                              1 /* message length */,
+                              response_length,
+                              true /* header present */,
+                              false);
+
+   if (sfid == GEN6_SFID_DATAPORT_RENDER_CACHE) {
+      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
+   } else if (sfid == GEN7_SFID_DATAPORT_DATA_CACHE) {
+      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
+   } else {
+      unreachable("Not reached");
+   }
+
+   if (!commit_enable)
+      return;
+
+   brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
+}
+
+/**
+ * Emit a data cache MEMORY_FENCE send that uses \p dst as both destination
+ * and source.  On Gen7 parts other than Haswell a second fence is sent to
+ * the render cache using the register following \p dst, and a MOV then
+ * copies that register back into \p dst.
+ */
+void
+brw_memory_fence(struct brw_codegen *p,
+ struct brw_reg dst)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ /* The commit is only requested on Gen7 parts other than Haswell. */
+ const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
+ struct brw_inst *insn;
+
+ /* Everything below is emitted unmasked with an execution size of one. */
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_exec_size(p, BRW_EXECUTE_1);
+ dst = vec1(dst);
+
+ /* Set dst as destination for dependency tracking, the MEMORY_FENCE
+ * message doesn't write anything back.
+ */
+ insn = next_insn(p, BRW_OPCODE_SEND);
+ dst = retype(dst, BRW_REGISTER_TYPE_UW);
+ brw_set_dest(p, insn, dst);
+ brw_set_src0(p, insn, dst);
+ brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
+ commit_enable);
+
+ if (devinfo->gen == 7 && !devinfo->is_haswell) {
+ /* IVB does typed surface access through the render cache, so we need to
+ * flush it too. Use a different register so both flushes can be
+ * pipelined by the hardware.
+ */
+ insn = next_insn(p, BRW_OPCODE_SEND);
+ brw_set_dest(p, insn, offset(dst, 1));
+ brw_set_src0(p, insn, offset(dst, 1));
+ brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
+ commit_enable);
+
+ /* Now write the response of the second message into the response of the
+ * first to trigger a pipeline stall -- This way future render and data
+ * cache messages will be properly ordered with respect to past data and
+ * render cache messages.
+ */
+ brw_MOV(p, dst, offset(dst, 1));
+ }
+
+ brw_pop_insn_state(p);
+}
+
+/**
+ * Emit a send to the pixel interpolator shared function with message type
+ * \p mode, using \p mrf as the payload and \p data as the descriptor
+ * operand, and writing the result to \p dest.
+ */
+void
+brw_pixel_interpolator_query(struct brw_codegen *p,
+                             struct brw_reg dest,
+                             struct brw_reg mrf,
+                             bool noperspective,
+                             unsigned mode,
+                             struct brw_reg data,
+                             unsigned msg_length,
+                             unsigned response_length)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);
+   const bool simd16 = exec_size == BRW_EXECUTE_16;
+
+   /* A direct send message is used automatically by
+    * brw_send_indirect_message when data is actually an immediate.
+    */
+   struct brw_inst *insn =
+      brw_send_indirect_message(p, GEN7_SFID_PIXEL_INTERPOLATOR,
+                                dest, mrf, vec1(data));
+
+   brw_inst_set_mlen(devinfo, insn, msg_length);
+   brw_inst_set_rlen(devinfo, insn, response_length);
+
+   brw_inst_set_pi_simd_mode(devinfo, insn, simd16);
+   /* The slot group is zero unless 32/64px dispatch is used. */
+   brw_inst_set_pi_slot_group(devinfo, insn, 0);
+   brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
+   brw_inst_set_pi_message_type(devinfo, insn, mode);
+}
+
+/**
+ * Emit code that finds the first active channel and writes its index to
+ * \p dst.  \p mask is the dispatch (or vector) mask, which must be of type
+ * UD; an immediate 0xffffffff means no extra masking is required.
+ *
+ * Requires Gen7 or later.  Separate sequences are emitted for Align1 and
+ * Align16 (SIMD4x2) access modes and for Gen8+ versus Gen7.
+ */
+void
+brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
+ struct brw_reg mask)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ /* The exec_size field is a log2 encoding, expand it to a channel count. */
+ const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
+ const unsigned qtr_control = brw_inst_qtr_control(devinfo, p->current);
+ brw_inst *inst;
+
+ assert(devinfo->gen >= 7);
+ assert(mask.type == BRW_REGISTER_TYPE_UD);
+
+ brw_push_insn_state(p);
+
+ if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+ if (devinfo->gen >= 8) {
+ /* Getting the first active channel index is easy on Gen8: Just find
+ * the first bit set in the execution mask. The register exists on
+ * HSW already but it reads back as all ones when the current
+ * instruction has execution masking disabled, so it's kind of
+ * useless.
+ */
+ struct brw_reg exec_mask =
+ retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);
+
+ if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
+ /* Unfortunately, ce0 does not take into account the thread
+ * dispatch mask, which may be a problem in cases where it's not
+ * tightly packed (i.e. it doesn't have the form '2^n - 1' for
+ * some n). Combine ce0 with the given dispatch (or vector) mask
+ * to mask off those channels which were never dispatched by the
+ * hardware.
+ */
+ brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
+ brw_AND(p, vec1(dst), exec_mask, vec1(dst));
+ exec_mask = vec1(dst);
+ }
+
+ /* Quarter control has the effect of magically shifting the value of
+ * ce0 so you'll get the first active channel relative to the
+ * specified quarter control as result.
+ */
+ inst = brw_FBL(p, vec1(dst), exec_mask);
+ } else {
+ const struct brw_reg flag = brw_flag_reg(1, 0);
+
+ /* Clear the flag register before accumulating the mask into it. */
+ brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
+
+ /* Run enough instructions returning zero with execution masking and
+ * a conditional modifier enabled in order to get the full execution
+ * mask in f1.0. We could use a single 32-wide move here were it not
+ * for the hardware bug that causes channel enables to be applied
+ * incorrectly to the second half of 32-wide instructions on Gen7.
+ */
+ const unsigned lower_size = MIN2(16, exec_size);
+ for (unsigned i = 0; i < exec_size / lower_size; i++) {
+ inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
+ brw_imm_uw(0));
+ brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
+ brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
+ brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
+ brw_inst_set_flag_reg_nr(devinfo, inst, 1);
+ brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
+ }
+
+ /* Find the first bit set in the exec_size-wide portion of the flag
+ * register that was updated by the last sequence of MOV
+ * instructions.
+ */
+ const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
+ brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
+ }
+ } else {
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+ if (devinfo->gen >= 8 &&
+ mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
+ /* In SIMD4x2 mode the first active channel index is just the
+ * negation of the first bit of the mask register. Note that ce0
+ * doesn't take into account the dispatch mask, so the Gen7 path
+ * should be used instead unless you have the guarantee that the
+ * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
+ * for some n).
+ */
+ inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
+ negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
+ brw_imm_ud(1));
+
+ } else {
+ /* Overwrite the destination without and with execution masking to
+ * find out which of the channels is active.
+ */
+ brw_push_insn_state(p);
+ brw_set_default_exec_size(p, BRW_EXECUTE_4);
+ brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
+ brw_imm_ud(1));
+
+ inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
+ brw_imm_ud(0));
+ brw_pop_insn_state(p);
+ /* Only the second MOV (writing zero) is execution-masked. */
+ brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
+ }
+ }
+
+ brw_pop_insn_state(p);
+}
+
+/**
+ * Emit code that copies the component of \p src selected by \p idx into
+ * \p dst.  \p src must be a directly addressed GRF.
+ *
+ * A plain MOV is used when the source is already uniform or the index is an
+ * immediate.  Otherwise Align1 mode uses indirect addressing and Align16
+ * (SIMD4x2) mode uses a flag register and a predicated SEL.
+ */
+void
+brw_broadcast(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg src,
+ struct brw_reg idx)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
+ brw_inst *inst;
+
+ /* Emit unmasked, one channel wide in Align1 mode and four in Align16. */
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);
+
+ assert(src.file == BRW_GENERAL_REGISTER_FILE &&
+ src.address_mode == BRW_ADDRESS_DIRECT);
+
+ if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
+ idx.file == BRW_IMMEDIATE_VALUE) {
+ /* Trivial, the source is already uniform or the index is a constant.
+ * We will typically not get here if the optimizer is doing its job, but
+ * asserting would be mean.
+ */
+ const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
+ brw_MOV(p, dst,
+ (align1 ? stride(suboffset(src, i), 0, 1, 0) :
+ stride(suboffset(src, 4 * i), 0, 4, 1)));
+ } else {
+ if (align1) {
+ const struct brw_reg addr =
+ retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
+ /* Byte offset of the source region from the start of the GRF file. */
+ const unsigned offset = src.nr * REG_SIZE + src.subnr;
+ /* Limit in bytes of the signed indirect addressing immediate. */
+ const unsigned limit = 512;
+
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+
+ /* Take into account the component size and horizontal stride. */
+ assert(src.vstride == src.hstride + src.width);
+ brw_SHL(p, addr, vec1(idx),
+ brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
+ src.hstride - 1));
+
+ /* We can only address up to limit bytes using the indirect
+ * addressing immediate, account for the difference if the source
+ * register is above this limit.
+ */
+ if (offset >= limit)
+ brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
+
+ brw_pop_insn_state(p);
+
+ /* Use indirect addressing to fetch the specified component. */
+ brw_MOV(p, dst,
+ retype(brw_vec1_indirect(addr.subnr, offset % limit),
+ src.type));
+ } else {
+ /* In SIMD4x2 mode the index can be either zero or one, replicate it
+ * to all bits of a flag register,
+ */
+ inst = brw_MOV(p,
+ brw_null_reg(),
+ stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
+ brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
+ brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
+ brw_inst_set_flag_reg_nr(devinfo, inst, 1);
+
+ /* and use predicated SEL to pick the right channel. */
+ inst = brw_SEL(p, dst,
+ stride(suboffset(src, 4), 4, 4, 1),
+ stride(src, 4, 4, 1));
+ brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
+ brw_inst_set_flag_reg_nr(devinfo, inst, 1);
+ }
+ }
+
+ brw_pop_insn_state(p);
+}
+
+/**
+ * This instruction is generated as a single-channel align1 instruction by
+ * both the VS and FS stages when using INTEL_DEBUG=shader_time.
+ *
+ * We can't use the typed atomic op in the FS because that has the execution
+ * mask ANDed with the pixel mask, but we just want to write the one dword for
+ * all the pixels.
+ *
+ * We don't use the SIMD4x2 atomic ops in the VS because we just want to
+ * write one u32. So we use the same untyped atomic write message as the
+ * pixel shader.
+ *
+ * The untyped atomic operation requires a BUFFER surface type with RAW
+ * format, and is only accessible through the legacy DATA_CACHE dataport
+ * messages.
+ *
+ * \p payload supplies the register number (and file) of the message payload
+ * and \p surf_index is the binding table index of the surface to update.
+ * Requires Gen7 or later.
+ */
+void brw_shader_time_add(struct brw_codegen *p,
+ struct brw_reg payload,
+ uint32_t surf_index)
+{
+ const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
+ HSW_SFID_DATAPORT_DATA_CACHE_1 :
+ GEN7_SFID_DATAPORT_DATA_CACHE);
+ assert(p->devinfo->gen >= 7);
+
+ /* Force Align1, unmasked, uncompressed state for this one instruction. */
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+
+ /* We use brw_vec1_reg and unmasked because we want to increment the given
+ * offset only once.
+ */
+ brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_NULL, 0));
+ brw_set_src0(p, send, brw_vec1_reg(payload.file,
+ payload.nr, 0));
+ brw_set_src1(p, send, brw_imm_ud(0));
+ /* Message length of two, no response, no header, not end-of-thread. */
+ brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
+ brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
+ /* Atomic ADD with no return data expected. */
+ brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);
+
+ brw_pop_insn_state(p);
+}
+
+
+/**
+ * Emit the SEND message for a barrier
+ *
+ * The message is sent to the message gateway with \p src as its
+ * one-register payload and a null destination.  Requires Gen7 or later.
+ */
+void
+brw_barrier(struct brw_codegen *p, struct brw_reg src)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ struct brw_inst *inst;
+
+ assert(devinfo->gen >= 7);
+
+ /* The send is always emitted in Align1 mode; state is restored below. */
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ inst = next_insn(p, BRW_OPCODE_SEND);
+ brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
+ brw_set_src0(p, inst, src);
+ brw_set_src1(p, inst, brw_null_reg());
+
+ brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
+ 1 /* msg_length */,
+ 0 /* response_length */,
+ false /* header_present */,
+ false /* end_of_thread */);
+
+ /* Select the barrier sub-function and set the gateway notify field. */
+ brw_inst_set_gateway_notify(devinfo, inst, 1);
+ brw_inst_set_gateway_subfuncid(devinfo, inst,
+ BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
+
+ /* The barrier message itself is sent with execution masking disabled. */
+ brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
+ brw_pop_insn_state(p);
+}
+
+
+/**
+ * Emit the wait instruction for a barrier
+ *
+ * The notification register is used as both destination and first source;
+ * the second source is the null register.
+ */
+void
+brw_WAIT(struct brw_codegen *p)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ struct brw_inst *insn;
+
+ struct brw_reg src = brw_notification_reg();
+
+ insn = next_insn(p, BRW_OPCODE_WAIT);
+ brw_set_dest(p, insn, src);
+ brw_set_src0(p, insn, src);
+ brw_set_src1(p, insn, brw_null_reg());
+
+ /* Force a single-channel, unmasked instruction regardless of defaults. */
+ brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
+ brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
+}
diff --git a/src/intel/compiler/brw_eu_util.c b/src/intel/compiler/brw_eu_util.c
new file mode 100644
index 00000000000..8c84cb45008
--- /dev/null
+++ b/src/intel/compiler/brw_eu_util.c
@@ -0,0 +1,123 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <[email protected]>
+ */
+
+
+#include "brw_eu_defines.h"
+#include "brw_eu.h"
+
+
+/**
+ * Emit a full-precision gen4 math INV operation of \p src into \p dst.
+ */
+void brw_math_invert(struct brw_codegen *p,
+                     struct brw_reg dst,
+                     struct brw_reg src)
+{
+   gen4_math(p, dst, BRW_MATH_FUNCTION_INV, 0, src, BRW_MATH_PRECISION_FULL);
+}
+
+
+
+/**
+ * Emit MOVs copying \p count 32-byte chunks from \p src to \p dst, each
+ * chunk as two vec4 moves 16 bytes apart.
+ */
+void brw_copy4(struct brw_codegen *p,
+               struct brw_reg dst,
+               struct brw_reg src,
+               unsigned count)
+{
+   const struct brw_reg dst4 = vec4(dst);
+   const struct brw_reg src4 = vec4(src);
+
+   for (unsigned chunk = 0; chunk < count; chunk++) {
+      const unsigned lo = chunk * 32;
+      const unsigned hi = lo + 16;
+
+      brw_MOV(p, byte_offset(dst4, lo), byte_offset(src4, lo));
+      brw_MOV(p, byte_offset(dst4, hi), byte_offset(src4, hi));
+   }
+}
+
+
+/**
+ * Emit MOVs copying \p count 32-byte chunks from \p src to \p dst, each
+ * chunk as a single vec8 move.
+ */
+void brw_copy8(struct brw_codegen *p,
+               struct brw_reg dst,
+               struct brw_reg src,
+               unsigned count)
+{
+   const struct brw_reg dst8 = vec8(dst);
+   const struct brw_reg src8 = vec8(src);
+
+   for (unsigned chunk = 0; chunk < count; chunk++) {
+      const unsigned byte_delta = chunk * 32;
+
+      brw_MOV(p, byte_offset(dst8, byte_delta),
+              byte_offset(src8, byte_delta));
+   }
+}
+
+
+/**
+ * Emit MOVs copying \p count 32-byte chunks between two indirectly
+ * addressed locations, each chunk as two deref_4f moves 16 bytes apart.
+ */
+void brw_copy_indirect_to_indirect(struct brw_codegen *p,
+                                   struct brw_indirect dst_ptr,
+                                   struct brw_indirect src_ptr,
+                                   unsigned count)
+{
+   for (unsigned chunk = 0; chunk < count; chunk++) {
+      const unsigned lo = chunk * 32;
+      const unsigned hi = lo + 16;
+
+      brw_MOV(p, deref_4f(dst_ptr, lo), deref_4f(src_ptr, lo));
+      brw_MOV(p, deref_4f(dst_ptr, hi), deref_4f(src_ptr, hi));
+   }
+}
+
+
+/**
+ * Emit MOVs copying \p count 32-byte chunks from the indirectly addressed
+ * location \p ptr into \p dst, each chunk as two moves 16 bytes apart.
+ */
+void brw_copy_from_indirect(struct brw_codegen *p,
+                            struct brw_reg dst,
+                            struct brw_indirect ptr,
+                            unsigned count)
+{
+   const struct brw_reg dst4 = vec4(dst);
+
+   for (unsigned chunk = 0; chunk < count; chunk++) {
+      const unsigned lo = chunk * 32;
+      const unsigned hi = lo + 16;
+
+      brw_MOV(p, byte_offset(dst4, lo), deref_4f(ptr, lo));
+      brw_MOV(p, byte_offset(dst4, hi), deref_4f(ptr, hi));
+   }
+}
+
+
+
+
diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c
new file mode 100644
index 00000000000..64615af44ac
--- /dev/null
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -0,0 +1,1051 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_eu_validate.c
+ *
+ * This file implements a pass that validates shader assembly.
+ */
+
+#include "brw_eu.h"
+
+/* We're going to do lots of string concatenation, so this should help.
+ *
+ * A length-tracked string: cat() grows str with realloc() and keeps it
+ * NUL-terminated.  The validation helpers start from { NULL, 0 } and treat
+ * a NULL str as "no message".
+ */
+struct string {
+ /* Heap buffer holding the text, or NULL when nothing has been appended. */
+ char *str;
+ /* Number of characters in str, not counting the terminating NUL. */
+ size_t len;
+};
+
+/**
+ * Append \p src to \p dest, growing dest->str as needed and keeping it
+ * NUL-terminated.
+ *
+ * If the buffer cannot be grown, \p dest is left exactly as it was (still
+ * valid and still owned by the caller) and \p src is not appended.
+ */
+static void
+cat(struct string *dest, const struct string src)
+{
+   const size_t new_len = dest->len + src.len;
+
+   /* Grow through a temporary: assigning realloc()'s result straight to
+    * dest->str would leak the old buffer on failure and the memcpy below
+    * would then write through a NULL-based pointer.
+    */
+   char *grown = realloc(dest->str, new_len + 1);
+   if (grown == NULL)
+      return;
+
+   memcpy(grown + dest->len, src.str, src.len);
+   grown[new_len] = '\0';
+
+   dest->str = grown;
+   dest->len = new_len;
+}
+/* Append the NUL-terminated C string src to the struct string dest. */
+#define CAT(dest, src) cat(&dest, (struct string){src, strlen(src)})
+
+/* Wrap a string literal in the "\tERROR: ...\n" framing used for messages. */
+#define error(str) "\tERROR: " str "\n"
+/* Indentation string sized to line up with the text after "\tERROR: ". */
+#define ERROR_INDENT "\t "
+
+/* Unconditionally append msg to the local variable named error_msg. */
+#define ERROR(msg) ERROR_IF(true, msg)
+/* Append msg to the local variable named error_msg when cond is true.
+ * Must be used in a scope that declares a struct string error_msg.
+ */
+#define ERROR_IF(cond, msg) \
+ do { \
+ if (cond) { \
+ CAT(error_msg, error(msg)); \
+ } \
+ } while(0)
+
+/* Call the check function func(devinfo, inst, args...), which returns a
+ * struct string.  A non-NULL result is appended to the local error_msg and
+ * its buffer is then freed.  Relies on devinfo, inst and error_msg being in
+ * scope at the point of use.
+ */
+#define CHECK(func, args...) \
+ do { \
+ struct string __msg = func(devinfo, inst, ##args); \
+ if (__msg.str) { \
+ cat(&error_msg, __msg); \
+ free(__msg.str); \
+ } \
+ } while (0)
+
+/* Returns true if the opcode of inst is one of SEND, SENDC, SENDS or SENDSC. */
+static bool
+inst_is_send(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   const unsigned opcode = brw_inst_opcode(devinfo, inst);
+
+   return opcode == BRW_OPCODE_SEND ||
+          opcode == BRW_OPCODE_SENDC ||
+          opcode == BRW_OPCODE_SENDS ||
+          opcode == BRW_OPCODE_SENDSC;
+}
+
+/* Maps the unsigned hardware register types UD, UW, UB and UQ to their
+ * signed counterparts; any other type is returned unchanged.
+ */
+static unsigned
+signed_type(unsigned type)
+{
+   if (type == BRW_HW_REG_TYPE_UD)
+      return BRW_HW_REG_TYPE_D;
+
+   if (type == BRW_HW_REG_TYPE_UW)
+      return BRW_HW_REG_TYPE_W;
+
+   if (type == BRW_HW_REG_NON_IMM_TYPE_UB)
+      return BRW_HW_REG_NON_IMM_TYPE_B;
+
+   if (type == GEN8_HW_REG_TYPE_UQ)
+      return GEN8_HW_REG_TYPE_Q;
+
+   return type;
+}
+
+/* Returns true if inst is a MOV without saturation and without source
+ * modifiers on a non-immediate src0, whose destination and src0 types are
+ * equal once signedness is ignored.
+ */
+static bool
+inst_is_raw_move(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   /* Source modifiers are only looked at for non-immediate sources. */
+   if (brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE) {
+      if (brw_inst_src0_negate(devinfo, inst) ||
+          brw_inst_src0_abs(devinfo, inst))
+         return false;
+   }
+
+   if (brw_inst_opcode(devinfo, inst) != BRW_OPCODE_MOV)
+      return false;
+
+   if (brw_inst_saturate(devinfo, inst) != 0)
+      return false;
+
+   return signed_type(brw_inst_dst_reg_type(devinfo, inst)) ==
+          signed_type(brw_inst_src0_reg_type(devinfo, inst));
+}
+
+/* Returns true if the destination of inst is the ARF null register. */
+static bool
+dst_is_null(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   if (brw_inst_dst_reg_file(devinfo, inst) != BRW_ARCHITECTURE_REGISTER_FILE)
+      return false;
+
+   return brw_inst_dst_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
+}
+
+/* Returns true if src0 of inst is the ARF null register. */
+static bool
+src0_is_null(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   if (brw_inst_src0_reg_file(devinfo, inst) != BRW_ARCHITECTURE_REGISTER_FILE)
+      return false;
+
+   return brw_inst_src0_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
+}
+
+/* Returns true if src1 of inst is the ARF null register. */
+static bool
+src1_is_null(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   if (brw_inst_src1_reg_file(devinfo, inst) != BRW_ARCHITECTURE_REGISTER_FILE)
+      return false;
+
+   return brw_inst_src1_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
+}
+
+/* Returns true if src0 of inst lives in the general register file. */
+static bool
+src0_is_grf(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   const unsigned file = brw_inst_src0_reg_file(devinfo, inst);
+
+   return file == BRW_GENERAL_REGISTER_FILE;
+}
+
+/* Returns true if src0 of inst uses the scalar region <0;1,0>. */
+static bool
+src0_has_scalar_region(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   if (brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0)
+      return false;
+
+   if (brw_inst_src0_width(devinfo, inst) != BRW_WIDTH_1)
+      return false;
+
+   return brw_inst_src0_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0;
+}
+
+/* Returns true if src1 of inst uses the scalar region <0;1,0>. */
+static bool
+src1_has_scalar_region(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   if (brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0)
+      return false;
+
+   if (brw_inst_src1_width(devinfo, inst) != BRW_WIDTH_1)
+      return false;
+
+   return brw_inst_src1_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0;
+}
+
+/* Returns the number of sources the validator expects inst to have.
+ *
+ * MATH instructions are classified by their math function, pre-Gen6 SEND
+ * instructions are special-cased, and everything else uses the source count
+ * from the opcode description table.
+ */
+static unsigned
+num_sources_from_inst(const struct gen_device_info *devinfo,
+                      const brw_inst *inst)
+{
+   const unsigned opcode = brw_inst_opcode(devinfo, inst);
+
+   if (opcode != BRW_OPCODE_MATH) {
+      if (devinfo->gen < 6 && opcode == BRW_OPCODE_SEND) {
+         /* For a math SEND, src1 must be a descriptor (including the
+          * information to determine that the SEND is doing an extended math
+          * operation), but src0 can actually be null since it serves as the
+          * source of the implicit GRF to MRF move.
+          *
+          * If we stop using that functionality, we'll have to revisit this.
+          *
+          * Other send instructions are allowed to have null sources since
+          * they use the base_mrf field to specify which message register
+          * source.
+          */
+         return brw_inst_sfid(devinfo, inst) == BRW_SFID_MATH ? 2 : 0;
+      }
+
+      const struct opcode_desc *desc = brw_opcode_desc(devinfo, opcode);
+
+      assert(desc->nsrc < 4);
+      return desc->nsrc;
+   }
+
+   switch (brw_inst_math_function(devinfo, inst)) {
+   /* Two-operand math functions. */
+   case BRW_MATH_FUNCTION_FDIV:
+   case BRW_MATH_FUNCTION_POW:
+   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
+   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
+   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
+      return 2;
+   /* Single-operand math functions. */
+   case BRW_MATH_FUNCTION_INV:
+   case BRW_MATH_FUNCTION_LOG:
+   case BRW_MATH_FUNCTION_EXP:
+   case BRW_MATH_FUNCTION_SQRT:
+   case BRW_MATH_FUNCTION_RSQ:
+   case BRW_MATH_FUNCTION_SIN:
+   case BRW_MATH_FUNCTION_COS:
+   case BRW_MATH_FUNCTION_SINCOS:
+   case GEN8_MATH_FUNCTION_INVM:
+   case GEN8_MATH_FUNCTION_RSQRTM:
+      return 1;
+   default:
+      unreachable("not reached");
+   }
+}
+
+/* Checks that the sources inst is expected to have are not the null
+ * register.  Returns the accumulated error text, with a NULL str when no
+ * problem was found.
+ */
+static struct string
+sources_not_null(const struct gen_device_info *devinfo,
+                 const brw_inst *inst)
+{
+   struct string error_msg = { .str = NULL, .len = 0 };
+   const unsigned num_sources = num_sources_from_inst(devinfo, inst);
+
+   /* Nothing to test. 3-src instructions can only have GRF sources, and
+    * there's no bit to control the file.
+    */
+   if (num_sources == 3)
+      return error_msg;
+
+   ERROR_IF(num_sources >= 1 && src0_is_null(devinfo, inst),
+            "src0 is null");
+   ERROR_IF(num_sources == 2 && src1_is_null(devinfo, inst),
+            "src1 is null");
+
+   return error_msg;
+}
+
+/* Checks the restrictions that apply to SEND instructions.  Returns the
+ * accumulated error text, with a NULL str when no problem was found or
+ * when inst is not a SEND.
+ */
+static struct string
+send_restrictions(const struct gen_device_info *devinfo,
+                  const brw_inst *inst)
+{
+   struct string error_msg = { .str = NULL, .len = 0 };
+
+   if (brw_inst_opcode(devinfo, inst) != BRW_OPCODE_SEND)
+      return error_msg;
+
+   ERROR_IF(brw_inst_src0_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT,
+            "send must use direct addressing");
+
+   /* The remaining restrictions are only checked on Gen7 and later. */
+   if (devinfo->gen < 7)
+      return error_msg;
+
+   ERROR_IF(!src0_is_grf(devinfo, inst), "send from non-GRF");
+   ERROR_IF(brw_inst_eot(devinfo, inst) &&
+            brw_inst_src0_da_reg_nr(devinfo, inst) < 112,
+            "send with EOT must use g112-g127");
+
+   return error_msg;
+}
+
+/**
+ * Returns true when brw_opcode_desc() has no descriptor for the opcode of
+ * \p inst on \p devinfo.
+ */
+static bool
+is_unsupported_inst(const struct gen_device_info *devinfo,
+                    const brw_inst *inst)
+{
+   const struct opcode_desc *desc =
+      brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst));
+
+   return desc == NULL;
+}
+
+/**
+ * Maps a hardware register type encoding to the type it executes as.
+ *
+ * \p is_immediate selects how \p type is interpreted.  Immediate-only
+ * encodings are first rewritten to their register equivalents, then
+ * unsigned/signed pairs collapse onto the signed type and byte types are
+ * widened to word.
+ */
+static unsigned
+execution_type_for_type(unsigned type, bool is_immediate)
+{
+   /* The meaning of the type bits is dependent on whether the operand is an
+    * immediate, so normalize them first.
+    */
+   unsigned normalized = type;
+
+   if (is_immediate) {
+      if (type == BRW_HW_REG_IMM_TYPE_UV || type == BRW_HW_REG_IMM_TYPE_V)
+         normalized = BRW_HW_REG_TYPE_W;
+      else if (type == BRW_HW_REG_IMM_TYPE_VF)
+         normalized = BRW_HW_REG_TYPE_F;
+      else if (type == GEN8_HW_REG_IMM_TYPE_DF)
+         normalized = GEN7_HW_REG_NON_IMM_TYPE_DF;
+      else if (type == GEN8_HW_REG_IMM_TYPE_HF)
+         normalized = GEN8_HW_REG_NON_IMM_TYPE_HF;
+   }
+
+   switch (normalized) {
+   case BRW_HW_REG_TYPE_F:
+   case GEN7_HW_REG_NON_IMM_TYPE_DF:
+   case GEN8_HW_REG_NON_IMM_TYPE_HF:
+      /* Floating-point types execute as themselves. */
+      return normalized;
+   case GEN8_HW_REG_TYPE_UQ:
+   case GEN8_HW_REG_TYPE_Q:
+      return GEN8_HW_REG_TYPE_Q;
+   case BRW_HW_REG_TYPE_UD:
+   case BRW_HW_REG_TYPE_D:
+      return BRW_HW_REG_TYPE_D;
+   case BRW_HW_REG_TYPE_UW:
+   case BRW_HW_REG_TYPE_W:
+   case BRW_HW_REG_NON_IMM_TYPE_UB:
+   case BRW_HW_REG_NON_IMM_TYPE_B:
+      return BRW_HW_REG_TYPE_W;
+   default:
+      unreachable("not reached");
+   }
+}
+
+/**
+ * Returns the execution type of an instruction \p inst
+ *
+ * The result is derived from the normalized source types.  The destination
+ * type only matters for half-float sources on Gen9+ and Cherryview.
+ */
+static unsigned
+execution_type(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   const unsigned num_sources = num_sources_from_inst(devinfo, inst);
+   const bool mixed_float_hw = devinfo->gen >= 9 || devinfo->is_cherryview;
+
+   /* Execution data type is independent of destination data type, except in
+    * mixed F/HF instructions on CHV and SKL+.
+    */
+   const unsigned dst_exec_type = brw_inst_dst_reg_type(devinfo, inst);
+
+   const unsigned src0_exec_type = execution_type_for_type(
+      brw_inst_src0_reg_type(devinfo, inst),
+      brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE);
+
+   if (num_sources == 1) {
+      const bool src0_is_hf = src0_exec_type == GEN8_HW_REG_NON_IMM_TYPE_HF;
+
+      return (mixed_float_hw && src0_is_hf) ? dst_exec_type : src0_exec_type;
+   }
+
+   /* src1 is only normalized past this point, so a single-source instruction
+    * never has its src1 type bits interpreted.
+    */
+   const unsigned src1_exec_type = execution_type_for_type(
+      brw_inst_src1_reg_type(devinfo, inst),
+      brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE);
+
+   if (src0_exec_type == src1_exec_type)
+      return src0_exec_type;
+
+   const bool any_src_is_float = src0_exec_type == BRW_HW_REG_TYPE_F ||
+                                 src1_exec_type == BRW_HW_REG_TYPE_F;
+
+   /* Mixed operand types where one is float is float on Gen < 6
+    * (and not allowed on later platforms)
+    */
+   if (devinfo->gen < 6 && any_src_is_float)
+      return BRW_HW_REG_TYPE_F;
+
+   /* For the remaining mixed cases the first type in this list that either
+    * source executes as wins.
+    */
+   const unsigned precedence[] = {
+      GEN8_HW_REG_TYPE_Q,
+      BRW_HW_REG_TYPE_D,
+      BRW_HW_REG_TYPE_W,
+      GEN7_HW_REG_NON_IMM_TYPE_DF,
+   };
+
+   for (unsigned k = 0; k < sizeof(precedence) / sizeof(precedence[0]); k++) {
+      if (src0_exec_type == precedence[k] || src1_exec_type == precedence[k])
+         return precedence[k];
+   }
+
+   if (mixed_float_hw) {
+      if (dst_exec_type == BRW_HW_REG_TYPE_F || any_src_is_float)
+         return BRW_HW_REG_TYPE_F;
+
+      return GEN8_HW_REG_NON_IMM_TYPE_HF;
+   }
+
+   assert(src0_exec_type == BRW_HW_REG_TYPE_F);
+   return BRW_HW_REG_TYPE_F;
+}
+
+/**
+ * Returns whether a region is packed
+ *
+ * A region is packed if its elements are adjacent in memory, with no
+ * intervening space, no overlap, and no replicated values.  That requires
+ * VertStride to equal Width, with a HorzStride of 0 for a one-element row
+ * and 1 otherwise.
+ */
+static bool
+is_packed(unsigned vstride, unsigned width, unsigned hstride)
+{
+   if (vstride != width)
+      return false;
+
+   const unsigned packed_hstride = (vstride == 1) ? 0 : 1;
+
+   return hstride == packed_hstride;
+}
+
+/**
+ * Checks restrictions listed in "General Restrictions Based on Operand Types"
+ * in the "Register Region Restrictions" section.
+ *
+ * The checks are skipped (an empty string is returned) for three-source
+ * instructions, sends, instructions whose execution size is 1 and
+ * instructions without a destination.  Otherwise the accumulated error text
+ * is returned, which is empty when every rule holds.
+ */
+static struct string
+general_restrictions_based_on_operand_types(const struct gen_device_info *devinfo,
+                                            const brw_inst *inst)
+{
+   const struct opcode_desc *desc =
+      brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst));
+   unsigned num_sources = num_sources_from_inst(devinfo, inst);
+   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
+   struct string error_msg = { .str = NULL, .len = 0 };
+
+   if (num_sources == 3)
+      return (struct string){};
+
+   if (inst_is_send(devinfo, inst))
+      return (struct string){};
+
+   if (exec_size == 1)
+      return (struct string){};
+
+   if (desc->ndst == 0)
+      return (struct string){};
+
+   /* The PRMs say:
+    *
+    *    Where n is the largest element size in bytes for any source or
+    *    destination operand type, ExecSize * n must be <= 64.
+    *
+    * But we do not attempt to enforce it, because it is implied by other
+    * rules:
+    *
+    *    - that the destination stride must match the execution data type
+    *    - sources may not span more than two adjacent GRF registers
+    *    - destination may not span more than two adjacent GRF registers
+    *
+    * In fact, checking it would weaken testing of the other rules.
+    */
+
+   /* A horizontal-stride field of 0 means a stride of 0, exactly as the
+    * source regions are decoded elsewhere in this file.  Decoding it with an
+    * unconditional "1 << (field - 1)" would shift by -1, which is undefined
+    * behaviour.
+    */
+   const unsigned dst_hstride_field = brw_inst_dst_hstride(devinfo, inst);
+   unsigned dst_stride =
+      dst_hstride_field ? 1 << (dst_hstride_field - 1) : 0;
+   bool dst_type_is_byte =
+      brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_NON_IMM_TYPE_B ||
+      brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_NON_IMM_TYPE_UB;
+
+   if (dst_type_is_byte) {
+      if (is_packed(exec_size * dst_stride, exec_size, dst_stride)) {
+         if (!inst_is_raw_move(devinfo, inst)) {
+            ERROR("Only raw MOV supports a packed-byte destination");
+            return error_msg;
+         } else {
+            return (struct string){};
+         }
+      }
+   }
+
+   unsigned exec_type = execution_type(devinfo, inst);
+   unsigned exec_type_size =
+      brw_hw_reg_type_to_size(devinfo, exec_type, BRW_GENERAL_REGISTER_FILE);
+   unsigned dst_type_size = brw_element_size(devinfo, inst, dst);
+
+   if (exec_type_size > dst_type_size) {
+      ERROR_IF(dst_stride * dst_type_size != exec_type_size,
+               "Destination stride must be equal to the ratio of the sizes of "
+               "the execution data type to the destination type");
+
+      unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
+
+      /* The subregister alignment rules are only checked for directly
+       * addressed Align1 destinations.
+       */
+      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1 &&
+          brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
+         /* The i965 PRM says:
+          *
+          *    Implementation Restriction: The relaxed alignment rule for byte
+          *    destination (#10.5) is not supported.
+          */
+         if ((devinfo->gen > 4 || devinfo->is_g4x) && dst_type_is_byte) {
+            ERROR_IF(subreg % exec_type_size != 0 &&
+                     subreg % exec_type_size != 1,
+                     "Destination subreg must be aligned to the size of the "
+                     "execution data type (or to the next lowest byte for byte "
+                     "destinations)");
+         } else {
+            ERROR_IF(subreg % exec_type_size != 0,
+                     "Destination subreg must be aligned to the size of the "
+                     "execution data type");
+         }
+      }
+   }
+
+   return error_msg;
+}
+
+/**
+ * Checks restrictions listed in "General Restrictions on Regioning Parameters"
+ * in the "Register Region Restrictions" section.
+ *
+ * Three-source instructions are skipped.  In Align16 mode only the
+ * destination horizontal stride and the source vertical strides are checked.
+ * In Align1 mode every non-immediate source region is checked against the
+ * ExecSize/Width/HorzStride/VertStride rules, followed by the destination
+ * horizontal stride.  Returns the accumulated error text.
+ */
+static struct string
+general_restrictions_on_region_parameters(const struct gen_device_info *devinfo,
+                                          const brw_inst *inst)
+{
+   const struct opcode_desc *desc =
+      brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst));
+   unsigned num_sources = num_sources_from_inst(devinfo, inst);
+   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
+   struct string error_msg = { .str = NULL, .len = 0 };
+
+   if (num_sources == 3)
+      return (struct string){};
+
+   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16) {
+      if (desc->ndst != 0 && !dst_is_null(devinfo, inst))
+         ERROR_IF(brw_inst_dst_hstride(devinfo, inst) != BRW_HORIZONTAL_STRIDE_1,
+                  "Destination Horizontal Stride must be 1");
+
+      if (num_sources >= 1) {
+         if (devinfo->is_haswell || devinfo->gen >= 8) {
+            ERROR_IF(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
+                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
+                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_2 &&
+                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
+                     "In Align16 mode, only VertStride of 0, 2, or 4 is allowed");
+         } else {
+            ERROR_IF(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
+                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
+                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
+                     "In Align16 mode, only VertStride of 0 or 4 is allowed");
+         }
+      }
+
+      if (num_sources == 2) {
+         if (devinfo->is_haswell || devinfo->gen >= 8) {
+            ERROR_IF(brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
+                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
+                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_2 &&
+                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
+                     "In Align16 mode, only VertStride of 0, 2, or 4 is allowed");
+         } else {
+            ERROR_IF(brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
+                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
+                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
+                     "In Align16 mode, only VertStride of 0 or 4 is allowed");
+         }
+      }
+
+      return error_msg;
+   }
+
+   for (unsigned i = 0; i < num_sources; i++) {
+      unsigned vstride, width, hstride, element_size, subreg;
+
+      /* Decodes the region of source n into the locals above.  Immediate
+       * sources have no region, so they skip to the next source.
+       */
+#define DO_SRC(n)                                                              \
+      if (brw_inst_src ## n ## _reg_file(devinfo, inst) ==                     \
+          BRW_IMMEDIATE_VALUE)                                                 \
+         continue;                                                             \
+                                                                               \
+      vstride = brw_inst_src ## n ## _vstride(devinfo, inst) ?                 \
+                (1 << (brw_inst_src ## n ## _vstride(devinfo, inst) - 1)) : 0; \
+      width = 1 << brw_inst_src ## n ## _width(devinfo, inst);                 \
+      hstride = brw_inst_src ## n ## _hstride(devinfo, inst) ?                 \
+                (1 << (brw_inst_src ## n ## _hstride(devinfo, inst) - 1)) : 0; \
+      element_size = brw_element_size(devinfo, inst, src ## n);                \
+      subreg = brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst)
+
+      if (i == 0) {
+         DO_SRC(0);
+      } else if (i == 1) {
+         DO_SRC(1);
+      }
+#undef DO_SRC
+
+      /* ExecSize must be greater than or equal to Width. */
+      ERROR_IF(exec_size < width, "ExecSize must be greater than or equal "
+                                  "to Width");
+
+      /* If ExecSize = Width and HorzStride ≠ 0,
+       * VertStride must be set to Width * HorzStride.
+       */
+      if (exec_size == width && hstride != 0) {
+         ERROR_IF(vstride != width * hstride,
+                  "If ExecSize = Width and HorzStride ≠ 0, "
+                  "VertStride must be set to Width * HorzStride");
+      }
+
+      /* If Width = 1, HorzStride must be 0 regardless of the values of
+       * ExecSize and VertStride.
+       */
+      if (width == 1) {
+         ERROR_IF(hstride != 0,
+                  "If Width = 1, HorzStride must be 0 regardless "
+                  "of the values of ExecSize and VertStride");
+      }
+
+      /* If ExecSize = Width = 1, both VertStride and HorzStride must be 0. */
+      if (exec_size == 1 && width == 1) {
+         ERROR_IF(vstride != 0 || hstride != 0,
+                  "If ExecSize = Width = 1, both VertStride "
+                  "and HorzStride must be 0");
+      }
+
+      /* If VertStride = HorzStride = 0, Width must be 1 regardless of the
+       * value of ExecSize.
+       */
+      if (vstride == 0 && hstride == 0) {
+         ERROR_IF(width != 1,
+                  "If VertStride = HorzStride = 0, Width must be "
+                  "1 regardless of the value of ExecSize");
+      }
+
+      /* VertStride must be used to cross GRF register boundaries. This rule
+       * implies that elements within a 'Width' cannot cross GRF boundaries.
+       *
+       * Each row is folded into a 64-bit mask with one bit per byte: the low
+       * 32 bits stand for the first register, the high 32 bits for the next.
+       */
+      const uint64_t mask = (1ULL << element_size) - 1;
+      unsigned rowbase = subreg;
+
+      for (unsigned y = 0; y < exec_size / width; y++) {
+         uint64_t access_mask = 0;
+         unsigned offset = rowbase;
+
+         for (unsigned x = 0; x < width; x++) {
+            /* A malformed region can walk past byte 63.  Shifting a 64-bit
+             * value by 64 or more is undefined, so wrap the shift count.
+             */
+            access_mask |= mask << (offset % 64);
+            offset += hstride * element_size;
+         }
+
+         rowbase += vstride * element_size;
+
+         if ((uint32_t)access_mask != 0 && (access_mask >> 32) != 0) {
+            ERROR("VertStride must be used to cross GRF register boundaries");
+            break;
+         }
+      }
+   }
+
+   /* Dst.HorzStride must not be 0. */
+   if (desc->ndst != 0 && !dst_is_null(devinfo, inst)) {
+      ERROR_IF(brw_inst_dst_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0,
+               "Destination Horizontal Stride must not be 0");
+   }
+
+   return error_msg;
+}
+
+/**
+ * Creates an \p access_mask for an \p exec_size, \p element_size, and a region
+ *
+ * An \p access_mask is a 32-element array of uint64_t, where each uint64_t is
+ * a bitmask of bytes accessed by the region.
+ *
+ * For instance the access mask of the source gX.1<4,2,2>F in an exec_size = 4
+ * instruction would be
+ *
+ *    access_mask[0] = 0x00000000000000F0
+ *    access_mask[1] = 0x000000000000F000
+ *    access_mask[2] = 0x0000000000F00000
+ *    access_mask[3] = 0x00000000F0000000
+ *    access_mask[4-31] = 0
+ *
+ * because the first execution channel accesses bytes 7-4 and the second
+ * execution channel accesses bytes 15-12, etc.
+ *
+ * Only the first \p exec_size entries are written, and none at all when
+ * \p width exceeds \p exec_size.  The caller is expected to have zeroed the
+ * array.  Byte offsets of 64 or more wrap around modulo 64 rather than
+ * invoking an undefined shift.
+ */
+static void
+align1_access_mask(uint64_t access_mask[static 32],
+                   unsigned exec_size, unsigned element_size, unsigned subreg,
+                   unsigned vstride, unsigned width, unsigned hstride)
+{
+   const uint64_t mask = (1ULL << element_size) - 1;
+   unsigned rowbase = subreg;
+   unsigned element = 0;
+
+   for (unsigned y = 0; y < exec_size / width; y++) {
+      unsigned offset = rowbase;
+
+      for (unsigned x = 0; x < width; x++) {
+         /* This can be reached with regions running past two registers
+          * (offset >= 64).  Shifting a uint64_t by its width or more is
+          * undefined, so the shift count is wrapped.
+          */
+         access_mask[element++] = mask << (offset % 64);
+         offset += hstride * element_size;
+      }
+
+      rowbase += vstride * element_size;
+   }
+
+   assert(element == 0 || element == exec_size);
+}
+
+/**
+ * Returns the number of registers accessed according to the \p access_mask
+ *
+ * The result is 2 as soon as any entry has a bit set above bit 31, 1 when
+ * some entry is non-zero but none reaches that far, and 0 for an all-zero
+ * mask.
+ */
+static int
+registers_read(const uint64_t access_mask[static 32])
+{
+   bool touches_first_reg = false;
+
+   for (unsigned channel = 0; channel < 32; channel++) {
+      const uint64_t bytes = access_mask[channel];
+
+      if ((bytes >> 32) != 0)
+         return 2;
+
+      if (bytes != 0)
+         touches_first_reg = true;
+   }
+
+   return touches_first_reg ? 1 : 0;
+}
+
+/**
+ * Checks restrictions listed in "Region Alignment Rules" in the "Register
+ * Region Restrictions" section.
+ *
+ * Three-source, Align16 and send instructions are skipped.  For everything
+ * else a per-channel byte access mask is built for each directly addressed,
+ * non-immediate source and for the destination, and the Gen-specific rules
+ * on how many registers each region may span are checked against those
+ * masks.  Returns the accumulated error text.
+ */
+static struct string
+region_alignment_rules(const struct gen_device_info *devinfo,
+                       const brw_inst *inst)
+{
+   const struct opcode_desc *desc =
+      brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst));
+   unsigned num_sources = num_sources_from_inst(devinfo, inst);
+   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
+   uint64_t dst_access_mask[32], src0_access_mask[32], src1_access_mask[32];
+   struct string error_msg = { .str = NULL, .len = 0 };
+
+   if (num_sources == 3)
+      return (struct string){};
+
+   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16)
+      return (struct string){};
+
+   if (inst_is_send(devinfo, inst))
+      return (struct string){};
+
+   memset(dst_access_mask, 0, sizeof(dst_access_mask));
+   memset(src0_access_mask, 0, sizeof(src0_access_mask));
+   memset(src1_access_mask, 0, sizeof(src1_access_mask));
+
+   for (unsigned i = 0; i < num_sources; i++) {
+      unsigned vstride, width, hstride, element_size, subreg;
+
+      /* In Direct Addressing mode, a source cannot span more than 2 adjacent
+       * GRF registers.
+       */
+
+#define DO_SRC(n)                                                              \
+      if (brw_inst_src ## n ## _address_mode(devinfo, inst) !=                 \
+          BRW_ADDRESS_DIRECT)                                                  \
+         continue;                                                             \
+                                                                               \
+      if (brw_inst_src ## n ## _reg_file(devinfo, inst) ==                     \
+          BRW_IMMEDIATE_VALUE)                                                 \
+         continue;                                                             \
+                                                                               \
+      vstride = brw_inst_src ## n ## _vstride(devinfo, inst) ?                 \
+                (1 << (brw_inst_src ## n ## _vstride(devinfo, inst) - 1)) : 0; \
+      width = 1 << brw_inst_src ## n ## _width(devinfo, inst);                 \
+      hstride = brw_inst_src ## n ## _hstride(devinfo, inst) ?                 \
+                (1 << (brw_inst_src ## n ## _hstride(devinfo, inst) - 1)) : 0; \
+      element_size = brw_element_size(devinfo, inst, src ## n);                \
+      subreg = brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst);             \
+      align1_access_mask(src ## n ## _access_mask,                             \
+                         exec_size, element_size, subreg,                      \
+                         vstride, width, hstride)
+
+      if (i == 0) {
+         DO_SRC(0);
+      } else if (i == 1) {
+         DO_SRC(1);
+      }
+#undef DO_SRC
+
+      /* Byte offset of the last element of the region. */
+      unsigned num_vstride = exec_size / width;
+      unsigned num_hstride = width;
+      unsigned vstride_elements = (num_vstride - 1) * vstride;
+      unsigned hstride_elements = (num_hstride - 1) * hstride;
+      unsigned offset = (vstride_elements + hstride_elements) * element_size +
+                        subreg;
+      ERROR_IF(offset >= 64,
+               "A source cannot span more than 2 adjacent GRF registers");
+   }
+
+   if (desc->ndst == 0 || dst_is_null(devinfo, inst))
+      return error_msg;
+
+   /* A horizontal-stride field of 0 means a stride of 0, as for the sources
+    * above.  Decoding it with an unconditional "1 << (field - 1)" would shift
+    * by -1, which is undefined behaviour.
+    */
+   const unsigned dst_hstride_field = brw_inst_dst_hstride(devinfo, inst);
+   unsigned stride = dst_hstride_field ? 1 << (dst_hstride_field - 1) : 0;
+   unsigned element_size = brw_element_size(devinfo, inst, dst);
+   unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
+   unsigned offset = ((exec_size - 1) * stride * element_size) + subreg;
+   ERROR_IF(offset >= 64,
+            "A destination cannot span more than 2 adjacent GRF registers");
+
+   /* The remaining rules compare access masks, which are only meaningful if
+    * every region stayed within two registers.
+    */
+   if (error_msg.str)
+      return error_msg;
+
+   align1_access_mask(dst_access_mask, exec_size, element_size, subreg,
+                      exec_size == 1 ? 0 : exec_size * stride,
+                      exec_size == 1 ? 1 : exec_size,
+                      exec_size == 1 ? 0 : stride);
+
+   unsigned dst_regs = registers_read(dst_access_mask);
+   unsigned src0_regs = registers_read(src0_access_mask);
+   unsigned src1_regs = registers_read(src1_access_mask);
+
+   /* The SNB, IVB, HSW, BDW, and CHV PRMs say:
+    *
+    *    When an instruction has a source region spanning two registers and a
+    *    destination region contained in one register, the number of elements
+    *    must be the same between two sources and one of the following must be
+    *    true:
+    *
+    *       1. The destination region is entirely contained in the lower OWord
+    *          of a register.
+    *       2. The destination region is entirely contained in the upper OWord
+    *          of a register.
+    *       3. The destination elements are evenly split between the two OWords
+    *          of a register.
+    */
+   if (devinfo->gen <= 8) {
+      if (dst_regs == 1 && (src0_regs == 2 || src1_regs == 2)) {
+         unsigned upper_oword_writes = 0, lower_oword_writes = 0;
+
+         for (unsigned i = 0; i < exec_size; i++) {
+            if (dst_access_mask[i] > 0x0000FFFF) {
+               upper_oword_writes++;
+            } else {
+               assert(dst_access_mask[i] != 0);
+               lower_oword_writes++;
+            }
+         }
+
+         ERROR_IF(lower_oword_writes != 0 &&
+                  upper_oword_writes != 0 &&
+                  upper_oword_writes != lower_oword_writes,
+                  "Writes must be to only one OWord or "
+                  "evenly split between OWords");
+      }
+   }
+
+   /* The IVB and HSW PRMs say:
+    *
+    *    When an instruction has a source region that spans two registers and
+    *    the destination spans two registers, the destination elements must be
+    *    evenly split between the two registers [...]
+    *
+    * The SNB PRM contains similar wording (but written in a much more
+    * confusing manner).
+    *
+    * The BDW PRM says:
+    *
+    *    When destination spans two registers, the source may be one or two
+    *    registers. The destination elements must be evenly split between the
+    *    two registers.
+    *
+    * The SKL PRM says:
+    *
+    *    When destination of MATH instruction spans two registers, the
+    *    destination elements must be evenly split between the two registers.
+    *
+    * It is not known whether this restriction applies to KBL or other Gens
+    * after SKL.
+    */
+   if (devinfo->gen <= 8 ||
+       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MATH) {
+
+      /* Nothing explicitly states that on Gen < 8 elements must be evenly
+       * split between two destination registers in the two exceptional
+       * source-region-spans-one-register cases, but since Broadwell requires
+       * evenly split writes regardless of source region, we assume that it was
+       * an oversight and require it.
+       */
+      if (dst_regs == 2) {
+         unsigned upper_reg_writes = 0, lower_reg_writes = 0;
+
+         for (unsigned i = 0; i < exec_size; i++) {
+            if (dst_access_mask[i] > 0xFFFFFFFF) {
+               upper_reg_writes++;
+            } else {
+               assert(dst_access_mask[i] != 0);
+               lower_reg_writes++;
+            }
+         }
+
+         ERROR_IF(upper_reg_writes != lower_reg_writes,
+                  "Writes must be evenly split between the two "
+                  "destination registers");
+      }
+   }
+
+   /* The IVB and HSW PRMs say:
+    *
+    *    When an instruction has a source region that spans two registers and
+    *    the destination spans two registers, the destination elements must be
+    *    evenly split between the two registers and each destination register
+    *    must be entirely derived from one source register.
+    *
+    *    Note: In such cases, the regioning parameters must ensure that the
+    *    offset from the two source registers is the same.
+    *
+    * The SNB PRM contains similar wording (but written in a much more
+    * confusing manner).
+    *
+    * There are effectively three rules stated here:
+    *
+    *    For an instruction with a source and a destination spanning two
+    *    registers,
+    *
+    *       (1) destination elements must be evenly split between the two
+    *           registers
+    *       (2) all destination elements in a register must be derived
+    *           from one source register
+    *       (3) the offset (i.e. the starting location in each of the two
+    *           registers spanned by a region) must be the same in the two
+    *           registers spanned by a region
+    *
+    * It is impossible to violate rule (1) without violating (2) or (3), so we
+    * do not attempt to validate it.
+    */
+   if (devinfo->gen <= 7 && dst_regs == 2) {
+      for (unsigned i = 0; i < num_sources; i++) {
+#define DO_SRC(n)                                                             \
+         if (src ## n ## _regs <= 1)                                          \
+            continue;                                                         \
+                                                                              \
+         for (unsigned i = 0; i < exec_size; i++) {                           \
+            if ((dst_access_mask[i] > 0xFFFFFFFF) !=                          \
+                (src ## n ## _access_mask[i] > 0xFFFFFFFF)) {                 \
+               ERROR("Each destination register must be entirely derived "    \
+                     "from one source register");                             \
+               break;                                                         \
+            }                                                                 \
+         }                                                                    \
+                                                                              \
+         unsigned offset_0 =                                                  \
+            brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst);               \
+         unsigned offset_1 = offset_0;                                        \
+                                                                              \
+         for (unsigned i = 0; i < exec_size; i++) {                           \
+            if (src ## n ## _access_mask[i] > 0xFFFFFFFF) {                   \
+               offset_1 = __builtin_ctzll(src ## n ## _access_mask[i]) - 32;  \
+               break;                                                         \
+            }                                                                 \
+         }                                                                    \
+                                                                              \
+         ERROR_IF(offset_0 != offset_1,                                       \
+                  "The offset from the two source registers "                 \
+                  "must be the same")
+
+         if (i == 0) {
+            DO_SRC(0);
+         } else if (i == 1) {
+            DO_SRC(1);
+         }
+#undef DO_SRC
+      }
+   }
+
+   /* The IVB and HSW PRMs say:
+    *
+    *    When destination spans two registers, the source MUST span two
+    *    registers. The exception to the above rule:
+    *        1. When source is scalar, the source registers are not
+    *           incremented.
+    *        2. When source is packed integer Word and destination is packed
+    *           integer DWord, the source register is not incremented by the
+    *           source sub register is incremented.
+    *
+    * The SNB PRM does not contain this rule, but the internal documentation
+    * indicates that it applies to SNB as well. We assume that the rule applies
+    * to Gen <= 5 although their PRMs do not state it.
+    *
+    * While the documentation explicitly says in exception (2) that the
+    * destination must be an integer DWord, the hardware allows at least a
+    * float destination type as well. We emit such instructions from
+    *
+    *    fs_visitor::emit_interpolation_setup_gen6
+    *    fs_visitor::emit_fragcoord_interpolation
+    *
+    * and have for years with no ill effects.
+    *
+    * Additionally the simulator source code indicates that the real condition
+    * is that the size of the destination type is 4 bytes.
+    */
+   if (devinfo->gen <= 7 && dst_regs == 2) {
+      bool dst_is_packed_dword =
+         is_packed(exec_size * stride, exec_size, stride) &&
+         brw_element_size(devinfo, inst, dst) == 4;
+
+      for (unsigned i = 0; i < num_sources; i++) {
+#define DO_SRC(n)                                                                  \
+         unsigned vstride, width, hstride;                                         \
+         vstride = brw_inst_src ## n ## _vstride(devinfo, inst) ?                  \
+                   (1 << (brw_inst_src ## n ## _vstride(devinfo, inst) - 1)) : 0;  \
+         width = 1 << brw_inst_src ## n ## _width(devinfo, inst);                  \
+         hstride = brw_inst_src ## n ## _hstride(devinfo, inst) ?                  \
+                   (1 << (brw_inst_src ## n ## _hstride(devinfo, inst) - 1)) : 0;  \
+         bool src ## n ## _is_packed_word =                                        \
+            is_packed(vstride, width, hstride) &&                                  \
+            (brw_inst_src ## n ## _reg_type(devinfo, inst) == BRW_HW_REG_TYPE_W || \
+             brw_inst_src ## n ## _reg_type(devinfo, inst) == BRW_HW_REG_TYPE_UW); \
+                                                                                   \
+         ERROR_IF(src ## n ## _regs == 1 &&                                        \
+                  !src ## n ## _has_scalar_region(devinfo, inst) &&                \
+                  !(dst_is_packed_dword && src ## n ## _is_packed_word),           \
+                  "When the destination spans two registers, the source must "     \
+                  "span two registers\n" ERROR_INDENT "(exceptions for scalar "    \
+                  "source and packed-word to packed-dword expansion)")
+
+         if (i == 0) {
+            DO_SRC(0);
+         } else if (i == 1) {
+            DO_SRC(1);
+         }
+#undef DO_SRC
+      }
+   }
+
+   return error_msg;
+}
+
+/**
+ * Runs every validation check over the instructions stored in \p p between
+ * byte offset \p start_offset and p->next_insn_offset.
+ *
+ * The buffer is walked in sizeof(brw_inst) steps.  When \p annotation is
+ * non-NULL, each instruction's error text is attached to it at the
+ * instruction's offset.  Returns true only if no instruction produced an
+ * error.
+ */
+bool
+brw_validate_instructions(const struct brw_codegen *p, int start_offset,
+                          struct annotation_info *annotation)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const char *base = (const char *)p->store;
+   bool valid = true;
+   int src_offset = start_offset;
+
+   while (src_offset < p->next_insn_offset) {
+      struct string error_msg = { .str = NULL, .len = 0 };
+      const brw_inst *inst = (const brw_inst *)(base + src_offset);
+
+      if (!is_unsupported_inst(devinfo, inst)) {
+         CHECK(sources_not_null);
+         CHECK(send_restrictions);
+         CHECK(general_restrictions_based_on_operand_types);
+         CHECK(general_restrictions_on_region_parameters);
+         CHECK(region_alignment_rules);
+      } else {
+         ERROR("Instruction not supported on this Gen");
+      }
+
+      if (annotation && error_msg.str)
+         annotation_insert_error(annotation, src_offset, error_msg.str);
+
+      if (error_msg.len != 0)
+         valid = false;
+
+      /* The message buffer belongs to this iteration only. */
+      free(error_msg.str);
+
+      src_offset += sizeof(brw_inst);
+   }
+
+   return valid;
+}
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
new file mode 100644
index 00000000000..c410efc29d6
--- /dev/null
+++ b/src/intel/compiler/brw_fs.cpp
@@ -0,0 +1,6805 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs.cpp
+ *
+ * This file drives the GLSL IR -> LIR translation, contains the
+ * optimizations on the LIR, and drives the generation of native code
+ * from the LIR.
+ */
+
+#include "main/macros.h"
+#include "brw_eu.h"
+#include "brw_fs.h"
+#include "brw_nir.h"
+#include "brw_vec4_gs_visitor.h"
+#include "brw_cfg.h"
+#include "brw_dead_control_flow.h"
+#include "common/gen_debug.h"
+#include "compiler/glsl_types.h"
+#include "compiler/nir/nir_builder.h"
+#include "program/prog_parameter.h"
+
+using namespace brw;
+
+static unsigned get_lowered_simd_width(const struct gen_device_info *devinfo,
+ const fs_inst *inst);
+
+/**
+ * Common initializer behind every fs_inst constructor.
+ *
+ * The whole object is zeroed, then the opcode, execution size, destination
+ * and a private copy of the \p sources entries of \p src are installed.
+ * size_written is derived from the destination's register file.
+ */
+void
+fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+              const fs_reg *src, unsigned sources)
+{
+   memset(this, 0, sizeof(*this));
+
+   /* Never allocate fewer than three slots, however few sources there are. */
+   fs_reg *const slots = new fs_reg[MAX2(sources, 3)];
+   for (unsigned s = 0; s != sources; ++s)
+      slots[s] = src[s];
+   this->src = slots;
+
+   this->opcode = opcode;
+   this->dst = dst;
+   this->sources = sources;
+   this->exec_size = exec_size;
+   this->base_mrf = -1;
+
+   assert(dst.file != IMM && dst.file != UNIFORM);
+
+   assert(this->exec_size != 0);
+
+   this->conditional_mod = BRW_CONDITIONAL_NONE;
+
+   /* This will be the case for almost all instructions. */
+   switch (dst.file) {
+   case BAD_FILE:
+      this->size_written = 0;
+      break;
+   case IMM:
+   case UNIFORM:
+      unreachable("Invalid destination register file");
+   case VGRF:
+   case ARF:
+   case FIXED_GRF:
+   case MRF:
+   case ATTR:
+      this->size_written = dst.component_size(exec_size);
+      break;
+   }
+
+   this->writes_accumulator = false;
+}
+
+/* Default instruction: a NOP with an execution size of 8 and no sources. */
+fs_inst::fs_inst()
+{
+   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
+}
+
+/* No destination (reg_undef) and no sources. */
+fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
+{
+   init(opcode, exec_size, reg_undef, NULL, 0);
+}
+
+/* Destination only. */
+fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
+{
+   init(opcode, exec_size, dst, NULL, 0);
+}
+
+/* One source; init() copies it, so it can be passed by address. */
+fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+                 const fs_reg &src0)
+{
+   init(opcode, exec_size, dst, &src0, 1);
+}
+
+/* Two sources. */
+fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+                 const fs_reg &src0, const fs_reg &src1)
+{
+   const fs_reg operands[] = { src0, src1 };
+   init(opcode, exec_size, dst, operands, 2);
+}
+
+/* Three sources. */
+fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
+{
+   const fs_reg operands[] = { src0, src1, src2 };
+   init(opcode, exec_size, dst, operands, 3);
+}
+
+/* Arbitrary number of sources taken from a caller-provided array. */
+fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
+                 const fs_reg src[], unsigned sources)
+{
+   init(opcode, exec_width, dst, src, sources);
+}
+
+/* Copy constructor: every field is copied bytewise, then the source array is
+ * replaced with a freshly allocated copy so the two instructions do not share
+ * storage.
+ */
+fs_inst::fs_inst(const fs_inst &that)
+{
+   memcpy(this, &that, sizeof(that));
+
+   fs_reg *const duplicate = new fs_reg[MAX2(that.sources, 3)];
+   for (unsigned s = 0; s != that.sources; ++s)
+      duplicate[s] = that.src[s];
+
+   this->src = duplicate;
+}
+
+/* Releases the source array owned by this instruction. */
+fs_inst::~fs_inst()
+{
+   delete[] src;
+}
+
+/**
+ * Changes the number of sources to \p num_sources.
+ *
+ * Nothing happens when the count is unchanged.  Otherwise a new array is
+ * allocated, the sources common to the old and new counts are carried over,
+ * and the old array is freed.
+ */
+void
+fs_inst::resize_sources(uint8_t num_sources)
+{
+   if (this->sources == num_sources)
+      return;
+
+   fs_reg *const resized = new fs_reg[MAX2(num_sources, 3)];
+   const unsigned kept = MIN2(this->sources, num_sources);
+
+   for (unsigned s = 0; s < kept; ++s)
+      resized[s] = this->src[s];
+
+   delete[] this->src;
+   this->src = resized;
+   this->sources = num_sources;
+}
+
+/**
+ * Emits a pull-constant load at a variable offset and moves the requested
+ * component into \p dst.
+ *
+ * \p const_offset is split into a 16-byte-aligned part, added to
+ * \p varying_offset, and a remainder that selects a component of the loaded
+ * vec4.
+ */
+void
+fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
+                                       const fs_reg &dst,
+                                       const fs_reg &surf_index,
+                                       const fs_reg &varying_offset,
+                                       uint32_t const_offset)
+{
+   /* We have our constant surface use a pitch of 4 bytes, so our index can
+    * be any component of a vector, and then we load 4 contiguous
+    * components starting from that.
+    *
+    * We break down the const_offset to a portion added to the variable offset
+    * and a portion done using fs_reg::offset, which means that if you have
+    * GLSL using something like "uniform vec4 a[20]; gl_FragColor = a[i]",
+    * we'll temporarily generate 4 vec4 loads from offset i * 4, and CSE can
+    * later notice that those loads are all the same and eliminate the
+    * redundant ones.
+    */
+   const uint32_t aligned_part = const_offset & ~0xf;
+   const uint32_t remainder_bytes = const_offset & 0xf;
+
+   fs_reg load_offset = vgrf(glsl_type::uint_type);
+   bld.ADD(load_offset, varying_offset, brw_imm_ud(aligned_part));
+
+   /* The pull load message will load a vec4 (16 bytes). If we are loading
+    * a double this means we are only loading 2 elements worth of data.
+    * We also want to use a 32-bit data type for the dst of the load operation
+    * so other parts of the driver don't get confused about the size of the
+    * result.
+    */
+   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+   fs_inst *load = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
+                            payload, surf_index, load_offset);
+   load->size_written = 4 * payload.component_size(load->exec_size);
+
+   if (type_sz(dst.type) == 8) {
+      shuffle_32bit_load_result_to_64bit_data(
+         bld, retype(payload, dst.type), payload, 2);
+   }
+
+   /* View the payload as the destination type before picking a component. */
+   payload.type = dst.type;
+   const unsigned component = remainder_bytes / type_sz(payload.type);
+   bld.MOV(dst, offset(payload, bld, component));
+}
+
+/**
+ * A helper for MOV generation for fixing up broken hardware SEND dependency
+ * handling.
+ *
+ * Emits a float MOV from VGRF \p grf to the null register.
+ */
+void
+fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
+{
+   /* The caller always wants uncompressed to emit the minimal extra
+    * dependencies, and to avoid having to deal with aligning its regs to 2.
+    */
+   const fs_builder ubld =
+      bld.annotate("send dependency resolve").half(0);
+   const fs_reg pending(VGRF, grf, BRW_REGISTER_TYPE_F);
+
+   ubld.MOV(ubld.null_reg_f(), pending);
+}
+
+/**
+ * Field-by-field comparison of two instructions.
+ *
+ * Only the destination and the first three sources are compared, followed
+ * by the modifier and message-related fields listed below.
+ */
+bool
+fs_inst::equals(fs_inst *inst) const
+{
+   /* Opcode and operands. */
+   if (opcode != inst->opcode || !dst.equals(inst->dst))
+      return false;
+
+   for (unsigned s = 0; s < 3; s++) {
+      if (!src[s].equals(inst->src[s]))
+         return false;
+   }
+
+   /* Instruction modifiers. */
+   if (saturate != inst->saturate ||
+       predicate != inst->predicate ||
+       conditional_mod != inst->conditional_mod)
+      return false;
+
+   /* Message and execution state. */
+   return mlen == inst->mlen &&
+          base_mrf == inst->base_mrf &&
+          target == inst->target &&
+          eot == inst->eot &&
+          header_size == inst->header_size &&
+          shadow_compare == inst->shadow_compare &&
+          exec_size == inst->exec_size &&
+          offset == inst->offset;
+}
+
+/**
+ * Returns true if this opcode is a send whose payload comes from the GRF.
+ *
+ * Some opcodes always qualify; for the others the answer depends on
+ * whether a particular source lives in the VGRF file.
+ */
+bool
+fs_inst::is_send_from_grf() const
+{
+   switch (opcode) {
+   /* Depends on where the second source lives. */
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+      return src[1].file == VGRF;
+
+   /* Depends on where the first source lives. */
+   case FS_OPCODE_FB_WRITE:
+   case FS_OPCODE_FB_READ:
+      return src[0].file == VGRF;
+
+   /* Unconditionally sent from the GRF. */
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+   case SHADER_OPCODE_SHADER_TIME_ADD:
+   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
+   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
+   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_TYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_SURFACE_READ:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+   case SHADER_OPCODE_URB_READ_SIMD8:
+   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
+      return true;
+
+   /* Texturing opcodes qualify when their first source is a VGRF. */
+   default:
+      return is_tex() && src[0].file == VGRF;
+   }
+}
+
+/**
+ * Returns true if this instruction's sources and destination cannot
+ * safely share a register.
+ *
+ * Normally an instruction may overwrite a register that it also reads for
+ * the last time, because the sources are fetched before the destination
+ * write begins.  Two situations break that assumption:
+ *
+ *  - Virtual opcodes that the code generator expands into several
+ *    instructions: with src == dst, an early instruction may write the
+ *    destination before a later one reads the source.
+ *
+ *  - SIMD16 compressed instructions with certain regioning (details in
+ *    the body).
+ *
+ * The register allocator uses this information to set up conflicts between
+ * GRF sources and the destination.
+ */
+bool
+fs_inst::has_source_and_destination_hazard() const
+{
+   /* Multiple partial writes to the destination. */
+   if (opcode == FS_OPCODE_PACK_HALF_2x16_SPLIT)
+      return true;
+
+   /* A SIMD16 compressed instruction such as
+    *
+    * add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F
+    *
+    * is decoded by the hardware as two SIMD8 halves:
+    *
+    * add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F
+    * add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F
+    *
+    * which is harmless.  With a uniform (stride 0) source, however:
+    *
+    * add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F
+    * add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F
+    *
+    * the first half's destination write clobbers the second half's src0,
+    * producing garbage for those 8 pixels.  The pre-gen6 pixel_x/pixel_y
+    * registers hold 16-bit values and get stomped by the first decode in
+    * the same way, hence the checks on the narrow types below.
+    */
+   if (exec_size != 16)
+      return false;
+
+   for (int s = 0; s < sources; s++) {
+      const fs_reg &reg = src[s];
+      if (reg.file != VGRF)
+         continue;
+
+      const bool narrow_type = reg.type == BRW_REGISTER_TYPE_UW ||
+                               reg.type == BRW_REGISTER_TYPE_W ||
+                               reg.type == BRW_REGISTER_TYPE_UB ||
+                               reg.type == BRW_REGISTER_TYPE_B;
+      if (reg.stride == 0 || narrow_type)
+         return true;
+   }
+
+   return false;
+}
+
+/**
+ * Returns true if this is a LOAD_PAYLOAD whose sources are, in order, the
+ * consecutive pieces of one whole VGRF, starting at offset 0 with stride 1
+ * and matching this instruction's size_written.
+ */
+bool
+fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
+{
+   if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+      return false;
+
+   /* The first source must be the very start of a contiguous VGRF. */
+   fs_reg expected = this->src[0];
+   const bool starts_at_vgrf_base = expected.file == VGRF &&
+                                    expected.offset == 0 &&
+                                    expected.stride == 1;
+   if (!starts_at_vgrf_base)
+      return false;
+
+   /* The payload has to cover the whole allocation of that VGRF. */
+   if (grf_alloc.sizes[expected.nr] * REG_SIZE != this->size_written)
+      return false;
+
+   /* Walk the sources, advancing the expected register as we go: header
+    * sources advance by one register, the rest by exec_size components.
+    */
+   for (int s = 0; s < this->sources; s++) {
+      expected.type = this->src[s].type;
+      if (!this->src[s].equals(expected))
+         return false;
+
+      if (s < this->header_size)
+         expected.offset += REG_SIZE;
+      else
+         expected = horiz_offset(expected, this->exec_size);
+   }
+
+   return true;
+}
+
+/**
+ * Returns whether source modifiers may be used on this instruction.
+ *
+ * They are rejected for math instructions on gen 6, for sends from the GRF,
+ * and whenever the generic backend_instruction check rejects them.
+ */
+bool
+fs_inst::can_do_source_mods(const struct gen_device_info *devinfo)
+{
+   const bool gen6_math = devinfo->gen == 6 && is_math();
+
+   return !gen6_math &&
+          !is_send_from_grf() &&
+          backend_instruction::can_do_source_mods();
+}
+
+/**
+ * Returns true for a MOV, or a predicated SEL, whose destination type
+ * matches its source type(s) and which uses neither source modifiers on
+ * the compared sources nor saturate.
+ */
+bool
+fs_inst::can_change_types() const
+{
+   /* Requirements shared by MOV and SEL. */
+   if (dst.type != src[0].type || src[0].abs || src[0].negate || saturate)
+      return false;
+
+   if (opcode == BRW_OPCODE_MOV)
+      return true;
+
+   if (opcode != BRW_OPCODE_SEL)
+      return false;
+
+   /* A SEL additionally needs a predicate and a clean second source. */
+   return dst.type == src[1].type &&
+          predicate != BRW_PREDICATE_NONE &&
+          !src[1].abs && !src[1].negate;
+}
+
+/**
+ * An instruction has side effects if it carries EOT or if the generic
+ * backend_instruction check reports side effects.
+ */
+bool
+fs_inst::has_side_effects() const
+{
+   if (this->eot)
+      return true;
+
+   return backend_instruction::has_side_effects();
+}
+
+/** Zero every byte of the register, then default the stride to 1. */
+void
+fs_reg::init()
+{
+   memset(this, 0, sizeof(fs_reg));
+   this->stride = 1;
+}
+
+/** Default constructor: an initialized register in the BAD_FILE file. */
+fs_reg::fs_reg()
+{
+   init();
+   file = BAD_FILE;
+}
+
+/**
+ * Wrap a hardware brw_reg.
+ *
+ * The offset starts at 0.  The stride is 0 for immediates other than the
+ * V, UV and VF types, and 1 for everything else.
+ */
+fs_reg::fs_reg(struct ::brw_reg reg) :
+   backend_reg(reg)
+{
+   const bool vector_imm_type = this->type == BRW_REGISTER_TYPE_V ||
+                                this->type == BRW_REGISTER_TYPE_UV ||
+                                this->type == BRW_REGISTER_TYPE_VF;
+
+   this->offset = 0;
+   this->stride = (this->file == IMM && !vector_imm_type) ? 0 : 1;
+}
+
+/** Two registers are equal when the backend_reg parts and strides match. */
+bool
+fs_reg::equals(const fs_reg &r) const
+{
+   if (!this->backend_reg::equals(r))
+      return false;
+
+   return stride == r.stride;
+}
+
+/** A register is contiguous when its stride is exactly 1. */
+bool
+fs_reg::is_contiguous() const
+{
+   const bool unit_stride = (this->stride == 1);
+   return unit_stride;
+}
+
+/**
+ * Size of one component of this register for the given execution width.
+ *
+ * ARF and FIXED_GRF registers derive their element stride from hstride;
+ * all other files use the fs_reg stride field.  A stride of zero still
+ * counts as one element.
+ */
+unsigned
+fs_reg::component_size(unsigned width) const
+{
+   unsigned elem_stride;
+
+   if (file != ARF && file != FIXED_GRF)
+      elem_stride = this->stride;
+   else if (hstride == 0)
+      elem_stride = 0;
+   else
+      elem_stride = 1 << (hstride - 1);
+
+   return MAX2(width * elem_stride, 1) * type_sz(type);
+}
+
+/**
+ * Number of scalar slots occupied by a GLSL type.
+ *
+ * Arrays and structs are sized recursively from their elements/fields.
+ */
+extern "C" int
+type_size_scalar(const struct glsl_type *type)
+{
+   switch (type->base_type) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_BOOL:
+      /* One slot per component. */
+      return type->components();
+
+   case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_UINT64:
+   case GLSL_TYPE_INT64:
+      /* Two slots per component. */
+      return type->components() * 2;
+
+   case GLSL_TYPE_ARRAY:
+      return type_size_scalar(type->fields.array) * type->length;
+
+   case GLSL_TYPE_STRUCT: {
+      unsigned total = 0;
+      for (unsigned f = 0; f < type->length; f++)
+         total += type_size_scalar(type->fields.structure[f].type);
+      return total;
+   }
+
+   case GLSL_TYPE_SAMPLER:
+      /* Samplers take up no register space, since they're baked in at
+       * link time.
+       */
+   case GLSL_TYPE_ATOMIC_UINT:
+      return 0;
+
+   case GLSL_TYPE_SUBROUTINE:
+      return 1;
+
+   case GLSL_TYPE_IMAGE:
+      return BRW_IMAGE_PARAM_SIZE;
+
+   case GLSL_TYPE_VOID:
+   case GLSL_TYPE_ERROR:
+   case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_FUNCTION:
+      unreachable("not reached");
+   }
+
+   return 0;
+}
+
+/**
+ * Emit a MOV that reads the timestamp architecture register.
+ *
+ * The MOV is emitted here through the given builder (4 channels wide, with
+ * exec_all set).  The return value is the freshly allocated VGRF that the
+ * MOV writes; callers pick the field they need with component().
+ *
+ * Asserts gen >= 7.
+ */
+fs_reg
+fs_visitor::get_timestamp(const fs_builder &bld)
+{
+ assert(devinfo->gen >= 7);
+
+ fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_TIMESTAMP,
+ 0),
+ BRW_REGISTER_TYPE_UD));
+
+ fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+
+ /* We want to read the 3 fields we care about even if it's not enabled in
+ * the dispatch.
+ */
+ bld.group(4, 0).exec_all().MOV(dst, ts);
+
+ return dst;
+}
+
+/** Record the timestamp at the start of the shader in shader_start_time. */
+void
+fs_visitor::emit_shader_time_begin()
+{
+   const fs_reg timestamp =
+      get_timestamp(bld.annotate("shader time start"));
+
+   /* Only the low 32 bits of the timestamp are kept.  At the GPU clock
+    * rate of ~1.2ghz they roll over every ~3 seconds, which is plenty of
+    * time for our purposes.  The value is identical across the EUs, but
+    * because it tracks GPU core speed it increments at a varying rate as
+    * render P-states change.
+    */
+   shader_start_time = component(timestamp, 0);
+}
+
+/**
+ * Emit the code that accumulates the shader's elapsed time.
+ *
+ * The instructions are inserted just before the final instruction of the
+ * program, which must have EOT set.  If bit 0 of timestamp component 2
+ * (the reset field) is clear, the elapsed time (minus a 2-cycle overhead)
+ * is added to subindex 0 and subindex 1 is incremented; otherwise
+ * subindex 2 is incremented.
+ */
+void
+fs_visitor::emit_shader_time_end()
+{
+ /* Insert our code just before the final SEND with EOT. */
+ exec_node *end = this->instructions.get_tail();
+ assert(end && ((fs_inst *) end)->eot);
+ const fs_builder ibld = bld.annotate("shader time end")
+ .exec_all().at(NULL, end);
+ const fs_reg timestamp = get_timestamp(ibld);
+
+ /* We only use the low 32 bits of the timestamp - see
+ * emit_shader_time_begin().
+ *
+ * We could also check if render P-states have changed (or anything
+ * else that might disrupt timing) by setting smear to 2 and checking if
+ * that field is != 0.
+ */
+ const fs_reg shader_end_time = component(timestamp, 0);
+
+ /* Check that there weren't any timestamp reset events (assuming these
+ * were the only two timestamp reads that happened).
+ */
+ const fs_reg reset = component(timestamp, 2);
+ set_condmod(BRW_CONDITIONAL_Z,
+ ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
+ ibld.IF(BRW_PREDICATE_NORMAL);
+
+ fs_reg start = shader_start_time;
+ start.negate = true;
+ const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1),
+ BRW_REGISTER_TYPE_UD),
+ 0);
+ const fs_builder cbld = ibld.group(1, 0);
+ cbld.group(1, 0).ADD(diff, start, shader_end_time);
+
+ /* If there were no instructions between the two timestamp gets, the diff
+ * is 2 cycles. Remove that overhead, so I can forget about that when
+ * trying to determine the time taken for single instructions.
+ */
+ cbld.ADD(diff, diff, brw_imm_ud(-2u));
+ SHADER_TIME_ADD(cbld, 0, diff);
+ SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
+ ibld.emit(BRW_OPCODE_ELSE);
+ SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
+ ibld.emit(BRW_OPCODE_ENDIF);
+}
+
+/**
+ * Emit a SHADER_TIME_ADD of the given value into the slot selected by
+ * shader_time_index and shader_time_subindex (three subindices per shader).
+ */
+void
+fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
+                            int shader_time_subindex,
+                            fs_reg value)
+{
+   const int slot = shader_time_index * 3 + shader_time_subindex;
+   struct brw_reg offset = brw_imm_d(slot * BRW_SHADER_TIME_STRIDE);
+
+   /* The payload is sized from a uvec2 in SIMD8 and from a uint otherwise. */
+   const glsl_type *payload_type = dispatch_width == 8 ?
+                                   glsl_type::uvec2_type :
+                                   glsl_type::uint_type;
+   fs_reg payload = vgrf(payload_type);
+
+   bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
+}
+
+/**
+ * Mark the compile as failed and record a formatted failure message.
+ *
+ * Only the first failure is recorded; later calls return immediately.  The
+ * message is also printed to stderr when debug output is enabled.
+ */
+void
+fs_visitor::vfail(const char *format, va_list va)
+{
+   if (failed)
+      return;
+
+   failed = true;
+
+   char *detail = ralloc_vasprintf(mem_ctx, format, va);
+   char *msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n",
+                               stage_abbrev, detail);
+
+   this->fail_msg = msg;
+
+   if (debug_enabled)
+      fprintf(stderr, "%s", msg);
+}
+
+/** printf-style wrapper around vfail(). */
+void
+fs_visitor::fail(const char *format, ...)
+{
+   va_list args;
+
+   va_start(args, format);
+   vfail(format, args);
+   va_end(args);
+}
+
+/**
+ * Mark this program as impossible to compile with dispatch width greater
+ * than n.
+ *
+ * During the SIMD8 compile (which happens first), we can detect and flag
+ * things that are unsupported in SIMD16+ mode, so the compiler can skip the
+ * SIMD16+ compile altogether.
+ *
+ * During a compile of dispatch width greater than n (if one happens anyway),
+ * this just calls fail().
+ *
+ * \param n    largest dispatch width the program can still be compiled at
+ * \param msg  human-readable reason, used for the failure message or the
+ *             performance log entry
+ */
+void
+fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
+{
+   if (dispatch_width > n) {
+      fail("%s", msg);
+   } else {
+      /* Only ever tighten the limit.  Several independent restrictions may
+       * be reported during one compile; a later, looser restriction must
+       * not undo an earlier, stricter one.
+       */
+      if (n < max_dispatch_width)
+         max_dispatch_width = n;
+
+      compiler->shader_perf_log(log_data,
+                                "Shader dispatch width limited to SIMD%d: %s",
+                                n, msg);
+   }
+}
+
+/**
+ * Returns true if the instruction may leave part of its destination
+ * register untouched.
+ *
+ * Passes such as dead code elimination and live variable analysis need to
+ * know whether a write fully screens off whatever value the variable held
+ * before.
+ */
+bool
+fs_inst::is_partial_write() const
+{
+   /* Predicated writes (other than SEL) only update enabled channels. */
+   if (this->predicate && this->opcode != BRW_OPCODE_SEL)
+      return true;
+
+   /* The write covers fewer than 32 bytes. */
+   if ((this->exec_size * type_sz(this->dst.type)) < 32)
+      return true;
+
+   /* A strided destination leaves gaps. */
+   if (!this->dst.is_contiguous())
+      return true;
+
+   /* The destination does not start on a register boundary. */
+   return this->dst.offset % REG_SIZE != 0;
+}
+
+/**
+ * Returns the number of components this instruction reads from source i.
+ *
+ * A source in the BAD_FILE file reads nothing.  Most opcodes read a single
+ * component per source; the logical opcodes handled below read a count
+ * that depends on the source index and, for several of them, on immediate
+ * operands that carry the component counts.
+ */
+unsigned
+fs_inst::components_read(unsigned i) const
+{
+ /* Return zero if the source is not present. */
+ if (src[i].file == BAD_FILE)
+ return 0;
+
+ switch (opcode) {
+ case FS_OPCODE_LINTERP:
+ if (i == 0)
+ return 2;
+ else
+ return 1;
+
+ case FS_OPCODE_PIXEL_X:
+ case FS_OPCODE_PIXEL_Y:
+ assert(i == 0);
+ return 2;
+
+ case FS_OPCODE_FB_WRITE_LOGICAL:
+ assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
+ /* First/second FB write color. */
+ if (i < 2)
+ return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
+ else
+ return 1;
+
+ case SHADER_OPCODE_TEX_LOGICAL:
+ case SHADER_OPCODE_TXD_LOGICAL:
+ case SHADER_OPCODE_TXF_LOGICAL:
+ case SHADER_OPCODE_TXL_LOGICAL:
+ case SHADER_OPCODE_TXS_LOGICAL:
+ case FS_OPCODE_TXB_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+ case SHADER_OPCODE_TXF_UMS_LOGICAL:
+ case SHADER_OPCODE_TXF_MCS_LOGICAL:
+ case SHADER_OPCODE_LOD_LOGICAL:
+ case SHADER_OPCODE_TG4_LOGICAL:
+ case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+ case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
+ assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
+ src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
+ /* Texture coordinates. */
+ if (i == TEX_LOGICAL_SRC_COORDINATE)
+ return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
+ /* Texture derivatives. */
+ else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
+ opcode == SHADER_OPCODE_TXD_LOGICAL)
+ return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
+ /* Texture offset. */
+ else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
+ return 2;
+ /* MCS */
+ else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
+ return 2;
+ else
+ return 1;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+ assert(src[3].file == IMM);
+ /* Surface coordinates. */
+ if (i == 0)
+ return src[3].ud;
+ /* Surface operation source (ignored for reads). */
+ else if (i == 1)
+ return 0;
+ else
+ return 1;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+ assert(src[3].file == IMM &&
+ src[4].file == IMM);
+ /* Surface coordinates. */
+ if (i == 0)
+ return src[3].ud;
+ /* Surface operation source. */
+ else if (i == 1)
+ return src[4].ud;
+ else
+ return 1;
+
+ case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+ case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
+ assert(src[3].file == IMM &&
+ src[4].file == IMM);
+ const unsigned op = src[4].ud;
+ /* Surface coordinates. */
+ if (i == 0)
+ return src[3].ud;
+ /* Surface operation source. */
+ else if (i == 1 && op == BRW_AOP_CMPWR)
+ return 2;
+ else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
+ op == BRW_AOP_PREDEC))
+ return 0;
+ else
+ return 1;
+ }
+
+ default:
+ return 1;
+ }
+}
+
+/**
+ * Returns the size of the region this instruction reads from source arg,
+ * expressed in the same units as REG_SIZE and type_sz() (presumably bytes).
+ *
+ * The first switch handles opcodes whose payload source spans mlen
+ * registers, plus a few other opcode-specific sizes.  Everything that
+ * breaks out of it falls through to the generic computation in the second
+ * switch, which is based on components_read() and the source's file.
+ */
+unsigned
+fs_inst::size_read(int arg) const
+{
+ switch (opcode) {
+ case FS_OPCODE_FB_WRITE:
+ case FS_OPCODE_FB_READ:
+ case SHADER_OPCODE_URB_WRITE_SIMD8:
+ case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+ case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+ case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+ case SHADER_OPCODE_URB_READ_SIMD8:
+ case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
+ case SHADER_OPCODE_UNTYPED_ATOMIC:
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+ case SHADER_OPCODE_TYPED_ATOMIC:
+ case SHADER_OPCODE_TYPED_SURFACE_READ:
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+ case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+ if (arg == 0)
+ return mlen * REG_SIZE;
+ break;
+
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+ /* The payload is actually stored in src1 */
+ if (arg == 1)
+ return mlen * REG_SIZE;
+ break;
+
+ case FS_OPCODE_LINTERP:
+ if (arg == 1)
+ return 16;
+ break;
+
+ case SHADER_OPCODE_LOAD_PAYLOAD:
+ if (arg < this->header_size)
+ return REG_SIZE;
+ break;
+
+ case CS_OPCODE_CS_TERMINATE:
+ case SHADER_OPCODE_BARRIER:
+ return REG_SIZE;
+
+ case SHADER_OPCODE_MOV_INDIRECT:
+ if (arg == 0) {
+ assert(src[2].file == IMM);
+ return src[2].ud;
+ }
+ break;
+
+ default:
+ if (is_tex() && arg == 0 && src[0].file == VGRF)
+ return mlen * REG_SIZE;
+ break;
+ }
+
+ switch (src[arg].file) {
+ case UNIFORM:
+ case IMM:
+ return components_read(arg) * type_sz(src[arg].type);
+ case BAD_FILE:
+ case ARF:
+ case FIXED_GRF:
+ case VGRF:
+ case ATTR:
+ return components_read(arg) * src[arg].component_size(exec_size);
+ case MRF:
+ unreachable("MRF registers are not allowed as sources");
+ }
+ return 0;
+}
+
+namespace {
+   /* Compute the subset of flag registers that an instruction could
+    * potentially read or write, given its execution controls and flag
+    * subregister number.  Each bit of the result stands for a group of 8
+    * flag bits.
+    */
+   unsigned
+   flag_mask(const fs_inst *inst)
+   {
+      const unsigned first_bit = inst->flag_subreg * 16 + inst->group;
+      const unsigned past_last_bit = first_bit + inst->exec_size;
+
+      /* All byte-groups below the end of the range... */
+      const unsigned upto_end = (1 << DIV_ROUND_UP(past_last_bit, 8)) - 1;
+      /* ...minus all byte-groups entirely below its start. */
+      const unsigned below_start = (1 << (first_bit / 8)) - 1;
+
+      return upto_end & ~below_start;
+   }
+}
+
+/**
+ * Returns the mask of flag register groups read through predication.
+ *
+ * XXX - This doesn't consider explicit uses of the flag register as source
+ * region.
+ */
+unsigned
+fs_inst::flags_read(const gen_device_info *devinfo) const
+{
+   const bool vertical_predicate =
+      predicate == BRW_PREDICATE_ALIGN1_ANYV ||
+      predicate == BRW_PREDICATE_ALIGN1_ALLV;
+
+   if (vertical_predicate) {
+      /* The vertical predication modes combine corresponding bits from
+       * f0.0 and f1.0 on Gen7+, and f0.0 and f0.1 on older hardware.
+       */
+      const unsigned shift = devinfo->gen >= 7 ? 4 : 2;
+      return flag_mask(this) << shift | flag_mask(this);
+   }
+
+   return predicate ? flag_mask(this) : 0;
+}
+
+/**
+ * Returns the mask of flag register groups this instruction writes.
+ *
+ * XXX - This doesn't consider explicit uses of the flag register as
+ * destination region.
+ */
+unsigned
+fs_inst::flags_written() const
+{
+   /* A conditional modifier writes the flag, except on SEL, IF and WHILE. */
+   const bool cmod_writes_flag = conditional_mod &&
+                                 opcode != BRW_OPCODE_SEL &&
+                                 opcode != BRW_OPCODE_IF &&
+                                 opcode != BRW_OPCODE_WHILE;
+
+   if (cmod_writes_flag || opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS)
+      return flag_mask(this);
+
+   return 0;
+}
+
+/**
+ * Returns how many MRFs an FS opcode will write over.
+ *
+ * This is not the 0 or 1 implied writes of an actual gen instruction: the
+ * FS opcodes often generate additional MOVs.
+ */
+int
+fs_visitor::implied_mrf_writes(fs_inst *inst)
+{
+   /* No message, or no MRF base: nothing is written. */
+   if (inst->mlen == 0 || inst->base_mrf == -1)
+      return 0;
+
+   switch (inst->opcode) {
+   /* One-operand math. */
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      return 1 * dispatch_width / 8;
+
+   /* Two-operand math. */
+   case SHADER_OPCODE_POW:
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+      return 2 * dispatch_width / 8;
+
+   /* Texturing, uniform pull loads and gen4 scratch reads. */
+   case SHADER_OPCODE_TEX:
+   case FS_OPCODE_TXB:
+   case SHADER_OPCODE_TXD:
+   case SHADER_OPCODE_TXF:
+   case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_MCS:
+   case SHADER_OPCODE_TG4:
+   case SHADER_OPCODE_TG4_OFFSET:
+   case SHADER_OPCODE_TXL:
+   case SHADER_OPCODE_TXS:
+   case SHADER_OPCODE_LOD:
+   case SHADER_OPCODE_SAMPLEINFO:
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+   case SHADER_OPCODE_GEN4_SCRATCH_READ:
+      return 1;
+
+   case FS_OPCODE_FB_WRITE:
+      return 2;
+
+   /* These write the whole message. */
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
+   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+      return inst->mlen;
+
+   default:
+      unreachable("not reached");
+   }
+}
+
+/**
+ * Allocate a VGRF large enough to hold a value of the given GLSL type at
+ * the current dispatch width, typed to match the GLSL base type.
+ */
+fs_reg
+fs_visitor::vgrf(const glsl_type *const type)
+{
+   const int regs_per_slot = dispatch_width / 8;
+   const int total_regs = type_size_scalar(type) * regs_per_slot;
+
+   return fs_reg(VGRF, alloc.allocate(total_regs),
+                 brw_type_for_base_type(type));
+}
+
+/**
+ * Construct a float-typed register in the given file.  UNIFORM registers
+ * get stride 0, everything else stride 1.
+ */
+fs_reg::fs_reg(enum brw_reg_file file, int nr)
+{
+   init();
+
+   this->file = file;
+   this->nr = nr;
+   this->type = BRW_REGISTER_TYPE_F;
+
+   if (file == UNIFORM)
+      this->stride = 0;
+   else
+      this->stride = 1;
+}
+
+/**
+ * Construct a register of the given type in the given file.  UNIFORM
+ * registers get stride 0, everything else stride 1.
+ */
+fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
+{
+   init();
+
+   this->file = file;
+   this->nr = nr;
+   this->type = type;
+
+   if (file == UNIFORM)
+      this->stride = 0;
+   else
+      this->stride = 1;
+}
+
+/* A SIMD16 compile has to follow the uniform setup chosen by the SIMD8
+ * dispatch.  Copy the uniform count and the push/pull constant location
+ * tables from the given visitor.
+ */
+void
+fs_visitor::import_uniforms(fs_visitor *v)
+{
+   this->uniforms = v->uniforms;
+   this->pull_constant_loc = v->pull_constant_loc;
+   this->push_constant_loc = v->push_constant_loc;
+}
+
+/**
+ * Fill the four consecutive components starting at wpos with
+ * gl_FragCoord.xyzw.
+ */
+void
+fs_visitor::emit_fragcoord_interpolation(fs_reg wpos)
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+
+   /* Walks over the destination one component at a time. */
+   fs_reg chan = wpos;
+
+   /* gl_FragCoord.x */
+   bld.MOV(chan, this->pixel_x);
+   chan = offset(chan, bld, 1);
+
+   /* gl_FragCoord.y */
+   bld.MOV(chan, this->pixel_y);
+   chan = offset(chan, bld, 1);
+
+   /* gl_FragCoord.z: interpolated before gen6, copied from the payload's
+    * source depth register afterwards.
+    */
+   if (devinfo->gen < 6) {
+      bld.emit(FS_OPCODE_LINTERP, chan,
+               this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
+               interp_reg(VARYING_SLOT_POS, 2));
+   } else {
+      bld.MOV(chan, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
+   }
+   chan = offset(chan, bld, 1);
+
+   /* gl_FragCoord.w: Already set up in emit_interpolation */
+   bld.MOV(chan, this->wpos_w);
+}
+
+/**
+ * Map an interpolation mode and a barycentric-load intrinsic to the
+ * matching brw_barycentric_mode.
+ */
+enum brw_barycentric_mode
+brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
+{
+   /* Barycentric modes don't make sense for flat inputs. */
+   assert(mode != INTERP_MODE_FLAT);
+
+   /* Pick the perspective flavour first. */
+   unsigned persp_mode;
+   switch (op) {
+   case nir_intrinsic_load_barycentric_pixel:
+   case nir_intrinsic_load_barycentric_at_offset:
+      persp_mode = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
+      break;
+   case nir_intrinsic_load_barycentric_centroid:
+      persp_mode = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
+      break;
+   case nir_intrinsic_load_barycentric_sample:
+   case nir_intrinsic_load_barycentric_at_sample:
+      persp_mode = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
+      break;
+   default:
+      unreachable("invalid intrinsic");
+   }
+
+   /* The non-perspective request is expressed as the perspective value
+    * plus 3.
+    */
+   const unsigned adjust = (mode == INTERP_MODE_NOPERSPECTIVE) ? 3 : 0;
+
+   return (enum brw_barycentric_mode) (persp_mode + adjust);
+}
+
+/**
+ * Convert either of the two CENTROID barycentric modes into the
+ * corresponding PIXEL mode (the enum value immediately before it).
+ */
+static enum brw_barycentric_mode
+centroid_to_pixel(enum brw_barycentric_mode bary)
+{
+   assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
+          bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
+
+   const unsigned pixel_mode = (unsigned) bary - 1;
+   return (enum brw_barycentric_mode) pixel_mode;
+}
+
+/**
+ * Emit the computation of the front-facing boolean (~0/true or 0/false)
+ * and return the newly allocated register holding it.
+ */
+fs_reg *
+fs_visitor::emit_frontfacing_interpolation()
+{
+   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
+
+   /* In both cases the payload bit is 0 for front-facing polygons and is
+    * the MSB of the source we read.  A negation source modifier flips the
+    * bit, and an ASR by the MSB position smears it across the result.
+    */
+   fs_reg facing_src;
+   int msb;
+
+   if (devinfo->gen >= 6) {
+      /* Bit 15 of g0.0, i.e. the MSB of g0.0:W.  The W -> D conversion
+       * sign-extends the flipped bit into the high word of the
+       * destination, and the ASR 15 fills the low word.
+       */
+      facing_src = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
+      msb = 15;
+   } else {
+      /* Bit 31 of g1.6.  SHR only operates on UD (or D with an abs source
+       * modifier) sources without negation, so ASR is used here too.
+       */
+      facing_src = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
+      msb = 31;
+   }
+
+   facing_src.negate = true;
+   bld.ASR(*reg, facing_src, brw_imm_d(msb));
+
+   return reg;
+}
+
+/**
+ * Write one float coordinate of gl_SamplePosition to dst, derived from the
+ * integer sample position int_sample_pos.
+ */
+void
+fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
+   assert(dst.type == BRW_REGISTER_TYPE_F);
+
+   if (!wm_prog_data->persample_dispatch) {
+      /* From ARB_sample_shading specification:
+       * "When rendering to a non-multisample buffer, or if multisample
+       * rasterization is disabled, gl_SamplePosition will always be
+       * (0.5, 0.5)."
+       */
+      bld.MOV(dst, brw_imm_f(0.5f));
+      return;
+   }
+
+   /* Convert int_sample_pos to floating point... */
+   bld.MOV(dst, int_sample_pos);
+   /* ...and scale it to the range [0, 1]. */
+   bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
+}
+
+/**
+ * Emit the computation of gl_SamplePosition and return the newly
+ * allocated vec2 register holding it.
+ *
+ * The X and Y byte offsets are read from the payload register
+ * payload.sample_pos_reg into integer temporaries (in two halves unless
+ * the dispatch width is 8), and each is converted with
+ * compute_sample_position().  Asserts gen >= 6.
+ */
+fs_reg *
+fs_visitor::emit_samplepos_setup()
+{
+ assert(devinfo->gen >= 6);
+
+ const fs_builder abld = bld.annotate("compute sample position");
+ fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
+ fs_reg pos = *reg;
+ fs_reg int_sample_x = vgrf(glsl_type::int_type);
+ fs_reg int_sample_y = vgrf(glsl_type::int_type);
+
+ /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
+ * mode will be enabled.
+ *
+ * From the Ivy Bridge PRM, volume 2 part 1, page 344:
+ * R31.1:0 Position Offset X/Y for Slot[3:0]
+ * R31.3:2 Position Offset X/Y for Slot[7:4]
+ * .....
+ *
+ * The X, Y sample positions come in as bytes in thread payload. So, read
+ * the positions using vstride=16, width=8, hstride=2.
+ */
+ struct brw_reg sample_pos_reg =
+ stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
+ BRW_REGISTER_TYPE_B), 16, 8, 2);
+
+ if (dispatch_width == 8) {
+ abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
+ } else {
+ abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
+ abld.half(1).MOV(half(int_sample_x, 1),
+ fs_reg(suboffset(sample_pos_reg, 16)));
+ }
+ /* Compute gl_SamplePosition.x */
+ compute_sample_position(pos, int_sample_x);
+ pos = offset(pos, abld, 1);
+ if (dispatch_width == 8) {
+ abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
+ } else {
+ abld.half(0).MOV(half(int_sample_y, 0),
+ fs_reg(suboffset(sample_pos_reg, 1)));
+ abld.half(1).MOV(half(int_sample_y, 1),
+ fs_reg(suboffset(sample_pos_reg, 17)));
+ }
+ /* Compute gl_SamplePosition.y */
+ compute_sample_position(pos, int_sample_y);
+ return reg;
+}
+
+/**
+ * Emit the computation of gl_SampleID and return the newly allocated
+ * integer register holding it.
+ *
+ * Three paths exist: a constant zero when the key has no multisample FBO,
+ * a nibble extraction from g1.0 on gen >= 8, and an SSPI-based computation
+ * from R0.0 on older hardware.  Asserts gen >= 6.
+ */
+fs_reg *
+fs_visitor::emit_sampleid_setup()
+{
+ assert(stage == MESA_SHADER_FRAGMENT);
+ brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+ assert(devinfo->gen >= 6);
+
+ const fs_builder abld = bld.annotate("compute sample id");
+ fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
+
+ if (!key->multisample_fbo) {
+ /* As per GL_ARB_sample_shading specification:
+ * "When rendering to a non-multisample buffer, or if multisample
+ * rasterization is disabled, gl_SampleID will always be zero."
+ */
+ abld.MOV(*reg, brw_imm_d(0));
+ } else if (devinfo->gen >= 8) {
+ /* Sample ID comes in as 4-bit numbers in g1.0:
+ *
+ * 15:12 Slot 3 SampleID (only used in SIMD16)
+ * 11:8 Slot 2 SampleID (only used in SIMD16)
+ * 7:4 Slot 1 SampleID
+ * 3:0 Slot 0 SampleID
+ *
+ * Each slot corresponds to four channels, so we want to replicate each
+ * half-byte value to 4 channels in a row:
+ *
+ * dst+0: .7 .6 .5 .4 .3 .2 .1 .0
+ * 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0
+ *
+ * dst+1: .7 .6 .5 .4 .3 .2 .1 .0 (if SIMD16)
+ * 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8
+ *
+ * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
+ * channels to read the first byte (7:0), and the second group of 8
+ * channels to read the second byte (15:8). Then, we shift right by
+ * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
+ * values into place. Finally, we AND with 0xf to keep the low nibble.
+ *
+ * shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
+ * and(16) dst<1>D tmp<8,8,1>W 0xf:W
+ *
+ * TODO: These payload bits exist on Gen7 too, but they appear to always
+ * be zero, so this code fails to work. We should find out why.
+ */
+ fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
+
+ abld.SHR(tmp, fs_reg(stride(retype(brw_vec1_grf(1, 0),
+ BRW_REGISTER_TYPE_B), 1, 8, 0)),
+ brw_imm_v(0x44440000));
+ abld.AND(*reg, tmp, brw_imm_w(0xf));
+ } else {
+ const fs_reg t1 = component(fs_reg(VGRF, alloc.allocate(1),
+ BRW_REGISTER_TYPE_D), 0);
+ const fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
+
+ /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
+ * 8x multisampling, subspan 0 will represent sample N (where N
+ * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
+ * 7. We can find the value of N by looking at R0.0 bits 7:6
+ * ("Starting Sample Pair Index (SSPI)") and multiplying by two
+ * (since samples are always delivered in pairs). That is, we
+ * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
+ * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
+ * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
+ * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
+ * populating a temporary variable with the sequence (0, 1, 2, 3),
+ * and then reading from it using vstride=1, width=4, hstride=0.
+ * These computations hold good for 4x multisampling as well.
+ *
+ * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
+ * the first four slots are sample 0 of subspan 0; the next four
+ * are sample 1 of subspan 0; the third group is sample 0 of
+ * subspan 1, and finally sample 1 of subspan 1.
+ */
+
+ /* SKL+ has an extra bit for the Starting Sample Pair Index to
+ * accommodate 16x MSAA.
+ *
+ * NOTE(review): the mask used below (0xc0) only keeps bits 7:6, and
+ * gen >= 8 is already handled by the branch above, so this remark
+ * looks stale for this code path -- confirm.
+ */
+ abld.exec_all().group(1, 0)
+ .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
+ brw_imm_ud(0xc0));
+ abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
+
+ /* This works for both SIMD8 and SIMD16 */
+ abld.exec_all().group(4, 0).MOV(t2, brw_imm_v(0x3210));
+
+ /* This special instruction takes care of setting vstride=1,
+ * width=4, hstride=0 of t2 during an ADD instruction.
+ */
+ abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
+ }
+
+ return reg;
+}
+
+/**
+ * Emit the computation of gl_SampleMaskIn and return the newly allocated
+ * register describing it.
+ */
+fs_reg *
+fs_visitor::emit_samplemaskin_setup()
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
+   assert(devinfo->gen >= 6);
+
+   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
+
+   fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
+                               BRW_REGISTER_TYPE_D));
+
+   /* In per-pixel mode, the coverage mask is sufficient. */
+   if (!wm_prog_data->persample_dispatch) {
+      *reg = coverage_mask;
+      return reg;
+   }
+
+   /* With per-sample dispatch, gl_SampleMaskIn[] is built from two
+    * sources: the input coverage mask, and a mask representing which
+    * sample is being processed by the current shader invocation.
+    *
+    * From the OES_sample_variables specification:
+    * "When per-sample shading is active due to the use of a fragment input
+    * qualified by "sample" or due to the use of the gl_SampleID or
+    * gl_SamplePosition variables, only the bit for the current sample is
+    * set in gl_SampleMaskIn."
+    */
+   const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
+
+   /* Make sure the sample ID has been computed. */
+   if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
+      nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
+
+   /* result = (1 << sample_id) & coverage_mask */
+   fs_reg one = vgrf(glsl_type::int_type);
+   fs_reg enabled_mask = vgrf(glsl_type::int_type);
+   abld.MOV(one, brw_imm_d(1));
+   abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
+   abld.AND(*reg, enabled_mask, coverage_mask);
+
+   return reg;
+}
+
+/**
+ * Return a register equivalent to src but free of abs/negate modifiers.
+ *
+ * A source without modifiers is returned unchanged; otherwise it is copied
+ * with a MOV into a fresh register of the same type.
+ */
+fs_reg
+fs_visitor::resolve_source_modifiers(const fs_reg &src)
+{
+   if (src.abs || src.negate) {
+      fs_reg resolved = bld.vgrf(src.type);
+      bld.MOV(resolved, src);
+      return resolved;
+   }
+
+   return src;
+}
+
+/**
+ * Emit a predicated DISCARD_JUMP.  Requires the program to use kill.
+ */
+void
+fs_visitor::emit_discard_jump()
+{
+   assert(brw_wm_prog_data(this->prog_data)->uses_kill);
+
+   /* As a performance optimization, jump to the end of the shader after a
+    * discard once all relevant channels have been discarded.
+    */
+   fs_inst *jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
+
+   jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
+   jump->predicate_inverse = true;
+   jump->flag_subreg = 1;
+}
+
+/**
+ * Emit the end-of-thread sequence for a geometry shader.
+ *
+ * Control data bits are flushed first when the control data header is
+ * non-empty.  With a static vertex count, the instruction list is scanned
+ * backwards for a URB write that can simply be marked EOT (any
+ * instructions after it are removed); the scan gives up at control flow or
+ * at an instruction with side effects, in which case a one-register URB
+ * write is emitted.  With a dynamic vertex count, a two-register URB write
+ * carrying final_gs_vertex_count is emitted.  The emitted write is marked
+ * EOT.
+ */
+void
+fs_visitor::emit_gs_thread_end()
+{
+ assert(stage == MESA_SHADER_GEOMETRY);
+
+ struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
+
+ if (gs_compile->control_data_header_size_bits > 0) {
+ emit_gs_control_data_bits(this->final_gs_vertex_count);
+ }
+
+ const fs_builder abld = bld.annotate("thread end");
+ fs_inst *inst;
+
+ if (gs_prog_data->static_vertex_count != -1) {
+ foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
+ if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
+ prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
+ prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
+ prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
+ prev->eot = true;
+
+ /* Delete now dead instructions. */
+ foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
+ if (dead == prev)
+ break;
+ dead->remove();
+ }
+ return;
+ } else if (prev->is_control_flow() || prev->has_side_effects()) {
+ break;
+ }
+ }
+ fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
+ inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
+ inst->mlen = 1;
+ } else {
+ fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+ fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
+ sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+ sources[1] = this->final_gs_vertex_count;
+ abld.LOAD_PAYLOAD(payload, sources, 2, 2);
+ inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+ inst->mlen = 2;
+ }
+ inst->eot = true;
+ inst->offset = 0;
+}
+
+/**
+ * Rewrite every UNIFORM file source to the fixed hardware register that
+ * holds its push constant.
+ *
+ * Also derives prog_data->curb_read_length from the number of params and
+ * sets first_non_payload_grf to the first register past the payload and the
+ * push constants.
+ */
+void
+fs_visitor::assign_curb_setup()
+{
+   prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
+
+   /* Map the offsets in the UNIFORM file to fixed HW regs. */
+   foreach_block_and_inst(blk, fs_inst, insn, cfg) {
+      for (unsigned int s = 0; s < insn->sources; s++) {
+         fs_reg &src = insn->src[s];
+
+         if (src.file != UNIFORM)
+            continue;
+
+         const int uniform_nr = src.nr + src.offset / 4;
+
+         /* Section 5.11 of the OpenGL 4.1 spec says:
+          * "Out-of-bounds reads return undefined values, which include
+          *  values from other variables of the active program or zero."
+          * So an out-of-bounds index simply reads the first push constant.
+          */
+         const bool in_bounds =
+            uniform_nr >= 0 && uniform_nr < (int) uniforms;
+         const int constant_nr =
+            in_bounds ? push_constant_loc[uniform_nr] : 0;
+
+         /* Eight constant slots per register, placed after the payload. */
+         struct brw_reg hw_reg =
+            brw_vec1_grf(payload.num_regs + constant_nr / 8,
+                         constant_nr % 8);
+         hw_reg.abs = src.abs;
+         hw_reg.negate = src.negate;
+
+         assert(src.stride == 0);
+         src = byte_offset(retype(hw_reg, src.type), src.offset % 4);
+      }
+   }
+
+   /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
+   this->first_non_payload_grf =
+      payload.num_regs + prog_data->curb_read_length;
+}
+
+/**
+ * Decide which URB setup slot each fragment shader input varying lands in.
+ *
+ * Fills urb_setup[] in the WM prog data (-1 for every varying that gets no
+ * slot) and stores the resulting slot count in num_varying_inputs.
+ */
+void
+fs_visitor::calculate_urb_setup()
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
+   brw_wm_prog_key *wm_key = (brw_wm_prog_key*) this->key;
+
+   /* Varying inputs the shader reads, restricted to real FS varyings. */
+   const uint64_t fs_inputs =
+      nir->info->inputs_read & BRW_FS_VARYING_INPUT_MASK;
+
+   /* Start with every varying unmapped. */
+   memset(wm_prog_data->urb_setup, -1,
+          sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
+
+   int next_slot = 0;
+
+   /* Figure out where each of the incoming setup attributes lands. */
+   if (devinfo->gen < 6) {
+      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
+      for (unsigned int slot = 0; slot < VARYING_SLOT_MAX; slot++) {
+         /* Point size is packed into the header, not as a general
+          * attribute.
+          */
+         if (slot == VARYING_SLOT_PSIZ)
+            continue;
+
+         if (!(wm_key->input_slots_valid & BITFIELD64_BIT(slot)))
+            continue;
+
+         /* The back color slot is skipped when the front color is also
+          * written to.  In addition, some slots can be written in the
+          * vertex shader and not read in the fragment shader.  So the
+          * register number must always be incremented, mapped or not.
+          */
+         if (_mesa_varying_slot_in_fs((gl_varying_slot) slot))
+            wm_prog_data->urb_setup[slot] = next_slot;
+         next_slot++;
+      }
+
+      /*
+       * It's a FS only attribute, and we did interpolation for this
+       * attribute in SF thread.  So, count it here, too.
+       *
+       * See compile_sf_prog() for more info.
+       */
+      if (nir->info->inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
+         wm_prog_data->urb_setup[VARYING_SLOT_PNTC] = next_slot++;
+   } else if (_mesa_bitcount_64(fs_inputs) <= 16) {
+      /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
+       * first 16 varying inputs, so we can put them wherever we want.
+       * Just put them in order.
+       *
+       * This is useful because it means that (a) inputs not used by the
+       * fragment shader won't take up valuable register space, and (b) we
+       * won't have to recompile the fragment shader if it gets paired with
+       * a different vertex (or geometry) shader.
+       */
+      for (unsigned int slot = 0; slot < VARYING_SLOT_MAX; slot++) {
+         if (fs_inputs & BITFIELD64_BIT(slot))
+            wm_prog_data->urb_setup[slot] = next_slot++;
+      }
+   } else {
+      /* We have enough input varyings that the SF/SBE pipeline stage can't
+       * arbitrarily rearrange them to suit our whim; we have to put them
+       * in an order that matches the output of the previous pipeline stage
+       * (geometry or vertex shader).
+       */
+      const bool include_vue_header =
+         (nir->info->inputs_read &
+          (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT)) != 0;
+
+      struct brw_vue_map prev_stage_vue_map;
+      brw_compute_vue_map(devinfo, &prev_stage_vue_map,
+                          wm_key->input_slots_valid,
+                          nir->info->separate_shader);
+
+      const int first_slot =
+         include_vue_header ? 0 : 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
+
+      assert(prev_stage_vue_map.num_slots <= first_slot + 32);
+      for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
+           slot++) {
+         const int varying = prev_stage_vue_map.slot_to_varying[slot];
+
+         if (varying == BRW_VARYING_SLOT_PAD)
+            continue;
+
+         if (fs_inputs & BITFIELD64_BIT(varying))
+            wm_prog_data->urb_setup[varying] = slot - first_slot;
+      }
+      next_slot = prev_stage_vue_map.num_slots - first_slot;
+   }
+
+   wm_prog_data->num_varying_inputs = next_slot;
+}
+
+/**
+ * Relocate the setup-register operands of the interpolation instructions
+ * and reserve the registers the varying inputs occupy.
+ */
+void
+fs_visitor::assign_urb_setup()
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
+
+   const int urb_start =
+      payload.num_regs + wm_prog_data->base.curb_read_length;
+
+   /* Offset all the urb_setup[] index by the actual position of the
+    * setup regs, now that the location of the constants has been chosen.
+    * LINTERP keeps its setup register in src[1], CINTERP in src[0].
+    */
+   foreach_block_and_inst(blk, fs_inst, insn, cfg) {
+      switch (insn->opcode) {
+      case FS_OPCODE_LINTERP:
+         assert(insn->src[1].file == FIXED_GRF);
+         insn->src[1].nr += urb_start;
+         break;
+      case FS_OPCODE_CINTERP:
+         assert(insn->src[0].file == FIXED_GRF);
+         insn->src[0].nr += urb_start;
+         break;
+      default:
+         break;
+      }
+   }
+
+   /* Each attribute is 4 setup channels, each of which is half a reg. */
+   this->first_non_payload_grf += wm_prog_data->num_varying_inputs * 2;
+}
+
+/**
+ * Replace every ATTR file source of \p inst with the hardware register it
+ * lands in.
+ *
+ * The register number is the attribute's own number and register offset
+ * placed after the payload and the push constants.  The abs and negate
+ * modifiers of the source are carried over.
+ */
+void
+fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
+{
+   for (int i = 0; i < inst->sources; i++) {
+      fs_reg &attr = inst->src[i];
+
+      if (attr.file != ATTR)
+         continue;
+
+      const int grf = payload.num_regs +
+                      prog_data->curb_read_length +
+                      attr.nr +
+                      attr.offset / REG_SIZE;
+
+      /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
+       *
+       * VertStride must be used to cross GRF register boundaries. This
+       * rule implies that elements within a 'Width' cannot cross GRF
+       * boundaries.
+       *
+       * So, for registers that are large enough, we have to split the exec
+       * size in two and trust the compression state to sort it out.
+       */
+      const unsigned bytes_read = inst->exec_size *
+                                  attr.stride *
+                                  type_sz(attr.type);
+
+      assert(bytes_read <= 2 * REG_SIZE);
+      const unsigned region_exec_size =
+         (bytes_read > REG_SIZE) ? inst->exec_size / 2 : inst->exec_size;
+
+      /* A zero stride means a scalar region. */
+      const unsigned width = attr.stride != 0 ? region_exec_size : 1;
+
+      struct brw_reg hw_reg = retype(brw_vec8_grf(grf, 0), attr.type);
+      hw_reg = byte_offset(hw_reg, attr.offset % REG_SIZE);
+      hw_reg = stride(hw_reg, region_exec_size * attr.stride,
+                      width, attr.stride);
+      hw_reg.abs = attr.abs;
+      hw_reg.negate = attr.negate;
+
+      attr = hw_reg;
+   }
+}
+
+/**
+ * Reserve the registers holding the vertex shader's attributes and rewrite
+ * every ATTR file source to its hardware register.
+ */
+void
+fs_visitor::assign_vs_urb_setup()
+{
+   assert(stage == MESA_SHADER_VERTEX);
+
+   struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
+
+   /* Each attribute is 4 regs. */
+   this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
+
+   assert(vs_prog_data->base.urb_read_length <= 15);
+
+   /* Rewrite all ATTR file references to the hw grf that they land in. */
+   foreach_block_and_inst(blk, fs_inst, insn, cfg) {
+      convert_attr_sources_to_hw_regs(insn);
+   }
+}
+
+/**
+ * Rewrite every ATTR file source of a tessellation control shader to its
+ * hardware register.
+ */
+void
+fs_visitor::assign_tcs_single_patch_urb_setup()
+{
+   assert(stage == MESA_SHADER_TESS_CTRL);
+
+   /* Rewrite all ATTR file references to HW_REGs. */
+   foreach_block_and_inst(blk, fs_inst, insn, cfg) {
+      convert_attr_sources_to_hw_regs(insn);
+   }
+}
+
+/**
+ * Reserve the registers holding the tessellation evaluation shader's URB
+ * inputs and rewrite every ATTR file source to its hardware register.
+ */
+void
+fs_visitor::assign_tes_urb_setup()
+{
+   assert(stage == MESA_SHADER_TESS_EVAL);
+
+   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
+
+   /* Eight registers are reserved per unit of URB read length. */
+   first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
+
+   /* Rewrite all ATTR file references to HW_REGs. */
+   foreach_block_and_inst(blk, fs_inst, insn, cfg) {
+      convert_attr_sources_to_hw_regs(insn);
+   }
+}
+
+/**
+ * Reserve the registers holding the geometry shader's URB inputs and
+ * rewrite every ATTR file source to its hardware register.
+ */
+void
+fs_visitor::assign_gs_urb_setup()
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
+
+   /* Eight registers per unit of URB read length, for each input vertex. */
+   first_non_payload_grf +=
+      8 * vue_prog_data->urb_read_length * nir->info->gs.vertices_in;
+
+   /* Rewrite all ATTR file references to GRFs. */
+   foreach_block_and_inst(blk, fs_inst, insn, cfg) {
+      convert_attr_sources_to_hw_regs(insn);
+   }
+}
+
+
+/**
+ * Split large virtual GRFs into separate components if we can.
+ *
+ * This is mostly duplicated with what brw_fs_vector_splitting does,
+ * but that's really conservative because it's afraid of doing
+ * splitting that doesn't result in real progress after the rest of
+ * the optimization phases, which would cause infinite looping in
+ * optimization. We can do it once here, safely. This also has the
+ * opportunity to split interpolated values, or maybe even uniforms,
+ * which we don't have at the IR level.
+ *
+ * We want to split, because virtual GRFs are what we register
+ * allocate and spill (due to contiguousness requirements for some
+ * instructions), and they're what we naturally generate in the
+ * codegen process, but most virtual GRFs don't actually need to be
+ * contiguous sets of GRFs. If we split, we'll end up with reduced
+ * live intervals and better dead code elimination and coalescing.
+ *
+ * The pass works on a flat numbering of register slots, where
+ * vgrf_to_reg[] gives the first slot of each virtual GRF. Every slot
+ * after the first of each referenced virtual GRF starts out as a split
+ * point; split points lying inside a multi-register read or write are
+ * then cleared. Each virtual GRF is finally cut at the split points
+ * that remain, with the last piece keeping the original register
+ * number, and every instruction is rewritten to the new register
+ * numbers and offsets. Live intervals are invalidated at the end.
+ */
+void
+fs_visitor::split_virtual_grfs()
+{
+ /* Compact the register file so we eliminate dead vgrfs. This
+ * only defines split points for live registers, so if we have
+ * too large dead registers they will hit assertions later.
+ */
+ compact_virtual_grfs();
+
+ int num_vars = this->alloc.count;
+
+ /* Count the total number of registers, recording in vgrf_to_reg[]
+ * the index of the first slot of each virtual GRF.
+ */
+ int reg_count = 0;
+ int vgrf_to_reg[num_vars];
+ for (int i = 0; i < num_vars; i++) {
+ vgrf_to_reg[i] = reg_count;
+ reg_count += alloc.sizes[i];
+ }
+
+ /* An array of "split points". For each register slot, this indicates
+ * if this slot can be separated from the previous slot. Every time an
+ * instruction uses multiple elements of a register (as a source or
+ * destination), we mark the used slots as inseparable. Then we go
+ * through and split the registers into the smallest pieces we can.
+ */
+ bool split_points[reg_count];
+ memset(split_points, 0, sizeof(split_points));
+
+ /* Mark all used registers as fully splittable. Slot 0 of each register
+ * is never marked (j starts at 1). The loop that follows this one then
+ * clears the split points lying inside any multi-register write
+ * (regs_written) or read (regs_read).
+ */
+ foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ if (inst->dst.file == VGRF) {
+ int reg = vgrf_to_reg[inst->dst.nr];
+ for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
+ split_points[reg + j] = true;
+ }
+
+ for (int i = 0; i < inst->sources; i++) {
+ if (inst->src[i].file == VGRF) {
+ int reg = vgrf_to_reg[inst->src[i].nr];
+ for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
+ split_points[reg + j] = true;
+ }
+ }
+ }
+
+ foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ if (inst->dst.file == VGRF) {
+ int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
+ for (unsigned j = 1; j < regs_written(inst); j++)
+ split_points[reg + j] = false;
+ }
+ for (int i = 0; i < inst->sources; i++) {
+ if (inst->src[i].file == VGRF) {
+ int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
+ for (unsigned j = 1; j < regs_read(inst, i); j++)
+ split_points[reg + j] = false;
+ }
+ }
+ }
+
+ int new_virtual_grf[reg_count];
+ int new_reg_offset[reg_count];
+
+ int reg = 0;
+ for (int i = 0; i < num_vars; i++) {
+ /* The first one should always be 0 as a quick sanity check. */
+ assert(split_points[reg] == false);
+
+ /* j = 0 case */
+ new_reg_offset[reg] = 0;
+ reg++;
+ int offset = 1;
+
+ /* j > 0 case */
+ for (unsigned j = 1; j < alloc.sizes[i]; j++) {
+ /* If this is a split point, reset the offset to 0 and allocate a
+ * new virtual GRF for the previous offset many registers
+ */
+ if (split_points[reg]) {
+ assert(offset <= MAX_VGRF_SIZE);
+ int grf = alloc.allocate(offset);
+ for (int k = reg - offset; k < reg; k++)
+ new_virtual_grf[k] = grf;
+ offset = 0;
+ }
+ new_reg_offset[reg] = offset;
+ offset++;
+ reg++;
+ }
+
+ /* The last one gets the original register number, and the original
+ * register's size is shrunk to the slots left in that last piece.
+ */
+ assert(offset <= MAX_VGRF_SIZE);
+ alloc.sizes[i] = offset;
+ for (int k = reg - offset; k < reg; k++)
+ new_virtual_grf[k] = i;
+ }
+ assert(reg == reg_count);
+
+ foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ if (inst->dst.file == VGRF) {
+ reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
+ inst->dst.nr = new_virtual_grf[reg];
+ inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
+ inst->dst.offset % REG_SIZE;
+ assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
+ }
+ for (int i = 0; i < inst->sources; i++) {
+ if (inst->src[i].file == VGRF) {
+ reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
+ inst->src[i].nr = new_virtual_grf[reg];
+ inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
+ inst->src[i].offset % REG_SIZE;
+ assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
+ }
+ }
+ }
+ invalidate_live_intervals();
+}
+
+/**
+ * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
+ *
+ * During code generation, we create tons of temporary variables, many of
+ * which get immediately killed and are never used again.  Yet, in later
+ * optimization and analysis passes, such as compute_live_intervals, we need
+ * to loop over all the virtual GRFs.  Compacting them can save a lot of
+ * overhead.
+ *
+ * \return true if at least one unused virtual GRF was found and dropped.
+ */
+bool
+fs_visitor::compact_virtual_grfs()
+{
+   bool progress = false;
+
+   /* remap_table[old] is -1 for an unused register, otherwise (once the
+    * compaction loop has run) the new number of register "old".
+    */
+   int remap_table[this->alloc.count];
+   memset(remap_table, -1, sizeof(remap_table));
+
+   /* Mark which virtual GRFs are used. */
+   foreach_block_and_inst(block, const fs_inst, inst, cfg) {
+      if (inst->dst.file == VGRF)
+         remap_table[inst->dst.nr] = 0;
+
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == VGRF)
+            remap_table[inst->src[i].nr] = 0;
+      }
+   }
+
+   /* Compact the GRF arrays. */
+   int new_index = 0;
+   for (unsigned i = 0; i < this->alloc.count; i++) {
+      if (remap_table[i] == -1) {
+         /* We just found an unused register.  This means that we are
+          * actually going to compact something.
+          */
+         progress = true;
+      } else {
+         remap_table[i] = new_index;
+         alloc.sizes[new_index] = alloc.sizes[i];
+         ++new_index;
+      }
+   }
+
+   /* Invalidate once instead of once per surviving register: the call does
+    * not depend on the loop variable.  The condition matches the old
+    * behaviour, which invalidated whenever at least one register was live.
+    */
+   if (new_index > 0)
+      invalidate_live_intervals();
+
+   this->alloc.count = new_index;
+
+   /* Patch all the instructions to use the newly renumbered registers */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      if (inst->dst.file == VGRF)
+         inst->dst.nr = remap_table[inst->dst.nr];
+
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == VGRF)
+            inst->src[i].nr = remap_table[inst->src[i].nr];
+      }
+   }
+
+   /* Patch all the references to delta_xy, since they're used in register
+    * allocation.  If they're unused, switch them to BAD_FILE so we don't
+    * think some random VGRF is delta_xy.
+    */
+   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
+      if (delta_xy[i].file == VGRF) {
+         if (remap_table[delta_xy[i].nr] != -1) {
+            delta_xy[i].nr = remap_table[delta_xy[i].nr];
+         } else {
+            delta_xy[i].file = BAD_FILE;
+         }
+      }
+   }
+
+   return progress;
+}
+
+/**
+ * Account for one live uniform slot while grouping uniforms into chunks and
+ * assigning each finished chunk to push or pull constants.
+ *
+ * \p chunk_start and \p max_chunk_bitsize carry the state of the chunk
+ * currently being built between calls (-1 / 0 when no chunk is open).  A
+ * chunk ends at the first slot that is not \p contiguous with its
+ * successor.  A finished chunk whose largest access size differs from
+ * \p target_bitsize is left without a location.  Otherwise its slots are
+ * numbered consecutively in \p push_constant_loc or \p pull_constant_loc,
+ * bumping the matching counter.
+ *
+ * Despite the "bitsize" names, the callers in this file pass type_sz()
+ * values and the assert below accepts 4 and 8.
+ */
+static void
+set_push_pull_constant_loc(unsigned uniform, int *chunk_start,
+                           unsigned *max_chunk_bitsize,
+                           bool contiguous, unsigned bitsize,
+                           const unsigned target_bitsize,
+                           int *push_constant_loc, int *pull_constant_loc,
+                           unsigned *num_push_constants,
+                           unsigned *num_pull_constants,
+                           const unsigned max_push_components,
+                           const unsigned max_chunk_size,
+                           struct brw_stage_prog_data *stage_prog_data)
+{
+   /* Open a chunk at this uniform if none is in progress. */
+   if (*chunk_start < 0)
+      *chunk_start = uniform;
+
+   /* Keep track of the maximum bit size access in contiguous uniforms */
+   *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize);
+
+   /* While this element has to stay contiguous with the next one the chunk
+    * keeps growing; nothing can be decided yet.
+    */
+   if (contiguous)
+      return;
+
+   /* Everything between chunk_start and uniform forms a single chunk. */
+   if (*max_chunk_bitsize == target_bitsize) {
+      const unsigned first = *chunk_start;
+      const unsigned chunk_size = uniform - first + 1;
+
+      /* Decide whether we should push or pull this parameter.  In the
+       * Vulkan driver, push constants are explicitly exposed via the API
+       * so we push everything.  In GL, we only push small arrays.
+       */
+      const bool push =
+         stage_prog_data->pull_param == NULL ||
+         (*num_push_constants + chunk_size <= max_push_components &&
+          chunk_size <= max_chunk_size);
+
+      if (push) {
+         assert(*num_push_constants + chunk_size <= max_push_components);
+         for (unsigned slot = first; slot <= uniform; slot++)
+            push_constant_loc[slot] = (*num_push_constants)++;
+      } else {
+         for (unsigned slot = first; slot <= uniform; slot++)
+            pull_constant_loc[slot] = (*num_pull_constants)++;
+      }
+   } else {
+      /* The size doesn't match the target one, so skip this chunk.
+       *
+       * FIXME: right now we only support 32 and 64-bit accesses
+       */
+      assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8);
+   }
+
+   /* Either way the chunk is finished; reset the running state. */
+   *max_chunk_bitsize = 0;
+   *chunk_start = -1;
+}
+
+/**
+ * Assign UNIFORM file registers to either push constants or pull constants.
+ *
+ * We allow a fragment shader to have more than the specified minimum
+ * maximum number of fragment shader uniform components (64).  If
+ * there are too many of these, they'd fill up all of register space.
+ * So, this will push some of them out to the pull constant buffer and
+ * update the program to load them.
+ *
+ * On return push_constant_loc[] / pull_constant_loc[] hold the location of
+ * every uniform slot (-1 meaning none), and param[], pull_param[],
+ * nr_params and nr_pull_params in the stage prog data describe the
+ * reordered constants.
+ */
+void
+fs_visitor::assign_constant_locations()
+{
+   /* Only the first compile gets to decide on locations. */
+   if (dispatch_width != min_dispatch_width)
+      return;
+
+   bool is_live[uniforms];
+   memset(is_live, 0, sizeof(is_live));
+
+   /* Largest access size seen for each slot, as returned by type_sz(). */
+   unsigned bitsize_access[uniforms];
+   memset(bitsize_access, 0, sizeof(bitsize_access));
+
+   /* For each uniform slot, a value of true indicates that the given slot and
+    * the next slot must remain contiguous.  This is used to keep us from
+    * splitting arrays apart.
+    */
+   bool contiguous[uniforms];
+   memset(contiguous, 0, sizeof(contiguous));
+
+   int thread_local_id_index =
+      (stage == MESA_SHADER_COMPUTE) ?
+      brw_cs_prog_data(stage_prog_data)->thread_local_id_index : -1;
+
+   /* First, we walk through the instructions and do two things:
+    *
+    *  1) Figure out which uniforms are live.
+    *
+    *  2) Mark any indirectly used ranges of registers as contiguous.
+    *
+    * Note that we don't move constant-indexed accesses to arrays.  No
+    * testing has been done of the performance impact of this choice.
+    */
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      for (int i = 0 ; i < inst->sources; i++) {
+         if (inst->src[i].file != UNIFORM)
+            continue;
+
+         int constant_nr = inst->src[i].nr + inst->src[i].offset / 4;
+
+         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
+            /* src[2] holds the byte size of the indirectly accessed range. */
+            assert(inst->src[2].ud % 4 == 0);
+            unsigned last = constant_nr + (inst->src[2].ud / 4) - 1;
+            assert(last < uniforms);
+
+            for (unsigned j = constant_nr; j < last; j++) {
+               is_live[j] = true;
+               contiguous[j] = true;
+               bitsize_access[j] = MAX2(bitsize_access[j],
+                                        type_sz(inst->src[i].type));
+            }
+            is_live[last] = true;
+            bitsize_access[last] = MAX2(bitsize_access[last],
+                                        type_sz(inst->src[i].type));
+         } else {
+            if (constant_nr >= 0 && constant_nr < (int) uniforms) {
+               const int num_slots = inst->components_read(i) *
+                                     type_sz(inst->src[i].type) / 4;
+
+               /* The whole range is written below, so all of it has to fit,
+                * not just its first slot; otherwise we would scribble past
+                * the end of the stack arrays.
+                */
+               assert(constant_nr + num_slots <= (int) uniforms);
+
+               for (int j = 0; j < num_slots; j++) {
+                  is_live[constant_nr + j] = true;
+                  bitsize_access[constant_nr + j] =
+                     MAX2(bitsize_access[constant_nr + j],
+                          type_sz(inst->src[i].type));
+               }
+            }
+         }
+      }
+   }
+
+   if (thread_local_id_index >= 0 && !is_live[thread_local_id_index])
+      thread_local_id_index = -1;
+
+   /* Only allow 16 registers (128 uniform components) as push constants.
+    *
+    * Just demote the end of the list.  We could probably do better
+    * here, demoting things that are rarely used in the program first.
+    *
+    * If changing this value, note the limitation about total_regs in
+    * brw_curbe.c.
+    */
+   unsigned int max_push_components = 16 * 8;
+   if (thread_local_id_index >= 0)
+      max_push_components--; /* Save a slot for the thread ID */
+
+   /* We push small arrays, but no bigger than 16 floats.  This is big enough
+    * for a vec4 but hopefully not large enough to push out other stuff.  We
+    * should probably use a better heuristic at some point.
+    */
+   const unsigned int max_chunk_size = 16;
+
+   unsigned int num_push_constants = 0;
+   unsigned int num_pull_constants = 0;
+
+   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+
+   /* Default to -1 meaning no location */
+   memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
+   memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
+
+   int chunk_start = -1;
+   unsigned max_chunk_bitsize = 0;
+
+   /* First push 64-bit uniforms to ensure they are properly aligned */
+   const unsigned uniform_64_bit_size = type_sz(BRW_REGISTER_TYPE_DF);
+   for (unsigned u = 0; u < uniforms; u++) {
+      if (!is_live[u])
+         continue;
+
+      set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
+                                 contiguous[u], bitsize_access[u],
+                                 uniform_64_bit_size,
+                                 push_constant_loc, pull_constant_loc,
+                                 &num_push_constants, &num_pull_constants,
+                                 max_push_components, max_chunk_size,
+                                 stage_prog_data);
+
+   }
+
+   /* Then push the rest of uniforms */
+   const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F);
+   for (unsigned u = 0; u < uniforms; u++) {
+      if (!is_live[u])
+         continue;
+
+      /* Skip thread_local_id_index to put it in the last push register. */
+      if (thread_local_id_index == (int)u)
+         continue;
+
+      set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
+                                 contiguous[u], bitsize_access[u],
+                                 uniform_32_bit_size,
+                                 push_constant_loc, pull_constant_loc,
+                                 &num_push_constants, &num_pull_constants,
+                                 max_push_components, max_chunk_size,
+                                 stage_prog_data);
+   }
+
+   /* Add the CS local thread ID uniform at the end of the push constants */
+   if (thread_local_id_index >= 0)
+      push_constant_loc[thread_local_id_index] = num_push_constants++;
+
+   /* As the uniforms are going to be reordered, take the data from a temporary
+    * copy of the original param[].
+    */
+   gl_constant_value **param = ralloc_array(NULL, gl_constant_value*,
+                                            stage_prog_data->nr_params);
+   memcpy(param, stage_prog_data->param,
+          sizeof(gl_constant_value*) * stage_prog_data->nr_params);
+   stage_prog_data->nr_params = num_push_constants;
+   stage_prog_data->nr_pull_params = num_pull_constants;
+
+   /* Up until now, the param[] array has been indexed by reg + offset
+    * of UNIFORM registers.  Move pull constants into pull_param[] and
+    * condense param[] to only contain the uniforms we chose to push.
+    *
+    * NOTE: 64-bit uniforms are given their push locations before 32-bit
+    * ones, so push_constant_loc[i] can be larger than i.  That is why the
+    * values are read from the copy made above rather than from the array
+    * being rewritten.
+    */
+   int new_thread_local_id_index = -1;
+   for (unsigned int i = 0; i < uniforms; i++) {
+      const gl_constant_value *value = param[i];
+
+      if (pull_constant_loc[i] != -1) {
+         stage_prog_data->pull_param[pull_constant_loc[i]] = value;
+      } else if (push_constant_loc[i] != -1) {
+         stage_prog_data->param[push_constant_loc[i]] = value;
+         if (thread_local_id_index == (int)i)
+            new_thread_local_id_index = push_constant_loc[i];
+      }
+   }
+   ralloc_free(param);
+
+   if (stage == MESA_SHADER_COMPUTE)
+      brw_cs_prog_data(stage_prog_data)->thread_local_id_index =
+         new_thread_local_id_index;
+}
+
+/**
+ * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
+ * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
+ *
+ * Only uniforms that were given a pull constant location are touched;
+ * out-of-bounds accesses and uniforms without a pull location are left
+ * alone.  Live intervals are invalidated at the end.
+ */
+void
+fs_visitor::lower_constant_loads()
+{
+   /* Binding table index of the pull constant surface, shared by both kinds
+    * of load emitted below.
+    */
+   const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      /* Set up the annotation tracking for new generated instructions. */
+      const fs_builder ibld(this, block, inst);
+
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file != UNIFORM)
+            continue;
+
+         /* We'll handle this case later */
+         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
+            continue;
+
+         unsigned location = inst->src[i].nr + inst->src[i].offset / 4;
+         if (location >= uniforms)
+            continue; /* Out of bounds access */
+
+         int pull_index = pull_constant_loc[location];
+
+         if (pull_index == -1)
+            continue;
+
+         assert(inst->src[i].stride == 0);
+
+         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
+         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
+         const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+         const unsigned base = pull_index * 4;
+
+         /* Load the block_sz-aligned block that contains the constant. */
+         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                   dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
+
+         /* Rewrite the instruction to use the temporary VGRF, at the
+          * constant's byte offset within the loaded block.
+          */
+         inst->src[i].file = VGRF;
+         inst->src[i].nr = dst.nr;
+         inst->src[i].offset = (base & (block_sz - 1)) +
+                               inst->src[i].offset % 4;
+
+         brw_mark_surface_used(prog_data, index);
+      }
+
+      /* Indirect access to a pulled uniform: the whole MOV_INDIRECT is
+       * replaced by a varying pull constant load.
+       */
+      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
+          inst->src[0].file == UNIFORM) {
+
+         unsigned location = inst->src[0].nr + inst->src[0].offset / 4;
+         if (location >= uniforms)
+            continue; /* Out of bounds access */
+
+         int pull_index = pull_constant_loc[location];
+
+         if (pull_index == -1)
+            continue;
+
+         VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
+                                    brw_imm_ud(index),
+                                    inst->src[1],
+                                    pull_index * 4);
+         inst->remove(block);
+
+         brw_mark_surface_used(prog_data, index);
+      }
+   }
+   invalidate_live_intervals();
+}
+
+/**
+ * Perform local algebraic simplifications on individual instructions.
+ *
+ * Each rule rewrites an instruction in place when its operands make the
+ * result trivially known: multiplying by 0, 1 or -1, adding 0, identical
+ * operands, foldable immediates, and so on.
+ *
+ * \return true if any instruction was changed.
+ */
+bool
+fs_visitor::opt_algebraic()
+{
+   bool progress = false;
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+         if (inst->src[0].file != IMM)
+            break;
+
+         /* Saturating an immediate can be done at compile time. */
+         if (inst->saturate) {
+            if (inst->dst.type != inst->src[0].type)
+               assert(!"unimplemented: saturate mixed types");
+
+            if (brw_saturate_immediate(inst->dst.type,
+                                       &inst->src[0].as_brw_reg())) {
+               inst->saturate = false;
+               progress = true;
+            }
+         }
+         break;
+
+      case BRW_OPCODE_MUL:
+         if (inst->src[1].file != IMM)
+            continue;
+
+         /* a * 1.0 = a */
+         if (inst->src[1].is_one()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+
+         /* a * -1.0 = -a */
+         if (inst->src[1].is_negative_one()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0].negate = !inst->src[0].negate;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+
+         /* a * 0.0 = 0.0 */
+         if (inst->src[1].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0] = inst->src[1];
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+
+         /* Both operands immediate: fold the product. */
+         if (inst->src[0].file == IMM) {
+            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0].f *= inst->src[1].f;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+         break;
+      case BRW_OPCODE_ADD:
+         if (inst->src[1].file != IMM)
+            continue;
+
+         /* a + 0.0 = a */
+         if (inst->src[1].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+
+         /* Both operands immediate: fold the sum. */
+         if (inst->src[0].file == IMM) {
+            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0].f += inst->src[1].f;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+         break;
+      case BRW_OPCODE_OR:
+         /* a | a = a */
+         if (inst->src[0].equals(inst->src[1])) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+         break;
+      case BRW_OPCODE_LRP:
+         /* Interpolating between two equal values yields that value. */
+         if (inst->src[1].equals(inst->src[2])) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0] = inst->src[1];
+            inst->src[1] = reg_undef;
+            inst->src[2] = reg_undef;
+            progress = true;
+            break;
+         }
+         break;
+      case BRW_OPCODE_CMP:
+         /* -|a| >= 0 holds exactly when a == 0. */
+         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
+             inst->src[0].abs &&
+             inst->src[0].negate &&
+             inst->src[1].is_zero()) {
+            inst->src[0].abs = false;
+            inst->src[0].negate = false;
+            inst->conditional_mod = BRW_CONDITIONAL_Z;
+            progress = true;
+            break;
+         }
+         break;
+      case BRW_OPCODE_SEL:
+         if (inst->src[0].equals(inst->src[1])) {
+            /* Both choices are the same, so the predicate is irrelevant. */
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = reg_undef;
+            inst->predicate = BRW_PREDICATE_NONE;
+            inst->predicate_inverse = false;
+            progress = true;
+         } else if (inst->saturate && inst->src[1].file == IMM) {
+            /* A saturating min against >= 1.0 or max against <= 0.0 is
+             * already covered by the saturate itself.
+             */
+            switch (inst->conditional_mod) {
+            case BRW_CONDITIONAL_LE:
+            case BRW_CONDITIONAL_L:
+               switch (inst->src[1].type) {
+               case BRW_REGISTER_TYPE_F:
+                  if (inst->src[1].f >= 1.0f) {
+                     inst->opcode = BRW_OPCODE_MOV;
+                     inst->src[1] = reg_undef;
+                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
+                     progress = true;
+                  }
+                  break;
+               default:
+                  break;
+               }
+               break;
+            case BRW_CONDITIONAL_GE:
+            case BRW_CONDITIONAL_G:
+               switch (inst->src[1].type) {
+               case BRW_REGISTER_TYPE_F:
+                  if (inst->src[1].f <= 0.0f) {
+                     inst->opcode = BRW_OPCODE_MOV;
+                     inst->src[1] = reg_undef;
+                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
+                     progress = true;
+                  }
+                  break;
+               default:
+                  break;
+               }
+               /* Explicit break: this case used to fall through into
+                * "default", which only worked because that label does
+                * nothing.
+                */
+               break;
+            default:
+               break;
+            }
+         }
+         break;
+      case BRW_OPCODE_MAD:
+         /* The rewrites below treat MAD as src0 + src1 * src2. */
+         if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = reg_undef;
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[0].is_zero()) {
+            inst->opcode = BRW_OPCODE_MUL;
+            inst->src[0] = inst->src[2];
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[1].is_one()) {
+            inst->opcode = BRW_OPCODE_ADD;
+            inst->src[1] = inst->src[2];
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[2].is_one()) {
+            inst->opcode = BRW_OPCODE_ADD;
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
+            inst->opcode = BRW_OPCODE_ADD;
+            inst->src[1].f *= inst->src[2].f;
+            inst->src[2] = reg_undef;
+            progress = true;
+         }
+         break;
+      case SHADER_OPCODE_BROADCAST:
+         if (is_uniform(inst->src[0])) {
+            /* Every channel already holds the same value. */
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->sources = 1;
+            inst->force_writemask_all = true;
+            progress = true;
+         } else if (inst->src[1].file == IMM) {
+            /* Constant channel index: read that component directly. */
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0] = component(inst->src[0],
+                                     inst->src[1].ud);
+            inst->sources = 1;
+            inst->force_writemask_all = true;
+            progress = true;
+         }
+         break;
+
+      default:
+         break;
+      }
+
+      /* Swap if src[0] is immediate.
+       *
+       * NOTE: progress is cumulative for the whole pass, so once any
+       * instruction has been rewritten this also runs for every later
+       * commutative instruction that reaches this point.
+       */
+      if (progress && inst->is_commutative()) {
+         if (inst->src[0].file == IMM) {
+            fs_reg tmp = inst->src[1];
+            inst->src[1] = inst->src[0];
+            inst->src[0] = tmp;
+         }
+      }
+   }
+   return progress;
+}
+
+/**
+ * Shorten sample messages whose trailing texture coordinates are constant
+ * zero.
+ *
+ * Trailing parameters that aren't sent default to zero anyway, so instead
+ * of reserving a register for them we can just reduce the message length.
+ * The dead code eliminator can then remove the MOV that would otherwise
+ * set up the zero value.
+ *
+ * \return true if any message was shortened.
+ */
+bool
+fs_visitor::opt_zero_samples()
+{
+   /* Gen4 infers the texturing opcode based on the message length so we can't
+    * change it.
+    */
+   if (devinfo->gen < 5)
+      return false;
+
+   bool progress = false;
+
+   foreach_block_and_inst(blk, fs_inst, tex, cfg) {
+      if (!tex->is_tex())
+         continue;
+
+      /* Only messages built by an immediately preceding LOAD_PAYLOAD are
+       * handled.
+       */
+      fs_inst *payload_inst = (fs_inst *) tex->prev;
+
+      if (payload_inst->is_head_sentinel() ||
+          payload_inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+         continue;
+
+      /* We don't want to remove the message header or the first parameter.
+       * Removing the first parameter is not allowed, see the Haswell PRM
+       * volume 7, page 149:
+       *
+       *     "Parameter 0 is required except for the sampleinfo message, which
+       *      has no parameter 0"
+       */
+      while (tex->mlen > tex->header_size + tex->exec_size / 8) {
+         /* LOAD_PAYLOAD source feeding the last parameter still sent. */
+         const int last_param =
+            (tex->mlen - tex->header_size) / (tex->exec_size / 8) +
+            tex->header_size - 1;
+
+         if (!payload_inst->src[last_param].is_zero())
+            break;
+
+         tex->mlen -= tex->exec_size / 8;
+         progress = true;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Optimize sample messages which are followed by the final RT write.
+ *
+ * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
+ * results sent directly to the framebuffer, bypassing the EU. Recognize the
+ * final texturing results copied to the framebuffer write payload and modify
+ * them to write to the framebuffer directly.
+ *
+ * Returns true when the texturing instruction was marked EOT and the final
+ * FB write was removed; returns false (leaving the program untouched) when
+ * any of the preconditions checked below does not hold.
+ */
+bool
+fs_visitor::opt_sampler_eot()
+{
+ brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+
+ /* Only fragment shaders end in a framebuffer write. */
+ if (stage != MESA_SHADER_FRAGMENT)
+ return false;
+
+ if (devinfo->gen < 9 && !devinfo->is_cherryview)
+ return false;
+
+ /* FINISHME: It should be possible to implement this optimization when there
+ * are multiple drawbuffers.
+ */
+ if (key->nr_color_regions != 1)
+ return false;
+
+ /* Requires emitting a bunch of saturating MOV instructions during logical
+ * send lowering to clamp the color payload, which the sampler unit isn't
+ * going to do for us.
+ */
+ if (key->clamp_fragment_color)
+ return false;
+
+ /* Look for a texturing instruction immediately before the final FB_WRITE. */
+ bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
+ fs_inst *fb_write = (fs_inst *)block->end();
+ assert(fb_write->eot);
+ assert(fb_write->opcode == FS_OPCODE_FB_WRITE_LOGICAL);
+
+ /* There wasn't one; nothing to do. */
+ if (unlikely(fb_write->prev->is_head_sentinel()))
+ return false;
+
+ fs_inst *tex_inst = (fs_inst *) fb_write->prev;
+
+ /* 3D Sampler » Messages » Message Format
+ *
+ * “Response Length of zero is allowed on all SIMD8* and SIMD16* sampler
+ * messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*”
+ *
+ * Hence only the logical texturing opcodes listed here are accepted.
+ */
+ if (tex_inst->opcode != SHADER_OPCODE_TEX_LOGICAL &&
+ tex_inst->opcode != SHADER_OPCODE_TXD_LOGICAL &&
+ tex_inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
+ tex_inst->opcode != SHADER_OPCODE_TXL_LOGICAL &&
+ tex_inst->opcode != FS_OPCODE_TXB_LOGICAL &&
+ tex_inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL &&
+ tex_inst->opcode != SHADER_OPCODE_TXF_CMS_W_LOGICAL &&
+ tex_inst->opcode != SHADER_OPCODE_TXF_UMS_LOGICAL)
+ return false;
+
+ /* XXX - This shouldn't be necessary. */
+ if (tex_inst->prev->is_head_sentinel())
+ return false;
+
+ /* Check that the FB write sources are fully initialized by the single
+ * texturing instruction: COLOR0 must be exactly the texture result, the
+ * COMPONENTS source is ignored, and every other source must be unset.
+ */
+ for (unsigned i = 0; i < FB_WRITE_LOGICAL_NUM_SRCS; i++) {
+ if (i == FB_WRITE_LOGICAL_SRC_COLOR0) {
+ if (!fb_write->src[i].equals(tex_inst->dst) ||
+ fb_write->size_read(i) != tex_inst->size_written)
+ return false;
+ } else if (i != FB_WRITE_LOGICAL_SRC_COMPONENTS) {
+ if (fb_write->src[i].file != BAD_FILE)
+ return false;
+ }
+ }
+
+ assert(!tex_inst->eot); /* We can't get here twice */
+ /* The top byte of the offset is about to be used for the render target. */
+ assert((tex_inst->offset & (0xff << 24)) == 0);
+
+ const fs_builder ibld(this, block, tex_inst);
+
+ /* Stash the render target index in bits 31:24 of the offset, mark the
+ * texturing instruction as the end of thread, and drop its now unused
+ * destination before deleting the FB write.
+ */
+ tex_inst->offset |= fb_write->target << 24;
+ tex_inst->eot = true;
+ tex_inst->dst = ibld.null_reg_ud();
+ tex_inst->size_written = 0;
+ fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
+
+ /* Marking EOT is sufficient, lower_logical_sends() will notice the EOT
+ * flag and submit a header together with the sampler message as required
+ * by the hardware.
+ */
+ invalidate_live_intervals();
+ return true;
+}
+
+/**
+ * Give each complete redefinition of a VGRF outside of control flow a fresh
+ * register number, and rewrite the later uses to match.
+ *
+ * remap[] is indexed by the original VGRF number: -1 means the register has
+ * not been fully written yet, otherwise it holds the number currently
+ * standing in for it.
+ *
+ * Returns true if any register number was rewritten.
+ */
+bool
+fs_visitor::opt_register_renaming()
+{
+   bool progress = false;
+   int depth = 0;
+
+   int remap[alloc.count];
+   memset(remap, -1, sizeof(int) * alloc.count);
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      /* Track the control flow nesting level. */
+      switch (inst->opcode) {
+      case BRW_OPCODE_IF:
+      case BRW_OPCODE_DO:
+         depth++;
+         break;
+      case BRW_OPCODE_ENDIF:
+      case BRW_OPCODE_WHILE:
+         depth--;
+         break;
+      default:
+         break;
+      }
+
+      /* Rewrite instruction sources. */
+      for (int s = 0; s < inst->sources; s++) {
+         fs_reg &src = inst->src[s];
+
+         if (src.file != VGRF)
+            continue;
+
+         if (remap[src.nr] != -1 && remap[src.nr] != src.nr) {
+            src.nr = remap[src.nr];
+            progress = true;
+         }
+      }
+
+      if (inst->dst.file != VGRF)
+         continue;
+
+      const int old_nr = inst->dst.nr;
+
+      /* A write that covers the whole register outside of control flow
+       * starts a new value.
+       */
+      const bool starts_new_value =
+         depth == 0 &&
+         alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
+         !inst->is_partial_write();
+
+      if (starts_new_value) {
+         if (remap[old_nr] == -1) {
+            /* First definition keeps the original number. */
+            remap[old_nr] = old_nr;
+         } else {
+            remap[old_nr] = alloc.allocate(regs_written(inst));
+            inst->dst.nr = remap[old_nr];
+            progress = true;
+         }
+      } else if (remap[old_nr] != -1 && remap[old_nr] != old_nr) {
+         /* Any other write follows the most recent renaming. */
+         inst->dst.nr = remap[old_nr];
+         progress = true;
+      }
+   }
+
+   if (!progress)
+      return progress;
+
+   invalidate_live_intervals();
+
+   /* Keep the cached delta_xy registers in sync with the renaming. */
+   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
+      if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != -1) {
+         delta_xy[i].nr = remap[delta_xy[i].nr];
+      }
+   }
+
+   return progress;
+}
+
+/**
+ * Drop discard jumps that sit directly in front of the placeholder halt.
+ *
+ * In a sequence such as
+ *
+ *    discard-jump       (redundant with the next jump)
+ *    discard-jump       (useless; jumps to the next instruction)
+ *    placeholder-halt
+ *
+ * both jumps can be removed.
+ *
+ * Returns true if at least one jump was deleted.
+ */
+bool
+fs_visitor::opt_redundant_discard_jumps()
+{
+   bblock_t *final_block = cfg->blocks[cfg->num_blocks - 1];
+
+   /* Locate the placeholder halt, searching the last block backwards. */
+   fs_inst *halt = NULL;
+   foreach_inst_in_block_reverse(fs_inst, inst, final_block) {
+      if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
+         halt = inst;
+         break;
+      }
+   }
+
+   if (halt == NULL)
+      return false;
+
+   bool progress = false;
+
+   /* Delete any HALTs immediately before the placeholder halt. */
+   while (true) {
+      fs_inst *candidate = (fs_inst *) halt->prev;
+
+      if (candidate->is_head_sentinel() ||
+          candidate->opcode != FS_OPCODE_DISCARD_JUMP)
+         break;
+
+      candidate->remove(final_block);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Compute a bitmask with GRF granularity with a bit set for each GRF starting
+ * from \p r.offset which overlaps the region starting at \p s.offset and
+ * spanning \p ds bytes.
+ *
+ * \p r and \p s must live in the same register space and \p s must not start
+ * before \p r (both are asserted).
+ */
+static inline unsigned
+mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
+{
+   const int rel_offset = reg_offset(s) - reg_offset(r);
+   const int shift = rel_offset / REG_SIZE;
+   const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
+   assert(reg_space(r) == reg_space(s) &&
+          shift >= 0 && shift < int(8 * sizeof(unsigned)));
+
+   /* Build the mask in unsigned arithmetic: shifting the signed literal 1
+    * into or past the sign bit is undefined behaviour, and so is shifting by
+    * the full width of the type.
+    */
+   const unsigned width = 8 * sizeof(unsigned);
+   const unsigned span = n >= width ? ~0u : (1u << n) - 1u;
+
+   return span << shift;
+}
+
+/**
+ * Try to eliminate MOVs from a VGRF into an MRF by making the instructions
+ * that computed the VGRF value write to the MRF directly.
+ *
+ * For each candidate MOV the block is scanned backwards twice: the first
+ * scan checks that every GRF of the source region has a generating
+ * instruction that can legally be retargeted, the second scan performs the
+ * rewrite. The MOV itself is then removed.
+ *
+ * Returns true if any MOV was removed.
+ */
+bool
+fs_visitor::compute_to_mrf()
+{
+ bool progress = false;
+ int next_ip = 0;
+
+ /* No MRFs on Gen >= 7. */
+ if (devinfo->gen >= 7)
+ return false;
+
+ calculate_live_intervals();
+
+ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+ /* ip counts instructions in program order so that it can be compared
+ * against virtual_grf_end[] below.
+ */
+ int ip = next_ip;
+ next_ip++;
+
+ /* Only plain, complete, same-type, unmodified MOVs from a contiguous,
+ * register-aligned VGRF region into an MRF are candidates.
+ */
+ if (inst->opcode != BRW_OPCODE_MOV ||
+ inst->is_partial_write() ||
+ inst->dst.file != MRF || inst->src[0].file != VGRF ||
+ inst->dst.type != inst->src[0].type ||
+ inst->src[0].abs || inst->src[0].negate ||
+ !inst->src[0].is_contiguous() ||
+ inst->src[0].offset % REG_SIZE != 0)
+ continue;
+
+ /* Can't compute-to-MRF this GRF if someone else was going to
+ * read it later.
+ */
+ if (this->virtual_grf_end[inst->src[0].nr] > ip)
+ continue;
+
+ /* Found a move of a GRF to a MRF. Let's see if we can go rewrite the
+ * things that computed the value of all GRFs of the source region. The
+ * regs_left bitset keeps track of the registers we haven't yet found a
+ * generating instruction for.
+ */
+ unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
+
+ /* First pass: verify that the rewrite is possible. */
+ foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+ if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+ inst->src[0], inst->size_read(0))) {
+ /* Found the last thing to write our reg we want to turn
+ * into a compute-to-MRF.
+ */
+
+ /* If this one instruction didn't populate all the
+ * channels, bail. We might be able to rewrite everything
+ * that writes that reg, but it would require smarter
+ * tracking.
+ */
+ if (scan_inst->is_partial_write())
+ break;
+
+ /* Handling things not fully contained in the source of the copy
+ * would need us to understand coalescing out more than one MOV at
+ * a time.
+ */
+ if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
+ inst->src[0], inst->size_read(0)))
+ break;
+
+ /* SEND instructions can't have MRF as a destination. */
+ if (scan_inst->mlen)
+ break;
+
+ if (devinfo->gen == 6) {
+ /* gen6 math instructions must have the destination be
+ * GRF, so no compute-to-MRF for them.
+ */
+ if (scan_inst->is_math()) {
+ break;
+ }
+ }
+
+ /* Clear the bits for any registers this instruction overwrites. */
+ regs_left &= ~mask_relative_to(
+ inst->src[0], scan_inst->dst, scan_inst->size_written);
+ if (!regs_left)
+ break;
+ }
+
+ /* We don't handle control flow here. Most computation of
+ * values that end up in MRFs are shortly before the MRF
+ * write anyway.
+ */
+ if (block->start() == scan_inst)
+ break;
+
+ /* You can't read from an MRF, so if someone else reads our
+ * MRF's source GRF that we wanted to rewrite, that stops us.
+ */
+ bool interfered = false;
+ for (int i = 0; i < scan_inst->sources; i++) {
+ if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
+ inst->src[0], inst->size_read(0))) {
+ interfered = true;
+ }
+ }
+ if (interfered)
+ break;
+
+ if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+ inst->dst, inst->size_written)) {
+ /* If somebody else writes our MRF here, we can't
+ * compute-to-MRF before that.
+ */
+ break;
+ }
+
+ if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
+ regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
+ inst->dst, inst->size_written)) {
+ /* Found a SEND instruction, which means that there are
+ * live values in MRFs from base_mrf to base_mrf +
+ * scan_inst->mlen - 1. Don't go pushing our MRF write up
+ * above it.
+ */
+ break;
+ }
+ }
+
+ /* Some GRF of the source has no usable generating instruction. */
+ if (regs_left)
+ continue;
+
+ /* Found all generating instructions of our MRF's source value, so it
+ * should be safe to rewrite them to point to the MRF directly.
+ */
+ regs_left = (1 << regs_read(inst, 0)) - 1;
+
+ /* Second pass: retarget each generating instruction to the MRF. */
+ foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+ if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+ inst->src[0], inst->size_read(0))) {
+ /* Clear the bits for any registers this instruction overwrites. */
+ regs_left &= ~mask_relative_to(
+ inst->src[0], scan_inst->dst, scan_inst->size_written);
+
+ /* Byte offset of this write within the MOV's source region. */
+ const unsigned rel_offset = reg_offset(scan_inst->dst) -
+ reg_offset(inst->src[0]);
+
+ if (inst->dst.nr & BRW_MRF_COMPR4) {
+ /* Apply the same address transformation done by the hardware
+ * for COMPR4 MRF writes.
+ */
+ assert(rel_offset < 2 * REG_SIZE);
+ scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
+
+ /* Clear the COMPR4 bit if the generating instruction is not
+ * compressed.
+ */
+ if (scan_inst->size_written < 2 * REG_SIZE)
+ scan_inst->dst.nr &= ~BRW_MRF_COMPR4;
+
+ } else {
+ /* Calculate the MRF number the result of this instruction is
+ * ultimately written to.
+ */
+ scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
+ }
+
+ scan_inst->dst.file = MRF;
+ scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
+ /* Carry over the saturate modifier of the MOV being removed. */
+ scan_inst->saturate |= inst->saturate;
+ if (!regs_left)
+ break;
+ }
+ }
+
+ /* The first pass guaranteed that every register has a generator. */
+ assert(!regs_left);
+ inst->remove(block);
+ progress = true;
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
+/**
+ * Replace FIND_LIVE_CHANNEL instructions that occur outside of any control
+ * flow with a MOV of the constant zero. Some form of divergence analysis
+ * could probably do better.
+ *
+ * Returns true if any instruction was replaced.
+ */
+bool
+fs_visitor::eliminate_find_live_channel()
+{
+   /* The optimization below assumes that channel zero is live on thread
+    * dispatch, which may not be the case if the fixed function dispatches
+    * threads sparsely.
+    */
+   if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data))
+      return false;
+
+   bool progress = false;
+   unsigned depth = 0;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      if (inst->opcode == BRW_OPCODE_IF ||
+          inst->opcode == BRW_OPCODE_DO) {
+         depth++;
+      } else if (inst->opcode == BRW_OPCODE_ENDIF ||
+                 inst->opcode == BRW_OPCODE_WHILE) {
+         depth--;
+      } else if (inst->opcode == FS_OPCODE_DISCARD_JUMP) {
+         /* This can potentially make control flow non-uniform until the end
+          * of the program.
+          */
+         return progress;
+      } else if (inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL &&
+                 depth == 0) {
+         /* Outside of control flow the first live channel is channel 0. */
+         inst->opcode = BRW_OPCODE_MOV;
+         inst->src[0] = brw_imm_ud(0u);
+         inst->sources = 1;
+         inst->force_writemask_all = true;
+         progress = true;
+      }
+   }
+
+   return progress;
+}
+
+/**
+ * Emit the body of a shader built around FS_OPCODE_REP_FB_WRITE.
+ *
+ * A single 4-channel MOV loads the color into the message register
+ * color_mrf, either from uniform 0 or, when there are no uniforms, from a
+ * fixed GRF region. One FS_OPCODE_REP_FB_WRITE is then emitted per color
+ * region and the last one is flagged as end of thread. Finally the CFG is
+ * built and constant locations / CURB setup are assigned.
+ */
+void
+fs_visitor::emit_repclear_shader()
+{
+ brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+ int base_mrf = 0;
+ int color_mrf = base_mrf + 2;
+ fs_inst *mov;
+
+ if (uniforms > 0) {
+ mov = bld.exec_all().group(4, 0)
+ .MOV(brw_message_reg(color_mrf),
+ fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+ } else {
+ /* NOTE(review): presumably this region describes where the color
+ * arrives in the thread payload when it isn't a uniform - confirm.
+ */
+ struct brw_reg reg =
+ brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
+ BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+
+ mov = bld.exec_all().group(4, 0)
+ .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
+ }
+
+ fs_inst *write;
+ if (key->nr_color_regions == 1) {
+ /* Single render target: header-less, one-register message starting at
+ * the color register.
+ */
+ write = bld.emit(FS_OPCODE_REP_FB_WRITE);
+ write->saturate = key->clamp_fragment_color;
+ write->base_mrf = color_mrf;
+ write->target = 0;
+ write->header_size = 0;
+ write->mlen = 1;
+ } else {
+ /* Multiple render targets: one write per target, each with a
+ * two-register header followed by the color register. The assume()
+ * guarantees the loop runs at least once so that `write` is set.
+ */
+ assume(key->nr_color_regions > 0);
+ for (int i = 0; i < key->nr_color_regions; ++i) {
+ write = bld.emit(FS_OPCODE_REP_FB_WRITE);
+ write->saturate = key->clamp_fragment_color;
+ write->base_mrf = base_mrf;
+ write->target = i;
+ write->header_size = 2;
+ write->mlen = 3;
+ }
+ }
+ /* Only the last write emitted terminates the thread. */
+ write->eot = true;
+
+ calculate_cfg();
+
+ assign_constant_locations();
+ assign_curb_setup();
+
+ /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
+ if (uniforms > 0) {
+ assert(mov->src[0].file == FIXED_GRF);
+ mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+ }
+}
+
+/**
+ * Walks through basic blocks, looking for repeated MRF writes and
+ * removing the later ones.
+ *
+ * last_mrf_move[n] remembers the most recent complete MOV into MRF n whose
+ * value is still known to be intact; a later MOV that equals() it is
+ * redundant and gets removed. Nothing is done for dispatch widths of 16 or
+ * more.
+ *
+ * Returns true if any instruction was removed.
+ */
+bool
+fs_visitor::remove_duplicate_mrf_writes()
+{
+ fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->gen)];
+ bool progress = false;
+
+ /* Need to update the MRF tracking for compressed instructions. */
+ if (dispatch_width >= 16)
+ return false;
+
+ memset(last_mrf_move, 0, sizeof(last_mrf_move));
+
+ foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+ /* Forget everything across control flow. */
+ if (inst->is_control_flow()) {
+ memset(last_mrf_move, 0, sizeof(last_mrf_move));
+ }
+
+ /* An MRF MOV identical to the recorded one is redundant. */
+ if (inst->opcode == BRW_OPCODE_MOV &&
+ inst->dst.file == MRF) {
+ fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
+ if (prev_inst && inst->equals(prev_inst)) {
+ inst->remove(block);
+ progress = true;
+ continue;
+ }
+ }
+
+ /* Clear out the last-write records for MRFs that were overwritten. */
+ if (inst->dst.file == MRF) {
+ last_mrf_move[inst->dst.nr] = NULL;
+ }
+
+ if (inst->mlen > 0 && inst->base_mrf != -1) {
+ /* Found a SEND instruction, which will include two or fewer
+ * implied MRF writes. We could do better here.
+ */
+ for (int i = 0; i < implied_mrf_writes(inst); i++) {
+ last_mrf_move[inst->base_mrf + i] = NULL;
+ }
+ }
+
+ /* Clear out any MRF move records whose sources got overwritten. */
+ for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
+ if (last_mrf_move[i] &&
+ regions_overlap(inst->dst, inst->size_written,
+ last_mrf_move[i]->src[0],
+ last_mrf_move[i]->size_read(0))) {
+ last_mrf_move[i] = NULL;
+ }
+ }
+
+ /* Record this instruction if it is a complete MOV into an MRF from
+ * anything other than an ARF source.
+ */
+ if (inst->opcode == BRW_OPCODE_MOV &&
+ inst->dst.file == MRF &&
+ inst->src[0].file != ARF &&
+ !inst->is_partial_write()) {
+ last_mrf_move[inst->dst.nr] = inst;
+ }
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
+/**
+ * Clear the dependency flags of every register in
+ * [first_grf, first_grf + grf_len) that \p inst reads as a source.
+ *
+ * \p deps is indexed relative to \p first_grf. For 16-wide instructions the
+ * flag of the following register is cleared as well.
+ */
+static void
+clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
+{
+   /* Clear the flag for registers that actually got read (as expected). */
+   for (int s = 0; s < inst->sources; s++) {
+      const fs_reg &src = inst->src[s];
+
+      if (src.file != VGRF && src.file != FIXED_GRF)
+         continue;
+
+      const int grf = src.nr;
+
+      if (grf < first_grf || grf >= first_grf + grf_len)
+         continue;
+
+      bool *flag = &deps[grf - first_grf];
+      flag[0] = false;
+      if (inst->exec_size == 16)
+         flag[1] = false;
+   }
+}
+
+/**
+ * Implements this workaround for the original 965:
+ *
+ * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
+ * check for post destination dependencies on this instruction, software
+ * must ensure that there is no destination hazard for the case of ‘write
+ * followed by a posted write’ shown in the following example.
+ *
+ * 1. mov r3 0
+ * 2. send r3.xy <rest of send instruction>
+ * 3. mov r2 r3
+ *
+ * Due to no post-destination dependency check on the ‘send’, the above
+ * code sequence could have two instructions (1 and 2) in flight at the
+ * same time that both consider ‘r3’ as the target of their final writes."
+ *
+ * \param block  the basic block containing \p inst
+ * \param inst   the SEND-like instruction whose destination registers need
+ *               their pending writes resolved before it executes
+ *
+ * needs_dep[i] is true while register first_write_grf + i may still have an
+ * outstanding write that has not been read back.
+ */
+void
+fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
+ fs_inst *inst)
+{
+ int write_len = regs_written(inst);
+ int first_write_grf = inst->dst.nr;
+ bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
+ assert(write_len < (int)sizeof(needs_dep) - 1);
+
+ /* Initially every register written by the send needs resolving. */
+ memset(needs_dep, false, sizeof(needs_dep));
+ memset(needs_dep, true, write_len);
+
+ /* Registers the send itself reads are already resolved. */
+ clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
+
+ /* Walk backwards looking for writes to registers we're writing which
+ * aren't read since being written. If we hit the start of the program,
+ * we assume that there are no outstanding dependencies on entry to the
+ * program.
+ */
+ foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+ /* If we hit control flow, assume that there *are* outstanding
+ * dependencies, and force their cleanup before our instruction.
+ */
+ if (block->start() == scan_inst && block->num != 0) {
+ for (int i = 0; i < write_len; i++) {
+ if (needs_dep[i])
+ DEP_RESOLVE_MOV(fs_builder(this, block, inst),
+ first_write_grf + i);
+ }
+ return;
+ }
+
+ /* We insert our reads as late as possible on the assumption that any
+ * instruction but a MOV that might have left us an outstanding
+ * dependency has more latency than a MOV.
+ */
+ if (scan_inst->dst.file == VGRF) {
+ for (unsigned i = 0; i < regs_written(scan_inst); i++) {
+ int reg = scan_inst->dst.nr + i;
+
+ if (reg >= first_write_grf &&
+ reg < first_write_grf + write_len &&
+ needs_dep[reg - first_write_grf]) {
+ DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
+ needs_dep[reg - first_write_grf] = false;
+ if (scan_inst->exec_size == 16)
+ needs_dep[reg - first_write_grf + 1] = false;
+ }
+ }
+ }
+
+ /* Clear the flag for registers that actually got read (as expected). */
+ clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
+
+ /* Continue the loop only if we haven't resolved all the dependencies */
+ int i;
+ for (i = 0; i < write_len; i++) {
+ if (needs_dep[i])
+ break;
+ }
+ if (i == write_len)
+ return;
+ }
+}
+
+/**
+ * Implements this workaround for the original 965:
+ *
+ * "[DevBW, DevCL] Errata: A destination register from a send can not be
+ * used as a destination register until after it has been sourced by an
+ * instruction with a different destination register."
+ *
+ * \param block  the basic block containing \p inst
+ * \param inst   the SEND-like instruction whose destination registers must
+ *               be read before they are written again
+ *
+ * needs_dep[i] is true while register first_write_grf + i has not been read
+ * since the send.
+ */
+void
+fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
+{
+ int write_len = regs_written(inst);
+ int first_write_grf = inst->dst.nr;
+ bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
+ assert(write_len < (int)sizeof(needs_dep) - 1);
+
+ /* Initially every register written by the send needs resolving. */
+ memset(needs_dep, false, sizeof(needs_dep));
+ memset(needs_dep, true, write_len);
+ /* Walk forwards looking for writes to registers we're writing which aren't
+ * read before being written.
+ */
+ foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
+ /* If we hit control flow, force resolve all remaining dependencies. */
+ if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
+ for (int i = 0; i < write_len; i++) {
+ if (needs_dep[i])
+ DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+ first_write_grf + i);
+ }
+ return;
+ }
+
+ /* Clear the flag for registers that actually got read (as expected). */
+ clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
+
+ /* We insert our reads as late as possible since they're reading the
+ * result of a SEND, which has massive latency.
+ */
+ if (scan_inst->dst.file == VGRF &&
+ scan_inst->dst.nr >= first_write_grf &&
+ scan_inst->dst.nr < first_write_grf + write_len &&
+ needs_dep[scan_inst->dst.nr - first_write_grf]) {
+ DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+ scan_inst->dst.nr);
+ needs_dep[scan_inst->dst.nr - first_write_grf] = false;
+ }
+
+ /* Continue the loop only if we haven't resolved all the dependencies */
+ int i;
+ for (i = 0; i < write_len; i++) {
+ if (needs_dep[i])
+ break;
+ }
+ if (i == write_len)
+ return;
+ }
+}
+
+/**
+ * Apply the pre- and post-send dependency workarounds to every message
+ * instruction that writes a VGRF. Only Gen4 parts other than G4X need them.
+ */
+void
+fs_visitor::insert_gen4_send_dependency_workarounds()
+{
+   const bool needs_workaround = devinfo->gen == 4 && !devinfo->is_g4x;
+   if (!needs_workaround)
+      return;
+
+   bool progress = false;
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      /* Only instructions that send a message and write back to a VGRF
+       * are affected.
+       */
+      if (inst->mlen == 0 || inst->dst.file != VGRF)
+         continue;
+
+      insert_gen4_pre_send_dependency_workarounds(block, inst);
+      insert_gen4_post_send_dependency_workarounds(block, inst);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+}
+
+/**
+ * Lower the generic, expression-style uniform pull constant load into the
+ * hardware-specific form.
+ *
+ * Keeping the expression style until now lets the CSE pass merge repeated
+ * loads from the same offset and leaves the pre-register-allocation
+ * scheduler full freedom, while the native form gives the
+ * post-register-allocation scheduler the best information possible.
+ *
+ * Note that execution masking for setting up pull constant loads is special:
+ * the channels that need to be written are unrelated to the current execution
+ * mask, since a later instruction will use one of the result channels as a
+ * source operand for all 8 or 16 of its channels.
+ */
+void
+fs_visitor::lower_uniform_pull_constant_loads()
+{
+   foreach_block_and_inst (block, fs_inst, inst, cfg) {
+      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
+         continue;
+
+      if (devinfo->gen < 7) {
+         /* Before register allocation, we didn't tell the scheduler about the
+          * MRF we use.  We know it's safe to use this MRF because nothing
+          * else does except for register spill/unspill, which generates and
+          * uses its MRF within a single IR instruction.
+          */
+         inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
+         inst->mlen = 1;
+         continue;
+      }
+
+      /* Gen7+: build a one-register message header in a fresh VGRF. It is a
+       * copy of g0 with component 2 replaced by the load offset divided
+       * by 16.
+       */
+      const fs_builder ubld = fs_builder(this, block, inst).exec_all();
+      const fs_builder ubld8 = ubld.group(8, 0);
+      const fs_reg payload = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
+
+      ubld8.MOV(payload, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+      ubld.group(1, 0).MOV(component(payload, 2),
+                           brw_imm_ud(inst->src[1].ud / 16));
+
+      inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
+      inst->src[1] = payload;
+      inst->header_size = 1;
+      inst->mlen = 1;
+
+      invalidate_live_intervals();
+   }
+}
+
+/**
+ * Lower each SHADER_OPCODE_LOAD_PAYLOAD into a sequence of MOVs, one per
+ * source that is not BAD_FILE, and remove the original instruction.
+ *
+ * The first header_size sources are copied with 8-wide, exec_all UD MOVs;
+ * the remaining sources are copied with MOVs of the instruction's own width
+ * and each source's type. A 16-wide payload written to a COMPR4 MRF gets
+ * its first four non-header sources interleaved as described below.
+ *
+ * Returns true if any LOAD_PAYLOAD was lowered.
+ */
+bool
+fs_visitor::lower_load_payload()
+{
+ bool progress = false;
+
+ foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+ if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+ continue;
+
+ assert(inst->dst.file == MRF || inst->dst.file == VGRF);
+ assert(inst->saturate == false);
+ /* dst is a running cursor over the destination registers. */
+ fs_reg dst = inst->dst;
+
+ /* Get rid of COMPR4. We'll add it back in if we need it */
+ if (dst.file == MRF)
+ dst.nr = dst.nr & ~BRW_MRF_COMPR4;
+
+ /* ibld emits at the instruction's own width; hbld is used for the
+ * header registers, which are always copied 8-wide with exec_all.
+ */
+ const fs_builder ibld(this, block, inst);
+ const fs_builder hbld = ibld.exec_all().group(8, 0);
+
+ for (uint8_t i = 0; i < inst->header_size; i++) {
+ if (inst->src[i].file != BAD_FILE) {
+ fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
+ fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
+ hbld.MOV(mov_dst, mov_src);
+ }
+ dst = offset(dst, hbld, 1);
+ }
+
+ if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
+ inst->exec_size > 8) {
+ /* In this case, the payload portion of the LOAD_PAYLOAD isn't
+ * a straightforward copy. Instead, the result of the
+ * LOAD_PAYLOAD is treated as interleaved and the first four
+ * non-header sources are unpacked as:
+ *
+ * m + 0: r0
+ * m + 1: g0
+ * m + 2: b0
+ * m + 3: a0
+ * m + 4: r1
+ * m + 5: g1
+ * m + 6: b1
+ * m + 7: a1
+ *
+ * This is used for gen <= 5 fb writes.
+ */
+ assert(inst->exec_size == 16);
+ assert(inst->header_size + 4 <= inst->sources);
+ for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
+ if (inst->src[i].file != BAD_FILE) {
+ if (devinfo->has_compr4) {
+ fs_reg compr4_dst = retype(dst, inst->src[i].type);
+ compr4_dst.nr |= BRW_MRF_COMPR4;
+ ibld.MOV(compr4_dst, inst->src[i]);
+ } else {
+ /* Platform doesn't have COMPR4. We have to fake it */
+ fs_reg mov_dst = retype(dst, inst->src[i].type);
+ ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
+ mov_dst.nr += 4;
+ ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
+ }
+ }
+
+ dst.nr++;
+ }
+
+ /* The loop above only ever incremented us through the first set
+ * of 4 registers. However, thanks to the magic of COMPR4, we
+ * actually wrote to the first 8 registers, so we need to take
+ * that into account now.
+ */
+ dst.nr += 4;
+
+ /* The COMPR4 code took care of the first 4 sources. We'll let
+ * the regular path handle any remaining sources. Yes, we are
+ * modifying the instruction but we're about to delete it so
+ * this really doesn't hurt anything.
+ */
+ inst->header_size += 4;
+ }
+
+ /* Copy the remaining (non-header) sources one after another. */
+ for (uint8_t i = inst->header_size; i < inst->sources; i++) {
+ if (inst->src[i].file != BAD_FILE)
+ ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
+ dst = offset(dst, ibld, 1);
+ }
+
+ inst->remove(block);
+ progress = true;
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
+/**
+ * Lower 32-bit integer MUL and SHADER_OPCODE_MULH into sequences the
+ * hardware can execute.
+ *
+ * MUL with a D/UD destination is left alone on Gen8+ parts other than
+ * CHV/BXT. Otherwise it becomes either a single MUL by a 16-bit immediate
+ * or two 32x16 multiplies whose partial products are combined with an ADD.
+ * MULH becomes a MUL into the accumulator followed by a MACH.
+ *
+ * Returns true if any instruction was lowered.
+ */
+bool
+fs_visitor::lower_integer_multiplication()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      const fs_builder ibld(this, block, inst);
+
+      if (inst->opcode == BRW_OPCODE_MUL) {
+         if (inst->dst.is_accumulator() ||
+             (inst->dst.type != BRW_REGISTER_TYPE_D &&
+              inst->dst.type != BRW_REGISTER_TYPE_UD))
+            continue;
+
+         /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
+          * operation directly, but CHV/BXT cannot.
+          */
+         if (devinfo->gen >= 8 &&
+             !devinfo->is_cherryview && !devinfo->is_broxton)
+            continue;
+
+         if (inst->src[1].file == IMM &&
+             inst->src[1].ud < (1 << 16)) {
+            /* The MUL instruction isn't commutative. On Gen <= 6, only the low
+             * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
+             * src1 are used.
+             *
+             * If multiplying by an immediate value that fits in 16-bits, do a
+             * single MUL instruction with that value in the proper location.
+             */
+            if (devinfo->gen < 7) {
+               fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8),
+                          inst->dst.type);
+               ibld.MOV(imm, inst->src[1]);
+               ibld.MUL(inst->dst, imm, inst->src[0]);
+            } else {
+               const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD);
+
+               /* A signed immediate in [0x8000, 0xffff] passes the range
+                * check above but does not fit in a signed word: emitting it
+                * as W would turn it into a negative factor.  Emit it as an
+                * unsigned word instead, which represents it exactly.
+                */
+               const bool fits_in_w = inst->src[1].ud <= 0x7fff;
+
+               ibld.MUL(inst->dst, inst->src[0],
+                        (ud || !fits_in_w) ? brw_imm_uw(inst->src[1].ud)
+                                           : brw_imm_w(inst->src[1].d));
+            }
+         } else {
+            /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
+             * do 32-bit integer multiplication in one instruction, but instead
+             * must do a sequence (which actually calculates a 64-bit result):
+             *
+             *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
+             *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
+             *    mov(8)  g2<1>D     acc0<8,8,1>D
+             *
+             * But on Gen > 6, the ability to use second accumulator register
+             * (acc1) for non-float data types was removed, preventing a simple
+             * implementation in SIMD16. A 16-channel result can be calculated by
+             * executing the three instructions twice in SIMD8, once with quarter
+             * control of 1Q for the first eight channels and again with 2Q for
+             * the second eight channels.
+             *
+             * Which accumulator register is implicitly accessed (by AccWrEnable
+             * for instance) is determined by the quarter control. Unfortunately
+             * Ivybridge (and presumably Baytrail) has a hardware bug in which an
+             * implicit accumulator access by an instruction with 2Q will access
+             * acc1 regardless of whether the data type is usable in acc1.
+             *
+             * Specifically, the 2Q mach(8) writes acc1 which does not exist for
+             * integer data types.
+             *
+             * Since we only want the low 32-bits of the result, we can do two
+             * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
+             * adjust the high result and add them (like the mach is doing):
+             *
+             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
+             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
+             *    shl(8)  g9<1>D     g8<8,8,1>D      16D
+             *    add(8)  g2<1>D     g7<8,8,1>D      g9<8,8,1>D
+             *
+             * We avoid the shl instruction by realizing that we only want to add
+             * the low 16-bits of the "high" result to the high 16-bits of the
+             * "low" result and using proper regioning on the add:
+             *
+             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
+             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
+             *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
+             *
+             * Since it does not use the (single) accumulator register, we can
+             * schedule multi-component multiplications much better.
+             */
+
+            fs_reg orig_dst = inst->dst;
+
+            /* Whether the result has to be copied from a temporary back to
+             * the original destination once the sequence is complete.
+             */
+            bool needs_mov = orig_dst.file == MRF;
+
+            if (orig_dst.is_null() || orig_dst.file == MRF) {
+               inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
+                                  inst->dst.type);
+            } else if (regions_overlap(inst->dst, inst->size_written,
+                                       inst->src[0], inst->size_read(0)) ||
+                       regions_overlap(inst->dst, inst->size_written,
+                                       inst->src[1], inst->size_read(1))) {
+               /* The first MUL below writes the destination before the
+                * second MUL has read its sources.  If the destination
+                * aliases a source, compute into a temporary so the second
+                * MUL still sees the original operand.
+                */
+               inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
+                                  inst->dst.type);
+               needs_mov = true;
+            }
+
+            fs_reg low = inst->dst;
+            fs_reg high(VGRF, alloc.allocate(dispatch_width / 8),
+                        inst->dst.type);
+
+            if (devinfo->gen >= 7) {
+               if (inst->src[1].file == IMM) {
+                  ibld.MUL(low, inst->src[0],
+                           brw_imm_uw(inst->src[1].ud & 0xffff));
+                  ibld.MUL(high, inst->src[0],
+                           brw_imm_uw(inst->src[1].ud >> 16));
+               } else {
+                  ibld.MUL(low, inst->src[0],
+                           subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
+                  ibld.MUL(high, inst->src[0],
+                           subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1));
+               }
+            } else {
+               ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0),
+                        inst->src[1]);
+               ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1),
+                        inst->src[1]);
+            }
+
+            /* Add the low word of "high" into the high word of "low". */
+            ibld.ADD(subscript(inst->dst, BRW_REGISTER_TYPE_UW, 1),
+                     subscript(low, BRW_REGISTER_TYPE_UW, 1),
+                     subscript(high, BRW_REGISTER_TYPE_UW, 0));
+
+            /* The final MOV both delivers the result to the original
+             * destination (when a temporary was used) and carries the
+             * conditional modifier of the original instruction.
+             */
+            if (inst->conditional_mod || needs_mov) {
+               set_condmod(inst->conditional_mod,
+                           ibld.MOV(orig_dst, inst->dst));
+            }
+         }
+
+      } else if (inst->opcode == SHADER_OPCODE_MULH) {
+         /* Should have been lowered to 8-wide. */
+         assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst));
+         const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
+                                   inst->dst.type);
+         fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
+         fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
+
+         if (devinfo->gen >= 8) {
+            /* Until Gen8, integer multiplies read 32-bits from one source,
+             * and 16-bits from the other, and relying on the MACH instruction
+             * to generate the high bits of the result.
+             *
+             * On Gen8, the multiply instruction does a full 32x32-bit
+             * multiply, but in order to do a 64-bit multiply we can simulate
+             * the previous behavior and then use a MACH instruction.
+             *
+             * FINISHME: Don't use source modifiers on src1.
+             */
+            assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
+                   mul->src[1].type == BRW_REGISTER_TYPE_UD);
+            mul->src[1].type = BRW_REGISTER_TYPE_UW;
+            mul->src[1].stride *= 2;
+
+         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
+                    inst->group > 0) {
+            /* Among other things the quarter control bits influence which
+             * accumulator register is used by the hardware for instructions
+             * that access the accumulator implicitly (e.g. MACH).  A
+             * second-half instruction would normally map to acc1, which
+             * doesn't exist on Gen7 and up (the hardware does emulate it for
+             * floating-point instructions *only* by taking advantage of the
+             * extra precision of acc0 not normally used for floating point
+             * arithmetic).
+             *
+             * HSW and up are careful enough not to try to access an
+             * accumulator register that doesn't exist, but on earlier Gen7
+             * hardware we need to make sure that the quarter control bits are
+             * zero to avoid non-deterministic behaviour and emit an extra MOV
+             * to get the result masked correctly according to the current
+             * channel enables.
+             */
+            mach->group = 0;
+            mach->force_writemask_all = true;
+            mach->dst = ibld.vgrf(inst->dst.type);
+            ibld.MOV(inst->dst, mach->dst);
+         }
+      } else {
+         continue;
+      }
+
+      inst->remove(block);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Rewrite every unpredicated SEL (i.e. a min/max expressed through a
+ * conditional modifier) as an explicit CMP followed by a predicated SEL.
+ * Only meant for hardware older than Gen6.
+ *
+ * Returns true if any instruction was changed.
+ */
+bool
+fs_visitor::lower_minmax()
+{
+   assert(devinfo->gen < 6);
+
+   bool changed = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      const fs_builder at_inst(this, block, inst);
+
+      /* Anything that isn't an unpredicated SEL is left alone. */
+      if (inst->opcode != BRW_OPCODE_SEL ||
+          inst->predicate != BRW_PREDICATE_NONE)
+         continue;
+
+      /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
+       * the original SEL.L/GE instruction
+       */
+      at_inst.CMP(at_inst.null_reg_d(), inst->src[0], inst->src[1],
+                  inst->conditional_mod);
+
+      /* The comparison now lives in the CMP; the SEL just consumes the
+       * resulting flag.
+       */
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->conditional_mod = BRW_CONDITIONAL_NONE;
+
+      changed = true;
+   }
+
+   if (changed)
+      invalidate_live_intervals();
+
+   return changed;
+}
+
+/**
+ * Fill dst[0..components-1] with the per-component registers of \p color.
+ *
+ * When the program key requests fragment color clamping, the components are
+ * first copied with saturation into a temporary and \p dst points at that
+ * temporary instead.  Entries of \p dst at or beyond \p components are not
+ * touched.
+ */
+static void
+setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
+                    fs_reg *dst, fs_reg color, unsigned components)
+{
+   if (key->clamp_fragment_color) {
+      fs_reg clamped = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+      assert(color.type == BRW_REGISTER_TYPE_F);
+
+      for (unsigned c = 0; c < components; c++) {
+         fs_inst *mov = bld.MOV(offset(clamped, bld, c),
+                                offset(color, bld, c));
+         set_saturate(true, mov);
+      }
+
+      /* From here on the saturated copy stands in for the color. */
+      color = clamped;
+   }
+
+   for (unsigned c = 0; c < components; c++)
+      dst[c] = offset(color, bld, c);
+}
+
+/**
+ * Lower an FB_WRITE_LOGICAL instruction into FS_OPCODE_FB_WRITE with its
+ * payload assembled through LOAD_PAYLOAD.
+ *
+ * The payload is laid out in this order: optional two-register header,
+ * optional AA/stencil register, optional oMask, optional src0 alpha, color0,
+ * optional color1, optional source depth, optional destination depth and
+ * optional source stencil.  On Gen7+ the message is sent from a freshly
+ * allocated VGRF, on older hardware from the MRF starting at m1.
+ */
+static void
+lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
+                            const struct brw_wm_prog_data *prog_data,
+                            const brw_wm_prog_key *key,
+                            const fs_visitor::thread_payload &payload)
+{
+   assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
+   const gen_device_info *devinfo = bld.shader->devinfo;
+   const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
+   const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
+   const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
+   const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
+   const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
+   const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
+   fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
+   const unsigned components =
+      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
+
+   /* We can potentially have a message length of up to 15, so we have to set
+    * base_mrf to either 0 or 1 in order to fit in m0..m15.
+    */
+   fs_reg sources[15];
+   unsigned length = 0;
+
+   /* From the Sandy Bridge PRM, volume 4, page 198:
+    *
+    *     "Dispatched Pixel Enables. One bit per pixel indicating
+    *      which pixels were originally enabled when the thread was
+    *      dispatched. This field is only required for the end-of-
+    *      thread message and on all dual-source messages."
+    */
+   const bool headerless =
+      devinfo->gen >= 6 &&
+      (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
+      color1.file == BAD_FILE &&
+      key->nr_color_regions == 1;
+   const int header_size = headerless ? 0 : 2;
+
+   /* Reserve the (zero or two) header registers at the front. */
+   length += header_size;
+
+   if (payload.aa_dest_stencil_reg) {
+      fs_reg &slot = sources[length];
+      slot = fs_reg(VGRF, bld.shader->alloc.allocate(1));
+      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
+         .MOV(slot, fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
+      length++;
+   }
+
+   if (sample_mask.file != BAD_FILE) {
+      fs_reg &slot = sources[length];
+      slot = fs_reg(VGRF, bld.shader->alloc.allocate(1),
+                    BRW_REGISTER_TYPE_UD);
+
+      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
+       * relevant.  Since it's unsigned single words one vgrf is always
+       * 16-wide, but only the lower or higher 8 channels will be used by the
+       * hardware when doing a SIMD8 write depending on whether we have
+       * selected the subspans for the first or second half respectively.
+       */
+      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
+      sample_mask.type = BRW_REGISTER_TYPE_UW;
+      sample_mask.stride *= 2;
+
+      bld.exec_all().annotate("FB write oMask")
+         .MOV(horiz_offset(retype(slot, BRW_REGISTER_TYPE_UW), inst->group),
+              sample_mask);
+      length++;
+   }
+
+   /* Everything placed so far is treated as header by LOAD_PAYLOAD. */
+   const int payload_header_size = length;
+
+   if (src0_alpha.file != BAD_FILE) {
+      /* FIXME: This is being passed at the wrong location in the payload and
+       * doesn't work when gl_SampleMask and MRTs are used simultaneously.
+       * It's supposed to be immediately before oMask but there seems to be no
+       * reasonable way to pass them in the correct order because LOAD_PAYLOAD
+       * requires header sources to form a contiguous segment at the beginning
+       * of the message and src0_alpha has per-channel semantics.
+       */
+      setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
+      length++;
+   } else if (key->replicate_alpha && inst->target != 0) {
+      /* Handle the case when fragment shader doesn't write to draw buffer
+       * zero. No need to call setup_color_payload() for src0_alpha because
+       * alpha value will be undefined.
+       */
+      length++;
+   }
+
+   /* A color always occupies four payload slots, whatever the number of
+    * components actually provided.
+    */
+   setup_color_payload(bld, key, &sources[length], color0, components);
+   length += 4;
+
+   if (color1.file != BAD_FILE) {
+      setup_color_payload(bld, key, &sources[length], color1, components);
+      length += 4;
+   }
+
+   if (src_depth.file != BAD_FILE)
+      sources[length++] = src_depth;
+
+   if (dst_depth.file != BAD_FILE)
+      sources[length++] = dst_depth;
+
+   if (src_stencil.file != BAD_FILE) {
+      assert(devinfo->gen >= 9);
+      assert(bld.dispatch_width() != 16);
+
+      /* XXX: src_stencil is only available on gen9+. dst_depth is never
+       * available on gen9+. As such it's impossible to have both enabled at
+       * the same time and therefore length cannot overrun the array.
+       */
+      assert(length < 15);
+
+      fs_reg &slot = sources[length];
+      slot = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      bld.exec_all().annotate("FB write OS")
+         .MOV(retype(slot, BRW_REGISTER_TYPE_UB),
+              subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
+      length++;
+   }
+
+   fs_inst *load;
+   if (devinfo->gen < 7) {
+      /* Send from the MRF */
+      load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
+                              sources, length, payload_header_size);
+
+      /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
+       * will do this for us if we just give it a COMPR4 destination.
+       */
+      if (devinfo->gen < 6 && bld.dispatch_width() == 16)
+         load->dst.nr |= BRW_MRF_COMPR4;
+
+      inst->resize_sources(0);
+      inst->base_mrf = 1;
+   } else {
+      /* Send from the GRF.  The destination register number is only known
+       * once LOAD_PAYLOAD tells us how many registers it writes, so it is
+       * patched in afterwards.
+       */
+      fs_reg msg = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
+      load = bld.LOAD_PAYLOAD(msg, sources, length, payload_header_size);
+      msg.nr = bld.shader->alloc.allocate(regs_written(load));
+      load->dst = msg;
+
+      inst->src[0] = msg;
+      inst->resize_sources(1);
+   }
+
+   inst->opcode = FS_OPCODE_FB_WRITE;
+   inst->mlen = regs_written(load);
+   inst->header_size = header_size;
+}
+
+/**
+ * Lower an FB_READ_LOGICAL instruction into FS_OPCODE_FB_READ.  The message
+ * consists solely of a two-register header copied from g0 onwards with all
+ * channels enabled.
+ */
+static void
+lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
+{
+   const unsigned header_regs = 2;
+   const fs_builder unmasked = bld.exec_all();
+   const fs_reg header =
+      unmasked.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD, header_regs);
+
+   /* One SIMD16 MOV copies both header registers at once. */
+   unmasked.group(16, 0)
+      .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+   inst->resize_sources(1);
+   inst->src[0] = header;
+   inst->opcode = FS_OPCODE_FB_READ;
+   inst->mlen = header_regs;
+   inst->header_size = header_regs;
+}
+
+/**
+ * Lower a logical sampler instruction into the MRF-based message used by
+ * this (Gen4) code path.
+ *
+ * The message starts at m1 with a one-register header, followed by the
+ * coordinates (zero-padded to three components where the message requires
+ * it), the TXD gradients, the LOD/bias and the shadow comparator, in that
+ * order, each only when applicable to \p op.
+ *
+ * \param coord_components number of valid components in \p coordinate;
+ *                         must not exceed 3 when padding is required.
+ * \param grad_components  number of valid components in \p lod / \p lod2
+ *                         when \p op is TXD.
+ */
+static void
+lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
+                                const fs_reg &coordinate,
+                                const fs_reg &shadow_c,
+                                const fs_reg &lod, const fs_reg &lod2,
+                                const fs_reg &surface,
+                                const fs_reg &sampler,
+                                unsigned coord_components,
+                                unsigned grad_components)
+{
+   const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
+                         op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
+   fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
+   fs_reg msg_end = msg_begin;
+
+   /* g0 header. */
+   msg_end = offset(msg_end, bld.group(8, 0), 1);
+
+   for (unsigned i = 0; i < coord_components; i++)
+      bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
+              offset(coordinate, bld, i));
+
+   msg_end = offset(msg_end, bld, coord_components);
+
+   /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
+    * require all three components to be present and zero if they are unused.
+    */
+   if (coord_components > 0 &&
+       (has_lod || shadow_c.file != BAD_FILE ||
+        (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
+      /* msg_end already points just past the last valid coordinate, so the
+       * padding slots are at offsets 0 .. (3 - coord_components - 1) from
+       * it.  Indexing from coord_components here would skip the slots that
+       * actually need zeroing and write past them instead.
+       */
+      assert(coord_components <= 3);
+      for (unsigned i = 0; i < 3 - coord_components; i++)
+         bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));
+
+      msg_end = offset(msg_end, bld, 3 - coord_components);
+   }
+
+   if (op == SHADER_OPCODE_TXD) {
+      /* TXD unsupported in SIMD16 mode. */
+      assert(bld.dispatch_width() == 8);
+
+      /* the slots for u and v are always present, but r is optional */
+      if (coord_components < 2)
+         msg_end = offset(msg_end, bld, 2 - coord_components);
+
+      /*  P   = u, v, r
+       * dPdx = dudx, dvdx, drdx
+       * dPdy = dudy, dvdy, drdy
+       *
+       * 1-arg: Does not exist.
+       *
+       * 2-arg: dudx   dvdx   dudy   dvdy
+       *        dPdx.x dPdx.y dPdy.x dPdy.y
+       *        m4     m5     m6     m7
+       *
+       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
+       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
+       *        m5     m6     m7     m8     m9     m10
+       */
+      for (unsigned i = 0; i < grad_components; i++)
+         bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
+
+      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+
+      for (unsigned i = 0; i < grad_components; i++)
+         bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
+
+      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+   }
+
+   if (has_lod) {
+      /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
+       * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
+       */
+      assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
+             bld.dispatch_width() == 16);
+
+      /* TXF and TXS take an integer LOD, the others a float one. */
+      const brw_reg_type type =
+         (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
+          BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
+      bld.MOV(retype(msg_end, type), lod);
+      msg_end = offset(msg_end, bld, 1);
+   }
+
+   if (shadow_c.file != BAD_FILE) {
+      if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
+         /* There's no plain shadow compare message, so we use shadow
+          * compare with a bias of 0.0.
+          */
+         bld.MOV(msg_end, brw_imm_f(0.0f));
+         msg_end = offset(msg_end, bld, 1);
+      }
+
+      bld.MOV(msg_end, shadow_c);
+      msg_end = offset(msg_end, bld, 1);
+   }
+
+   inst->opcode = op;
+   inst->src[0] = reg_undef;
+   inst->src[1] = surface;
+   inst->src[2] = sampler;
+   inst->resize_sources(3);
+   inst->base_mrf = msg_begin.nr;
+   /* The message length is the distance between the first and the
+    * one-past-last MRF register touched above.
+    */
+   inst->mlen = msg_end.nr - msg_begin.nr;
+   inst->header_size = 1;
+}
+
+/**
+ * Lower a logical sampler instruction into the MRF-based message used by
+ * this (Gen5-6) code path.
+ *
+ * Coordinates always start at m2.  When the instruction carries texel
+ * offsets a one-register header is placed in front of them at m1.  The
+ * remaining parameters (shadow comparator, LOD/bias, gradients, sample
+ * index) follow at positions that depend on \p op.
+ */
+static void
+lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
+                                const fs_reg &coordinate,
+                                const fs_reg &shadow_c,
+                                const fs_reg &lod, const fs_reg &lod2,
+                                const fs_reg &sample_index,
+                                const fs_reg &surface,
+                                const fs_reg &sampler,
+                                unsigned coord_components,
+                                unsigned grad_components)
+{
+   const fs_reg msg_coords(MRF, 2, BRW_REGISTER_TYPE_F);
+
+   /* The offsets set up by the visitor are in the m1 header, so we can't
+    * go headerless when there are any.
+    */
+   const bool has_header = inst->offset != 0;
+   const unsigned header_size = has_header ? 1 : 0;
+   fs_reg message = msg_coords;
+   if (has_header)
+      message.nr--;
+
+   for (unsigned c = 0; c < coord_components; c++) {
+      const fs_reg dst = retype(offset(msg_coords, bld, c), coordinate.type);
+      bld.MOV(dst, offset(coordinate, bld, c));
+   }
+
+   fs_reg msg_end = offset(msg_coords, bld, coord_components);
+   fs_reg msg_lod = offset(msg_coords, bld, 4);
+
+   if (shadow_c.file != BAD_FILE) {
+      /* The comparator takes the slot the LOD would otherwise use and
+       * pushes the LOD back by one.
+       */
+      bld.MOV(msg_lod, shadow_c);
+      msg_lod = offset(msg_lod, bld, 1);
+      msg_end = msg_lod;
+   }
+
+   switch (op) {
+   case SHADER_OPCODE_TXL:
+   case FS_OPCODE_TXB:
+      bld.MOV(msg_lod, lod);
+      msg_end = offset(msg_lod, bld, 1);
+      break;
+
+   case SHADER_OPCODE_TXD:
+      /**
+       *  P   =  u,    v,    r
+       * dPdx = dudx, dvdx, drdx
+       * dPdy = dudy, dvdy, drdy
+       *
+       * Load up these values:
+       * - dudx   dudy   dvdx   dvdy   drdx   drdy
+       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
+       */
+      msg_end = msg_lod;
+      for (unsigned g = 0; g < grad_components; g++) {
+         bld.MOV(msg_end, offset(lod, bld, g));
+         msg_end = offset(msg_end, bld, 1);
+
+         bld.MOV(msg_end, offset(lod2, bld, g));
+         msg_end = offset(msg_end, bld, 1);
+      }
+      break;
+
+   case SHADER_OPCODE_TXS:
+      /* The LOD directly follows whatever has been written so far. */
+      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
+      bld.MOV(msg_lod, lod);
+      msg_end = offset(msg_lod, bld, 1);
+      break;
+
+   case SHADER_OPCODE_TXF:
+      msg_lod = offset(msg_coords, bld, 3);
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
+      msg_end = offset(msg_lod, bld, 1);
+      break;
+
+   case SHADER_OPCODE_TXF_CMS: {
+      msg_lod = offset(msg_coords, bld, 3);
+      /* lod */
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
+      /* sample index */
+      const fs_reg msg_sample = offset(msg_lod, bld, 1);
+      bld.MOV(retype(msg_sample, BRW_REGISTER_TYPE_UD), sample_index);
+      msg_end = offset(msg_lod, bld, 2);
+      break;
+   }
+
+   default:
+      break;
+   }
+
+   inst->opcode = op;
+   inst->src[0] = reg_undef;
+   inst->src[1] = surface;
+   inst->src[2] = sampler;
+   inst->resize_sources(3);
+   inst->base_mrf = message.nr;
+   inst->mlen = msg_end.nr - message.nr;
+   inst->header_size = header_size;
+
+   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
+   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
+}
+
+/**
+ * Return whether \p sampler may refer to a sampler index of 16 or above on
+ * hardware where that matters (Haswell and Gen8+).  A non-immediate sampler
+ * is conservatively treated as high.
+ */
+static bool
+is_high_sampler(const struct gen_device_info *devinfo, const fs_reg &sampler)
+{
+   const bool hw_applicable = devinfo->gen >= 8 || devinfo->is_haswell;
+
+   return hw_applicable && (sampler.file != IMM || sampler.ud >= 16);
+}
+
+/**
+ * Lower a logical sampler instruction into a send-from-GRF sampler message
+ * (the Gen7+ code path).
+ *
+ * The individual message parameters are first copied into per-parameter
+ * temporaries in the order the message for \p op expects, and are then
+ * gathered into one contiguous payload with LOAD_PAYLOAD.  \p op may be
+ * replaced by its _LZ variant on Gen9+ when the LOD is known to be zero.
+ */
+static void
+lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
+                                const fs_reg &coordinate,
+                                const fs_reg &shadow_c,
+                                fs_reg lod, const fs_reg &lod2,
+                                const fs_reg &sample_index,
+                                const fs_reg &mcs,
+                                const fs_reg &surface,
+                                const fs_reg &sampler,
+                                const fs_reg &tg4_offset,
+                                unsigned coord_components,
+                                unsigned grad_components)
+{
+   const gen_device_info *devinfo = bld.shader->devinfo;
+   const unsigned reg_width = bld.dispatch_width() / 8;
+   unsigned header_size = 0;
+   unsigned length = 0;
+
+   /* Pre-allocate a temporary for every possible payload slot. */
+   fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
+   for (unsigned s = 0; s < ARRAY_SIZE(sources); s++)
+      sources[s] = bld.vgrf(BRW_REGISTER_TYPE_F);
+
+   const bool needs_header =
+      op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
+      inst->offset != 0 || inst->eot ||
+      op == SHADER_OPCODE_SAMPLEINFO ||
+      is_high_sampler(devinfo, sampler);
+
+   if (needs_header) {
+      /* For general texture offsets (no txf workaround), we need a header to
+       * put them in.  Note that we're only reserving space for it in the
+       * message payload as it will be initialized implicitly by the
+       * generator.
+       *
+       * TG4 needs to place its channel select in the header, for interaction
+       * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
+       * larger sampler numbers we need to offset the Sampler State Pointer in
+       * the header.
+       */
+      header_size = 1;
+      sources[0] = fs_reg();
+      length++;
+
+      /* If we're requesting fewer than four channels worth of response,
+       * and we have an explicit header, we need to set up the sampler
+       * writemask.  It's reversed from normal: 1 means "don't write".
+       */
+      if (!inst->eot && regs_written(inst) != 4 * reg_width) {
+         assert(regs_written(inst) % reg_width == 0);
+         const unsigned mask =
+            ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
+         inst->offset |= mask << 12;
+      }
+   }
+
+   if (shadow_c.file != BAD_FILE) {
+      bld.MOV(sources[length], shadow_c);
+      length++;
+   }
+
+   /* Set by the cases below that place the coordinate themselves. */
+   bool coords_emitted = false;
+
+   /* Set up the LOD info */
+   switch (op) {
+   case FS_OPCODE_TXB:
+   case SHADER_OPCODE_TXL:
+      if (devinfo->gen >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
+         /* The LZ message takes no LOD parameter at all. */
+         op = SHADER_OPCODE_TXL_LZ;
+         break;
+      }
+      bld.MOV(sources[length], lod);
+      length++;
+      break;
+
+   case SHADER_OPCODE_TXD:
+      /* TXD should have been lowered in SIMD16 mode. */
+      assert(bld.dispatch_width() == 8);
+
+      /* Load dPdx and the coordinate together:
+       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
+       */
+      for (unsigned c = 0; c < coord_components; c++) {
+         bld.MOV(sources[length++], offset(coordinate, bld, c));
+
+         /* For cube map array, the coordinate is (u,v,r,ai) but there are
+          * only derivatives for (u, v, r).
+          */
+         if (c >= grad_components)
+            continue;
+
+         bld.MOV(sources[length++], offset(lod, bld, c));
+         bld.MOV(sources[length++], offset(lod2, bld, c));
+      }
+
+      coords_emitted = true;
+      break;
+
+   case SHADER_OPCODE_TXS:
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
+      length++;
+      break;
+
+   case SHADER_OPCODE_TXF: {
+      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
+       * On Gen9 they are u, v, lod, r
+       */
+      const bool gen9_layout = devinfo->gen >= 9;
+
+      bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate);
+
+      if (gen9_layout) {
+         if (coord_components >= 2) {
+            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),
+                    offset(coordinate, bld, 1));
+         } else {
+            sources[length] = brw_imm_d(0);
+         }
+         length++;
+      }
+
+      if (gen9_layout && lod.is_zero()) {
+         op = SHADER_OPCODE_TXF_LZ;
+      } else {
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
+         length++;
+      }
+
+      /* Remaining coordinate components not yet placed above. */
+      for (unsigned c = gen9_layout ? 2 : 1; c < coord_components; c++)
+         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
+                 offset(coordinate, bld, c));
+
+      coords_emitted = true;
+      break;
+   }
+
+   case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_CMS_W:
+   case SHADER_OPCODE_TXF_UMS:
+   case SHADER_OPCODE_TXF_MCS: {
+      const bool is_cms = op == SHADER_OPCODE_TXF_CMS ||
+                          op == SHADER_OPCODE_TXF_CMS_W;
+
+      if (op == SHADER_OPCODE_TXF_UMS || is_cms) {
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
+         length++;
+      }
+
+      if (is_cms) {
+         /* Data from the multisample control surface. */
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
+         length++;
+
+         /* On Gen9+ we'll use ld2dms_w instead which has two registers for
+          * the MCS data.
+          */
+         if (op == SHADER_OPCODE_TXF_CMS_W) {
+            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
+                    mcs.file == IMM ?
+                    mcs :
+                    offset(mcs, bld, 1));
+            length++;
+         }
+      }
+
+      /* There is no offsetting for this message; just copy in the integer
+       * texture coordinates.
+       */
+      for (unsigned c = 0; c < coord_components; c++)
+         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
+                 offset(coordinate, bld, c));
+
+      coords_emitted = true;
+      break;
+   }
+
+   case SHADER_OPCODE_TG4_OFFSET:
+      /* More crazy intermixing */
+      for (unsigned c = 0; c < 2; c++) /* u, v */
+         bld.MOV(sources[length++], offset(coordinate, bld, c));
+
+      for (unsigned c = 0; c < 2; c++) /* offu, offv */
+         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
+                 offset(tg4_offset, bld, c));
+
+      if (coord_components == 3) /* r if present */
+         bld.MOV(sources[length++], offset(coordinate, bld, 2));
+
+      coords_emitted = true;
+      break;
+
+   default:
+      break;
+   }
+
+   /* Set up the coordinate (except for cases where it was done above) */
+   if (!coords_emitted) {
+      for (unsigned c = 0; c < coord_components; c++)
+         bld.MOV(sources[length++], offset(coordinate, bld, c));
+   }
+
+   /* Every parameter takes reg_width registers; in the two-register-wide
+    * case the header_size term is subtracted from that total.
+    */
+   const int mlen = reg_width == 2 ? length * reg_width - header_size
+                                   : length * reg_width;
+
+   const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
+                                     BRW_REGISTER_TYPE_F);
+   bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
+
+   /* Generate the SEND. */
+   inst->opcode = op;
+   inst->src[0] = src_payload;
+   inst->src[1] = surface;
+   inst->src[2] = sampler;
+   inst->resize_sources(3);
+   inst->mlen = mlen;
+   inst->header_size = header_size;
+
+   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
+   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
+}
+
+/**
+ * Unpack the sources of a logical sampler instruction and hand them to the
+ * lowering routine matching the hardware generation: the gen7 variant for
+ * Gen7+, the gen5 variant for Gen5-6 and the gen4 variant otherwise.
+ */
+static void
+lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
+{
+   const gen_device_info *devinfo = bld.shader->devinfo;
+   const fs_reg *const srcs = inst->src;
+
+   const fs_reg &coordinate = srcs[TEX_LOGICAL_SRC_COORDINATE];
+   const fs_reg &shadow_c = srcs[TEX_LOGICAL_SRC_SHADOW_C];
+   const fs_reg &lod = srcs[TEX_LOGICAL_SRC_LOD];
+   const fs_reg &lod2 = srcs[TEX_LOGICAL_SRC_LOD2];
+   const fs_reg &sample_index = srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX];
+   const fs_reg &mcs = srcs[TEX_LOGICAL_SRC_MCS];
+   const fs_reg &surface = srcs[TEX_LOGICAL_SRC_SURFACE];
+   const fs_reg &sampler = srcs[TEX_LOGICAL_SRC_SAMPLER];
+   const fs_reg &tg4_offset = srcs[TEX_LOGICAL_SRC_TG4_OFFSET];
+
+   /* The component counts are carried as immediates. */
+   assert(srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
+   assert(srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
+   const unsigned coord_components =
+      srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
+   const unsigned grad_components =
+      srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
+
+   if (devinfo->gen < 5) {
+      lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
+                                      shadow_c, lod, lod2,
+                                      surface, sampler,
+                                      coord_components, grad_components);
+   } else if (devinfo->gen < 7) {
+      lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
+                                      shadow_c, lod, lod2, sample_index,
+                                      surface, sampler,
+                                      coord_components, grad_components);
+   } else {
+      lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
+                                      shadow_c, lod, lod2, sample_index,
+                                      mcs, surface, sampler, tg4_offset,
+                                      coord_components, grad_components);
+   }
+}
+
+/**
+ * Build the header used by some typed and untyped surface messages: a
+ * single SIMD8 UD register cleared to zero with \p sample_mask stored in
+ * its component 7.  Returns the register holding the header.
+ */
+static fs_reg
+emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
+{
+   /* The header is written regardless of the current channel enables. */
+   const fs_builder hbld = bld.exec_all().group(8, 0);
+   const fs_reg header = hbld.vgrf(BRW_REGISTER_TYPE_UD);
+
+   hbld.MOV(header, brw_imm_d(0));
+   hbld.MOV(component(header, 7), sample_mask);
+
+   return header;
+}
+
+/**
+ * Lower a logical surface instruction to opcode \p op.
+ *
+ * The payload is [header], address components, source components, gathered
+ * with LOAD_PAYLOAD.  A header is emitted only when \p sample_mask is a
+ * valid register (i.e. not BAD_FILE).  After lowering the instruction reads
+ * the payload, the surface and the message-specific argument.
+ */
+static void
+lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
+                           const fs_reg &sample_mask)
+{
+   /* Get the logical send arguments. */
+   const fs_reg &addr = inst->src[0];
+   const fs_reg &src = inst->src[1];
+   const fs_reg &surface = inst->src[2];
+   const UNUSED fs_reg &dims = inst->src[3];
+   const fs_reg &arg = inst->src[4];
+
+   /* Calculate the total number of components of the payload. */
+   const unsigned addr_sz = inst->components_read(0);
+   const unsigned src_sz = inst->components_read(1);
+   const unsigned header_sz = sample_mask.file != BAD_FILE ? 1 : 0;
+   const unsigned total_sz = header_sz + addr_sz + src_sz;
+
+   /* Allocate space for the payload. */
+   fs_reg *const parts = new fs_reg[total_sz];
+   const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, total_sz);
+   fs_reg *next = parts;
+
+   /* Construct the payload. */
+   if (header_sz != 0)
+      *next++ = emit_surface_header(bld, sample_mask);
+
+   for (unsigned c = 0; c < addr_sz; c++)
+      *next++ = offset(addr, bld, c);
+
+   for (unsigned c = 0; c < src_sz; c++)
+      *next++ = offset(src, bld, c);
+
+   bld.LOAD_PAYLOAD(payload, parts, total_sz, header_sz);
+
+   /* Update the original instruction. */
+   inst->opcode = op;
+   /* The header takes one register, everything else scales with the
+    * execution size.
+    */
+   inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
+   inst->header_size = header_sz;
+
+   inst->src[0] = payload;
+   inst->src[1] = surface;
+   inst->src[2] = arg;
+   inst->resize_sources(3);
+
+   delete[] parts;
+}
+
+/**
+ * Lower a VARYING_PULL_CONSTANT_LOAD_LOGICAL instruction to the GEN7
+ * (send-from-GRF) or GEN4 (MRF-based) variant depending on the hardware
+ * generation.
+ */
+static void
+lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
+{
+   const gen_device_info *devinfo = bld.shader->devinfo;
+
+   if (devinfo->gen < 7) {
+      const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->gen),
+                           BRW_REGISTER_TYPE_UD);
+
+      /* The offset goes into the register right after the header. */
+      bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]);
+
+      inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4;
+      inst->resize_sources(1);
+      inst->base_mrf = payload.nr;
+      inst->header_size = 1;
+      inst->mlen = 1 + inst->exec_size / 8;
+      return;
+   }
+
+   /* We are switching the instruction from an ALU-like instruction to a
+    * send-from-grf instruction.  Since sends can't handle strides or
+    * source modifiers, we have to make a copy of the offset source.
+    */
+   const fs_reg offset_copy = bld.vgrf(BRW_REGISTER_TYPE_UD);
+   bld.MOV(offset_copy, inst->src[1]);
+   inst->src[1] = offset_copy;
+
+   inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
+}
+
+/**
+ * Turn an expression-like math instruction into its send-like form for
+ * pre-Gen6 hardware: the message starts at m2 and its length covers all
+ * sources.  For two-source operations the second message operand is moved
+ * into the following MRF register and dropped from the source list.
+ */
+static void
+lower_math_logical_send(const fs_builder &bld, fs_inst *inst)
+{
+   assert(bld.shader->devinfo->gen < 6);
+
+   inst->base_mrf = 2;
+   /* Computed before the source list is possibly shrunk below. */
+   inst->mlen = inst->sources * inst->exec_size / 8;
+
+   if (inst->sources <= 1)
+      return;
+
+   /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
+    * "Message Payload":
+    *
+    * "Operand0[7].  For the INT DIV functions, this operand is the
+    *  denominator."
+    *  ...
+    * "Operand1[7].  For the INT DIV functions, this operand is the
+    *  numerator."
+    */
+   const bool swap_operands = inst->opcode != SHADER_OPCODE_POW;
+   const fs_reg operand0 = inst->src[swap_operands ? 1 : 0];
+   const fs_reg operand1 = inst->src[swap_operands ? 0 : 1];
+
+   inst->resize_sources(1);
+   inst->src[0] = operand0;
+
+   assert(inst->exec_size == 8);
+   const fs_reg operand1_mrf(MRF, inst->base_mrf + 1, operand1.type);
+   bld.MOV(operand1_mrf, operand1);
+}
+
+/**
+ * Walk the whole program and replace every logical send-like instruction
+ * (framebuffer, sampler, surface, pull-constant and, before Gen6, math) by
+ * its hardware-specific counterpart.
+ *
+ * Returns true if any instruction was lowered.
+ */
+bool
+fs_visitor::lower_logical_sends()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      const fs_builder ibld(this, block, inst);
+
+      /* Cleared by the cases that leave the instruction untouched. */
+      bool lowered = true;
+
+      switch (inst->opcode) {
+      /* Framebuffer access. */
+      case FS_OPCODE_FB_WRITE_LOGICAL:
+         assert(stage == MESA_SHADER_FRAGMENT);
+         lower_fb_write_logical_send(ibld, inst,
+                                     brw_wm_prog_data(prog_data),
+                                     (const brw_wm_prog_key *)key,
+                                     payload);
+         break;
+      case FS_OPCODE_FB_READ_LOGICAL:
+         lower_fb_read_logical_send(ibld, inst);
+         break;
+
+      /* Sampler messages. */
+      case SHADER_OPCODE_TEX_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
+         break;
+      case SHADER_OPCODE_TXD_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
+         break;
+      case SHADER_OPCODE_TXF_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
+         break;
+      case SHADER_OPCODE_TXL_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
+         break;
+      case SHADER_OPCODE_TXS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
+         break;
+      case FS_OPCODE_TXB_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
+         break;
+      case SHADER_OPCODE_TXF_CMS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
+         break;
+      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
+         break;
+      case SHADER_OPCODE_TXF_UMS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
+         break;
+      case SHADER_OPCODE_TXF_MCS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
+         break;
+      case SHADER_OPCODE_LOD_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
+         break;
+      case SHADER_OPCODE_TG4_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
+         break;
+      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
+         break;
+      case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
+         break;
+
+      /* Untyped surface messages; the read is lowered without a header. */
+      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_SURFACE_READ,
+                                    fs_reg());
+         break;
+      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
+                                    ibld.sample_mask_reg());
+         break;
+      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_ATOMIC,
+                                    ibld.sample_mask_reg());
+         break;
+
+      /* Typed surface messages; the read uses a constant 0xffff mask. */
+      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_SURFACE_READ,
+                                    brw_imm_d(0xffff));
+         break;
+      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_SURFACE_WRITE,
+                                    ibld.sample_mask_reg());
+         break;
+      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_ATOMIC,
+                                    ibld.sample_mask_reg());
+         break;
+
+      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
+         lower_varying_pull_constant_logical_send(ibld, inst);
+         break;
+
+      case SHADER_OPCODE_RCP:
+      case SHADER_OPCODE_RSQ:
+      case SHADER_OPCODE_SQRT:
+      case SHADER_OPCODE_EXP2:
+      case SHADER_OPCODE_LOG2:
+      case SHADER_OPCODE_SIN:
+      case SHADER_OPCODE_COS:
+      case SHADER_OPCODE_POW:
+      case SHADER_OPCODE_INT_QUOTIENT:
+      case SHADER_OPCODE_INT_REMAINDER:
+         /* The math opcodes are overloaded for the send-like and
+          * expression-like instructions which seems kind of icky.  Gen6+ has
+          * a native (but rather quirky) MATH instruction so we don't need to
+          * do anything here.  On Gen4-5 we'll have to lower the Gen6-like
+          * logical instructions (which we can easily recognize because they
+          * have mlen = 0) into send-like virtual instructions.
+          */
+         if (devinfo->gen < 6 && inst->mlen == 0)
+            lower_math_logical_send(ibld, inst);
+         else
+            lowered = false;
+         break;
+
+      default:
+         lowered = false;
+         break;
+      }
+
+      if (lowered)
+         progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Get the closest allowed SIMD width for instruction \p inst accounting for
+ * some common regioning and execution control restrictions that apply to FPU
+ * instructions. These restrictions don't necessarily have any relevance to
+ * instructions not executed by the FPU pipeline like extended math, control
+ * flow or send message instructions.
+ *
+ * For virtual opcodes it's really up to the instruction -- In some cases
+ * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
+ * instructions) it may simplify virtual instruction lowering if we can
+ * enforce FPU-like regioning restrictions already on the virtual instruction,
+ * in other cases (e.g. virtual send-like instructions) this may be
+ * excessively restrictive.
+ */
+static unsigned
+get_fpu_lowered_simd_width(const struct gen_device_info *devinfo,
+ const fs_inst *inst)
+{
+ /* Maximum execution size representable in the instruction controls. */
+ unsigned max_width = MIN2(32, inst->exec_size);
+
+ /* According to the PRMs:
+ * "A. In Direct Addressing mode, a source cannot span more than 2
+ * adjacent GRF registers.
+ * B. A destination cannot span more than 2 adjacent GRF registers."
+ *
+ * Look for the source or destination with the largest register region
+ * which is the one that is going to limit the overall execution size of
+ * the instruction due to this rule.
+ */
+ unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
+
+ for (unsigned i = 0; i < inst->sources; i++)
+ reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
+
+ /* Calculate the maximum execution size of the instruction based on the
+ * factor by which it goes over the hardware limit of 2 GRFs.
+ */
+ if (reg_count > 2)
+ max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, 2));
+
+ /* According to the IVB PRMs:
+ * "When destination spans two registers, the source MUST span two
+ * registers. The exception to the above rule:
+ *
+ * - When source is scalar, the source registers are not incremented.
+ * - When source is packed integer Word and destination is packed
+ * integer DWord, the source register is not incremented but the
+ * source sub register is incremented."
+ *
+ * The hardware specs from Gen4 to Gen7.5 mention similar regioning
+ * restrictions. The code below intentionally doesn't check whether the
+ * destination type is integer because empirically the hardware doesn't
+ * seem to care what the actual type is as long as it's dword-aligned.
+ */
+ if (devinfo->gen < 8) {
+ for (unsigned i = 0; i < inst->sources; i++) {
+ if (inst->size_written > REG_SIZE &&
+ inst->size_read(i) != 0 && inst->size_read(i) <= REG_SIZE &&
+ !is_uniform(inst->src[i]) &&
+ !(type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
+ type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1)) {
+ const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
+ max_width = MIN2(max_width, inst->exec_size / reg_count);
+ }
+ }
+ }
+
+ /* From the IVB PRMs:
+ * "When an instruction is SIMD32, the low 16 bits of the execution mask
+ * are applied for both halves of the SIMD32 instruction. If different
+ * execution mask channels are required, split the instruction into two
+ * SIMD16 instructions."
+ *
+ * There is similar text in the HSW PRMs. Gen4-6 don't even implement
+ * 32-wide control flow support in hardware and will behave similarly.
+ */
+ if (devinfo->gen < 8 && !inst->force_writemask_all)
+ max_width = MIN2(max_width, 16);
+
+ /* From the IVB PRMs (applies to HSW too):
+ * "Instructions with condition modifiers must not use SIMD32."
+ *
+ * From the BDW PRMs (applies to later hardware too):
+ * "Ternary instruction with condition modifiers must not use SIMD32."
+ */
+ if (inst->conditional_mod && (devinfo->gen < 8 || inst->is_3src(devinfo)))
+ max_width = MIN2(max_width, 16);
+
+ /* From the IVB PRMs (applies to other devices that don't have the
+ * gen_device_info::supports_simd16_3src flag set):
+ * "In Align16 access mode, SIMD16 is not allowed for DW operations and
+ * SIMD8 is not allowed for DF operations."
+ */
+ if (inst->is_3src(devinfo) && !devinfo->supports_simd16_3src)
+ max_width = MIN2(max_width, inst->exec_size / reg_count);
+
+ /* Pre-Gen8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
+ * the 8-bit quarter of the execution mask signals specified in the
+ * instruction control fields) for the second compressed half of any
+ * single-precision instruction (for double-precision instructions
+ * it's hardwired to use NibCtrl+1, at least on HSW), which means that
+ * the EU will apply the wrong execution controls for the second
+ * sequential GRF write if the number of channels per GRF is not exactly
+ * eight in single-precision mode (or four in double-float mode).
+ *
+ * In this situation we calculate the maximum size of the split
+ * instructions so they only ever write to a single register.
+ */
+ if (devinfo->gen < 8 && inst->size_written > REG_SIZE &&
+ !inst->force_writemask_all) {
+ const unsigned channels_per_grf = inst->exec_size /
+ DIV_ROUND_UP(inst->size_written, REG_SIZE);
+ unsigned exec_type_size = 0;
+ for (int i = 0; i < inst->sources; i++) {
+ if (inst->src[i].file != BAD_FILE)
+ exec_type_size = MAX2(exec_type_size, type_sz(inst->src[i].type));
+ }
+ assert(exec_type_size);
+
+ /* The hardware shifts exactly 8 channels per compressed half of the
+ * instruction in single-precision mode and exactly 4 in double-precision.
+ */
+ if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
+ max_width = MIN2(max_width, channels_per_grf);
+ }
+
+ /* Only power-of-two execution sizes are representable in the instruction
+ * control fields.
+ */
+ return 1 << _mesa_logbase2(max_width);
+}
+
+/**
+ * Get the maximum allowed SIMD width for instruction \p inst accounting for
+ * various payload size restrictions that apply to sampler message
+ * instructions.
+ *
+ * This is only intended to provide a maximum theoretical bound for the
+ * execution size of the message based on the number of argument components
+ * alone, which in most cases will determine whether the SIMD8 or SIMD16
+ * variant of the message can be used, though some messages may have
+ * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
+ * the message length to determine the exact SIMD width and argument count,
+ * which makes a number of sampler message combinations impossible to
+ * represent).
+ *
+ * \param devinfo  Device description; only the hardware generation is read.
+ * \param inst     Logical sampler instruction whose TEX_LOGICAL_SRC_* sources
+ *                 are inspected.
+ * \return The smaller of the execution size of \p inst and the widest
+ *         message variant (8 or 16) that its payload component count allows.
+ */
+static unsigned
+get_sampler_lowered_simd_width(const struct gen_device_info *devinfo,
+                               const fs_inst *inst)
+{
+   /* Calculate the number of coordinate components that have to be present
+    * assuming that additional arguments follow the texel coordinates in the
+    * message payload.  On IVB+ there is no need for padding, on ILK-SNB we
+    * need to pad to four or three components depending on the message,
+    * pre-ILK we need to pad to at most three components.
+    */
+   const unsigned req_coord_components =
+      (devinfo->gen >= 7 ||
+       !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
+      (devinfo->gen >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
+                            inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
+      3;
+
+   /* On Gen9+ the LOD argument is for free if we're able to use the LZ
+    * variant of the TXL or TXF message.
+    *
+    * This helper is only invoked by get_lowered_simd_width() for the
+    * *_LOGICAL sampler opcodes, so the comparison has to be made against
+    * the logical opcodes.  Comparing against the non-logical
+    * SHADER_OPCODE_TXL/SHADER_OPCODE_TXF could never match here and made
+    * the LOD count against the payload size even when it was zero.
+    */
+   const bool implicit_lod = devinfo->gen >= 9 &&
+                             (inst->opcode == SHADER_OPCODE_TXL_LOGICAL ||
+                              inst->opcode == SHADER_OPCODE_TXF_LOGICAL) &&
+                             inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
+
+   /* Calculate the total number of argument components that need to be passed
+    * to the sampler unit.  The TG4 offset is only part of the payload for
+    * the TG4_OFFSET message.
+    */
+   const unsigned num_payload_components =
+      MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
+           req_coord_components) +
+      inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
+      (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
+      inst->components_read(TEX_LOGICAL_SRC_LOD2) +
+      inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
+      (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
+       inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
+      inst->components_read(TEX_LOGICAL_SRC_MCS);
+
+   /* SIMD16 messages with more than five arguments exceed the maximum message
+    * size supported by the sampler, regardless of whether a header is
+    * provided or not.
+    */
+   return MIN2(inst->exec_size,
+               num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
+}
+
+/**
+ * Get the closest native SIMD width supported by the hardware for instruction
+ * \p inst. The instruction will be left untouched by
+ * fs_visitor::lower_simd_width() if the returned value is equal to the
+ * original execution size.
+ *
+ * The result is selected by a switch on the opcode: ALU-style opcodes defer
+ * to get_fpu_lowered_simd_width(), the logical sampler opcodes defer to
+ * get_sampler_lowered_simd_width() (except where a fixed width is forced on
+ * Gen4), and most remaining cases clamp the execution size to a per-opcode
+ * maximum. Opcodes that are not listed return the execution size unchanged.
+ * Note that a few cases return a fixed width regardless of the original
+ * execution size, so the result may be larger than inst->exec_size.
+ *
+ * \param devinfo Device description used to pick generation-specific limits.
+ * \param inst Instruction to inspect; it is not modified.
+ * \return The execution size the instruction should be lowered to.
+ */
+static unsigned
+get_lowered_simd_width(const struct gen_device_info *devinfo,
+ const fs_inst *inst)
+{
+ switch (inst->opcode) {
+ case BRW_OPCODE_MOV:
+ case BRW_OPCODE_SEL:
+ case BRW_OPCODE_NOT:
+ case BRW_OPCODE_AND:
+ case BRW_OPCODE_OR:
+ case BRW_OPCODE_XOR:
+ case BRW_OPCODE_SHR:
+ case BRW_OPCODE_SHL:
+ case BRW_OPCODE_ASR:
+ case BRW_OPCODE_CMPN:
+ case BRW_OPCODE_CSEL:
+ case BRW_OPCODE_F32TO16:
+ case BRW_OPCODE_F16TO32:
+ case BRW_OPCODE_BFREV:
+ case BRW_OPCODE_BFE:
+ case BRW_OPCODE_ADD:
+ case BRW_OPCODE_MUL:
+ case BRW_OPCODE_AVG:
+ case BRW_OPCODE_FRC:
+ case BRW_OPCODE_RNDU:
+ case BRW_OPCODE_RNDD:
+ case BRW_OPCODE_RNDE:
+ case BRW_OPCODE_RNDZ:
+ case BRW_OPCODE_LZD:
+ case BRW_OPCODE_FBH:
+ case BRW_OPCODE_FBL:
+ case BRW_OPCODE_CBIT:
+ case BRW_OPCODE_SAD2:
+ case BRW_OPCODE_MAD:
+ case BRW_OPCODE_LRP:
+ case FS_OPCODE_PACK:
+ return get_fpu_lowered_simd_width(devinfo, inst);
+
+ case BRW_OPCODE_CMP: {
+ /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
+ * when the destination is a GRF the dependency-clear bit on the flag
+ * register is cleared early.
+ *
+ * Suggested workarounds are to disable coissuing CMP instructions
+ * or to split CMP(16) instructions into two CMP(8) instructions.
+ *
+ * We choose to split into CMP(8) instructions since disabling
+ * coissuing would affect CMP instructions not otherwise affected by
+ * the errata.
+ *
+ * When the workaround doesn't apply max_width is initialized from ~0,
+ * which converts to the largest unsigned value, so the MIN2() below
+ * simply yields the FPU limit.
+ */
+ const unsigned max_width = (devinfo->gen == 7 && !devinfo->is_haswell &&
+ !inst->dst.is_null() ? 8 : ~0);
+ return MIN2(max_width, get_fpu_lowered_simd_width(devinfo, inst));
+ }
+ case BRW_OPCODE_BFI1:
+ case BRW_OPCODE_BFI2:
+ /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
+ * should
+ * "Force BFI instructions to be executed always in SIMD8."
+ *
+ * On anything other than Haswell only the FPU limit applies.
+ */
+ return MIN2(devinfo->is_haswell ? 8 : ~0u,
+ get_fpu_lowered_simd_width(devinfo, inst));
+
+ case BRW_OPCODE_IF:
+ assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
+ return inst->exec_size;
+
+ case SHADER_OPCODE_RCP:
+ case SHADER_OPCODE_RSQ:
+ case SHADER_OPCODE_SQRT:
+ case SHADER_OPCODE_EXP2:
+ case SHADER_OPCODE_LOG2:
+ case SHADER_OPCODE_SIN:
+ case SHADER_OPCODE_COS:
+ /* Unary extended math instructions are limited to SIMD8 on Gen4 and
+ * Gen6. Gen5, G4x and Gen7+ allow up to SIMD16.
+ */
+ return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
+ devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) :
+ MIN2(8, inst->exec_size));
+
+ case SHADER_OPCODE_POW:
+ /* SIMD16 is only allowed on Gen7+. */
+ return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
+ MIN2(8, inst->exec_size));
+
+ case SHADER_OPCODE_INT_QUOTIENT:
+ case SHADER_OPCODE_INT_REMAINDER:
+ /* Integer division is limited to SIMD8 on all generations. */
+ return MIN2(8, inst->exec_size);
+
+ case FS_OPCODE_LINTERP:
+ case FS_OPCODE_GET_BUFFER_SIZE:
+ case FS_OPCODE_DDX_COARSE:
+ case FS_OPCODE_DDX_FINE:
+ case FS_OPCODE_DDY_COARSE:
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+ case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+ case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
+ case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
+ case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
+ case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
+ case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+ return MIN2(16, inst->exec_size);
+
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
+ /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
+ * message used to implement varying pull constant loads, so expand it
+ * to SIMD16. An alternative with longer message payload length but
+ * shorter return payload would be to use the SIMD8 sampler message that
+ * takes (header, u, v, r) as parameters instead of (header, u).
+ */
+ return (devinfo->gen == 4 ? 16 : MIN2(16, inst->exec_size));
+
+ case FS_OPCODE_DDY_FINE:
+ /* The implementation of this virtual opcode may require emitting
+ * compressed Align16 instructions, which are severely limited on some
+ * generations.
+ *
+ * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
+ * Region Restrictions):
+ *
+ * "In Align16 access mode, SIMD16 is not allowed for DW operations
+ * and SIMD8 is not allowed for DF operations."
+ *
+ * In this context, "DW operations" means "operations acting on 32-bit
+ * values", so it includes operations on floats.
+ *
+ * Gen4 has a similar restriction. From the i965 PRM, section 11.5.3
+ * (Instruction Compression -> Rules and Restrictions):
+ *
+ * "A compressed instruction must be in Align1 access mode. Align16
+ * mode instructions cannot be compressed."
+ *
+ * Similar text exists in the g45 PRM.
+ *
+ * Empirically, compressed align16 instructions using odd register
+ * numbers don't appear to work on Sandybridge either.
+ */
+ return (devinfo->gen == 4 || devinfo->gen == 6 ||
+ (devinfo->gen == 7 && !devinfo->is_haswell) ?
+ MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
+
+ case SHADER_OPCODE_MULH:
+ /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
+ * is 8-wide on Gen7+.
+ */
+ return (devinfo->gen >= 7 ? 8 :
+ get_fpu_lowered_simd_width(devinfo, inst));
+
+ case FS_OPCODE_FB_WRITE_LOGICAL:
+ /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
+ * here.
+ */
+ assert(devinfo->gen != 6 ||
+ inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
+ inst->exec_size == 8);
+ /* Dual-source FB writes are unsupported in SIMD16 mode. */
+ return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
+ 8 : MIN2(16, inst->exec_size));
+
+ case FS_OPCODE_FB_READ_LOGICAL:
+ return MIN2(16, inst->exec_size);
+
+ case SHADER_OPCODE_TEX_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_LOGICAL:
+ case SHADER_OPCODE_TXF_UMS_LOGICAL:
+ case SHADER_OPCODE_TXF_MCS_LOGICAL:
+ case SHADER_OPCODE_LOD_LOGICAL:
+ case SHADER_OPCODE_TG4_LOGICAL:
+ case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+ case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+ return get_sampler_lowered_simd_width(devinfo, inst);
+
+ case SHADER_OPCODE_TXD_LOGICAL:
+ /* TXD is unsupported in SIMD16 mode. */
+ return 8;
+
+ case SHADER_OPCODE_TXL_LOGICAL:
+ case FS_OPCODE_TXB_LOGICAL:
+ /* Only one execution size is representable pre-ILK depending on whether
+ * the shadow reference argument is present.
+ */
+ if (devinfo->gen == 4)
+ return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
+ else
+ return get_sampler_lowered_simd_width(devinfo, inst);
+
+ case SHADER_OPCODE_TXF_LOGICAL:
+ case SHADER_OPCODE_TXS_LOGICAL:
+ /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
+ * messages. Use SIMD16 instead.
+ */
+ if (devinfo->gen == 4)
+ return 16;
+ else
+ return get_sampler_lowered_simd_width(devinfo, inst);
+
+ case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+ return 8;
+
+ case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+ return MIN2(16, inst->exec_size);
+
+ case SHADER_OPCODE_URB_READ_SIMD8:
+ case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
+ case SHADER_OPCODE_URB_WRITE_SIMD8:
+ case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+ case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+ case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+ return MIN2(8, inst->exec_size);
+
+ case SHADER_OPCODE_MOV_INDIRECT:
+ /* Prior to Broadwell, we only have 8 address subregisters.
+ *
+ * The second MIN3() operand is the number of destination channels
+ * (each taking stride times the type size in bytes) that fit in
+ * 2 * REG_SIZE bytes.
+ */
+ return MIN3(devinfo->gen >= 8 ? 16 : 8,
+ 2 * REG_SIZE / (inst->dst.stride * type_sz(inst->dst.type)),
+ inst->exec_size);
+
+ case SHADER_OPCODE_LOAD_PAYLOAD: {
+ const unsigned reg_count =
+ DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
+
+ if (reg_count > 2) {
+ /* Only LOAD_PAYLOAD instructions with per-channel destination region
+ * can be easily lowered (which excludes headers and heterogeneous
+ * types).
+ *
+ * The execution size is divided by the factor by which a single
+ * destination component exceeds two registers.
+ */
+ assert(!inst->header_size);
+ for (unsigned i = 0; i < inst->sources; i++)
+ assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
+ inst->src[i].file == BAD_FILE);
+
+ return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
+ } else {
+ return inst->exec_size;
+ }
+ }
+ default:
+ return inst->exec_size;
+ }
+}
+
+/**
+ * Tell whether the i-th source of instruction \p inst has to be copied into
+ * a freshly allocated temporary when the channel group selected by
+ * lbld.group() is split out of it.
+ *
+ * No temporary is needed when the source is periodic with respect to the
+ * lowered dispatch width, or when exactly one component is read and the
+ * lowered width is not larger than the original execution size.
+ */
+static inline bool
+needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
+{
+   /* A periodic source looks the same to every channel group. */
+   if (is_periodic(inst->src[i], lbld.dispatch_width()))
+      return false;
+
+   /* Otherwise a copy can only be avoided for single-component sources of
+    * instructions that aren't being widened.
+    */
+   return inst->components_read(i) != 1 ||
+          lbld.dispatch_width() > inst->exec_size;
+}
+
+/**
+ * Extract the data that would be consumed by the channel group given by
+ * lbld.group() from the i-th source region of instruction \p inst and return
+ * it as result in packed form. If any copy instructions are required they
+ * will be emitted before the given \p inst in \p block.
+ *
+ * \param lbld Builder of the lowered instruction; provides the channel group
+ * and the lowered dispatch width.
+ * \param block Basic block containing \p inst.
+ * \param inst Original instruction being lowered.
+ * \param i Index of the source of \p inst to extract.
+ * \return A temporary holding a copy of the data if needs_src_copy() says
+ * one is required, the unmodified source if it is periodic, or the source
+ * offset horizontally to the channel group otherwise.
+ */
+static fs_reg
+emit_unzip(const fs_builder &lbld, bblock_t *block, fs_inst *inst,
+ unsigned i)
+{
+ /* Specified channel group from the source region. */
+ const fs_reg src = horiz_offset(inst->src[i], lbld.group());
+
+ if (needs_src_copy(lbld, inst, i)) {
+ /* Builder of the right width to perform the copy avoiding uninitialized
+ * data if the lowered execution size is greater than the original
+ * execution size of the instruction.
+ *
+ * The temporary is allocated through lbld with one component per
+ * component read by the original instruction, and one MOV is emitted
+ * per component.
+ */
+ const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
+ inst->exec_size), 0);
+ const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
+
+ for (unsigned k = 0; k < inst->components_read(i); ++k)
+ cbld.at(block, inst)
+ .MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
+
+ return tmp;
+
+ } else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
+ /* The source is invariant for all dispatch_width-wide groups of the
+ * original region.
+ */
+ return inst->src[i];
+
+ } else {
+ /* We can just point the lowered instruction at the right channel group
+ * from the original region.
+ */
+ return src;
+ }
+}
+
+/**
+ * Tell whether the destination of the lowered instruction for the channel
+ * group selected by lbld.group() has to be a temporary whose contents are
+ * copied back into the original destination region of \p inst afterwards.
+ */
+static inline bool
+needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
+{
+   /* Two situations always call for a temporary:
+    *
+    *  - The instruction writes more than one component, in which case the
+    *    results of the lowered instructions have to be shuffled into their
+    *    final location in the original destination region.
+    *
+    *  - The lowered instruction is wider than the original one, so its
+    *    result doesn't fit in the original destination.
+    */
+   if (inst->size_written > inst->dst.component_size(inst->exec_size) ||
+       lbld.dispatch_width() > inst->exec_size)
+      return true;
+
+   for (unsigned src = 0; src < inst->sources; src++) {
+      /* Sources that get copied into a temporary cannot overlap with the
+       * destination, so only the remaining ones need to be checked.  To
+       * keep things simple any overlapping source whose region isn't
+       * exactly the destination region forces a copy: source and
+       * destination may not be aligned group by group, in which case one
+       * lowered instruction could overwrite data another lowered
+       * instruction still has to read from that source.
+       */
+      if (!needs_src_copy(lbld, inst, src) &&
+          regions_overlap(inst->dst, inst->size_written,
+                          inst->src[src], inst->size_read(src)) &&
+          !inst->dst.equals(inst->src[src]))
+         return true;
+   }
+
+   return false;
+}
+
+/**
+ * Insert data from a packed temporary into the channel group given by
+ * lbld.group() of the destination region of instruction \p inst and return
+ * the temporary as result. If any copy instructions are required they will
+ * be emitted around the given \p inst in \p block.
+ *
+ * When a temporary is used, the copies from the temporary back into the
+ * destination are inserted at inst->next, and for predicated instructions
+ * the previous destination contents are additionally copied into the
+ * temporary at \p inst.
+ *
+ * \param lbld Builder of the lowered instruction; provides the channel group
+ * and the lowered dispatch width.
+ * \param block Basic block containing \p inst.
+ * \param inst Original instruction being lowered.
+ * \return The temporary if needs_dst_copy() says one is required, otherwise
+ * the original destination offset horizontally to the channel group.
+ */
+static fs_reg
+emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst)
+{
+ /* Builder of the right width to perform the copy avoiding uninitialized
+ * data if the lowered execution size is greater than the original
+ * execution size of the instruction.
+ */
+ const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
+ inst->exec_size), 0);
+
+ /* Specified channel group from the destination region. dst_size is the
+ * number of components written by the original instruction.
+ */
+ const fs_reg dst = horiz_offset(inst->dst, lbld.group());
+ const unsigned dst_size = inst->size_written /
+ inst->dst.component_size(inst->exec_size);
+
+ if (needs_dst_copy(lbld, inst)) {
+ const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size);
+
+ if (inst->predicate) {
+ /* Handle predication by copying the original contents of
+ * the destination into the temporary before emitting the
+ * lowered instruction.
+ */
+ for (unsigned k = 0; k < dst_size; ++k)
+ cbld.at(block, inst)
+ .MOV(offset(tmp, lbld, k), offset(dst, inst->exec_size, k));
+ }
+
+ for (unsigned k = 0; k < dst_size; ++k)
+ cbld.at(block, inst->next)
+ .MOV(offset(dst, inst->exec_size, k), offset(tmp, lbld, k));
+
+ return tmp;
+
+ } else {
+ /* No need to allocate a temporary for the lowered instruction, just
+ * take the right group of channels from the original region.
+ */
+ return dst;
+ }
+}
+/**
+ * Replace every instruction whose execution size differs from the width
+ * reported by get_lowered_simd_width() with copies of that width, each one
+ * operating on a single channel group of the original instruction.
+ *
+ * The sources and destination of each copy are set up through emit_unzip()
+ * and emit_zip(), and the original instruction is removed afterwards. Live
+ * intervals are invalidated if anything was changed.
+ *
+ * \return true if at least one instruction was lowered.
+ */
+bool
+fs_visitor::lower_simd_width()
+{
+ bool progress = false;
+
+ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+ const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
+
+ if (lower_width != inst->exec_size) {
+ /* Builder matching the original instruction. We may also need to
+ * emit an instruction of width larger than the original, set the
+ * execution size of the builder to the highest of both for now so
+ * we're sure that both cases can be handled.
+ */
+ const unsigned max_width = MAX2(inst->exec_size, lower_width);
+ const fs_builder ibld = bld.at(block, inst)
+ .exec_all(inst->force_writemask_all)
+ .group(max_width, inst->group / max_width);
+
+ /* Split the copies in chunks of the execution width of either the
+ * original or the lowered instruction, whichever is lower.
+ *
+ * n is the number of lowered instructions to emit (one if the
+ * instruction is being widened) and dst_size the number of
+ * components written by the original instruction.
+ */
+ const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
+ const unsigned dst_size = inst->size_written /
+ inst->dst.component_size(inst->exec_size);
+
+ assert(!inst->writes_accumulator && !inst->mlen);
+
+ for (unsigned i = 0; i < n; i++) {
+ /* Emit a copy of the original instruction with the lowered width.
+ * If the EOT flag was set throw it away except for the last
+ * instruction to avoid killing the thread prematurely.
+ */
+ fs_inst split_inst = *inst;
+ split_inst.exec_size = lower_width;
+ split_inst.eot = inst->eot && i == n - 1;
+
+ /* Select the correct channel enables for the i-th group, then
+ * transform the sources and destination and emit the lowered
+ * instruction.
+ *
+ * size_written is recomputed from the (possibly replaced)
+ * destination at the lowered width.
+ */
+ const fs_builder lbld = ibld.group(lower_width, i);
+
+ for (unsigned j = 0; j < inst->sources; j++)
+ split_inst.src[j] = emit_unzip(lbld, block, inst, j);
+
+ split_inst.dst = emit_zip(lbld, block, inst);
+ split_inst.size_written =
+ split_inst.dst.component_size(lower_width) * dst_size;
+
+ lbld.emit(split_inst);
+ }
+
+ inst->remove(block);
+ progress = true;
+ }
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
+/**
+ * Dump all instructions without naming an output file, by forwarding a NULL
+ * file name to dump_instructions(const char *).
+ */
+void
+fs_visitor::dump_instructions()
+{
+   const char *const no_file_name = NULL;
+
+   dump_instructions(no_file_name);
+}
+
+/**
+ * Dump every instruction of the program, one per line.
+ *
+ * The output is written to the file called \p name when a name is given,
+ * the effective user id is not 0 and the file can be opened for writing.
+ * In every other case it is written to stderr.  When a CFG is available
+ * each line is prefixed with the number of registers live at that
+ * instruction and the maximum is reported at the end.
+ */
+void
+fs_visitor::dump_instructions(const char *name)
+{
+   FILE *out = NULL;
+   if (name && geteuid() != 0)
+      out = fopen(name, "w");
+
+   /* Only a stream we opened ourselves gets closed at the end. */
+   const bool close_when_done = (out != NULL);
+   if (!close_when_done)
+      out = stderr;
+
+   if (!cfg) {
+      int index = 0;
+      foreach_in_list(backend_instruction, instr, &instructions) {
+         fprintf(out, "%4d: ", index);
+         index++;
+         dump_instruction(instr, out);
+      }
+   } else {
+      calculate_register_pressure();
+
+      int index = 0;
+      int peak = 0;
+      foreach_block_and_inst(blk, backend_instruction, instr, cfg) {
+         if (regs_live_at_ip[index] > peak)
+            peak = regs_live_at_ip[index];
+
+         fprintf(out, "{%3d} %4d: ", regs_live_at_ip[index], index);
+         dump_instruction(instr, out);
+         index++;
+      }
+      fprintf(out, "Maximum %3d registers live at once.\n", peak);
+   }
+
+   if (close_when_done)
+      fclose(out);
+}
+
+/**
+ * Print \p be_inst to stderr using the two-argument overload.
+ */
+void
+fs_visitor::dump_instruction(backend_instruction *be_inst)
+{
+   FILE *const out = stderr;
+
+   dump_instruction(be_inst, out);
+}
+/**
+ * Print a one-line textual description of \p be_inst to \p file.
+ *
+ * The instruction is cast to fs_inst. The line contains, in order: the
+ * predicate (if any), the opcode name, ".sat" and the conditional modifier
+ * where set, the execution size, the message length and "(EOT)" where set,
+ * the destination register, every source register separated by commas, and
+ * finally the "NoMask" and "group" annotations where applicable.
+ *
+ * Registers are followed by a "+reg.byte" offset when they have a non-zero
+ * offset or when they are a VGRF whose allocated size differs from the size
+ * written/read, by a "<stride>" when the stride is not 1, and by the type
+ * letters. UNIFORM and ATTR destinations and MRF sources are printed
+ * wrapped in "***". An IMM destination hits unreachable().
+ */
+void
+fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
+{
+ fs_inst *inst = (fs_inst *)be_inst;
+
+ if (inst->predicate) {
+ fprintf(file, "(%cf0.%d) ",
+ inst->predicate_inverse ? '-' : '+',
+ inst->flag_subreg);
+ }
+
+ fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
+ if (inst->saturate)
+ fprintf(file, ".sat");
+ if (inst->conditional_mod) {
+ fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
+ if (!inst->predicate &&
+ (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
+ inst->opcode != BRW_OPCODE_IF &&
+ inst->opcode != BRW_OPCODE_WHILE))) {
+ fprintf(file, ".f0.%d", inst->flag_subreg);
+ }
+ }
+ fprintf(file, "(%d) ", inst->exec_size);
+
+ if (inst->mlen) {
+ fprintf(file, "(mlen: %d) ", inst->mlen);
+ }
+
+ if (inst->eot) {
+ fprintf(file, "(EOT) ");
+ }
+
+ switch (inst->dst.file) {
+ case VGRF:
+ fprintf(file, "vgrf%d", inst->dst.nr);
+ break;
+ case FIXED_GRF:
+ fprintf(file, "g%d", inst->dst.nr);
+ break;
+ case MRF:
+ fprintf(file, "m%d", inst->dst.nr);
+ break;
+ case BAD_FILE:
+ fprintf(file, "(null)");
+ break;
+ case UNIFORM:
+ fprintf(file, "***u%d***", inst->dst.nr);
+ break;
+ case ATTR:
+ fprintf(file, "***attr%d***", inst->dst.nr);
+ break;
+ case ARF:
+ switch (inst->dst.nr) {
+ case BRW_ARF_NULL:
+ fprintf(file, "null");
+ break;
+ case BRW_ARF_ADDRESS:
+ fprintf(file, "a0.%d", inst->dst.subnr);
+ break;
+ case BRW_ARF_ACCUMULATOR:
+ fprintf(file, "acc%d", inst->dst.subnr);
+ break;
+ case BRW_ARF_FLAG:
+ fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+ break;
+ default:
+ fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+ break;
+ }
+ break;
+ case IMM:
+ unreachable("not reached");
+ }
+
+ if (inst->dst.offset ||
+ (inst->dst.file == VGRF &&
+ alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
+ const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
+ fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
+ inst->dst.offset % reg_size);
+ }
+
+ if (inst->dst.stride != 1)
+ fprintf(file, "<%u>", inst->dst.stride);
+ fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
+
+ for (int i = 0; i < inst->sources; i++) {
+ if (inst->src[i].negate)
+ fprintf(file, "-");
+ if (inst->src[i].abs)
+ fprintf(file, "|");
+ switch (inst->src[i].file) {
+ case VGRF:
+ fprintf(file, "vgrf%d", inst->src[i].nr);
+ break;
+ case FIXED_GRF:
+ fprintf(file, "g%d", inst->src[i].nr);
+ break;
+ case MRF:
+ fprintf(file, "***m%d***", inst->src[i].nr);
+ break;
+ case ATTR:
+ fprintf(file, "attr%d", inst->src[i].nr);
+ break;
+ case UNIFORM:
+ fprintf(file, "u%d", inst->src[i].nr);
+ break;
+ case BAD_FILE:
+ fprintf(file, "(null)");
+ break;
+ case IMM:
+ switch (inst->src[i].type) {
+ case BRW_REGISTER_TYPE_F:
+ fprintf(file, "%-gf", inst->src[i].f);
+ break;
+ case BRW_REGISTER_TYPE_DF:
+ fprintf(file, "%fdf", inst->src[i].df);
+ break;
+ case BRW_REGISTER_TYPE_W:
+ case BRW_REGISTER_TYPE_D:
+ fprintf(file, "%dd", inst->src[i].d);
+ break;
+ case BRW_REGISTER_TYPE_UW:
+ case BRW_REGISTER_TYPE_UD:
+ fprintf(file, "%uu", inst->src[i].ud);
+ break;
+ case BRW_REGISTER_TYPE_VF:
+ fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
+ brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
+ brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
+ brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
+ brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
+ break;
+ default:
+ fprintf(file, "???");
+ break;
+ }
+ break;
+ case ARF:
+ switch (inst->src[i].nr) {
+ case BRW_ARF_NULL:
+ fprintf(file, "null");
+ break;
+ case BRW_ARF_ADDRESS:
+ fprintf(file, "a0.%d", inst->src[i].subnr);
+ break;
+ case BRW_ARF_ACCUMULATOR:
+ fprintf(file, "acc%d", inst->src[i].subnr);
+ break;
+ case BRW_ARF_FLAG:
+ fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+ break;
+ default:
+ fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+ break;
+ }
+ break;
+ }
+
+ if (inst->src[i].offset ||
+ (inst->src[i].file == VGRF &&
+ alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
+ const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
+ fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
+ inst->src[i].offset % reg_size);
+ }
+
+ if (inst->src[i].abs)
+ fprintf(file, "|");
+
+ if (inst->src[i].file != IMM) {
+ unsigned stride;
+ if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
+ unsigned hstride = inst->src[i].hstride;
+ stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
+ } else {
+ stride = inst->src[i].stride;
+ }
+ if (stride != 1)
+ fprintf(file, "<%u>", stride);
+
+ fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
+ }
+
+ if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
+ fprintf(file, ", ");
+ }
+
+ fprintf(file, " ");
+
+ if (inst->force_writemask_all)
+ fprintf(file, "NoMask ");
+
+ if (inst->exec_size != dispatch_width)
+ fprintf(file, "group%d ", inst->group);
+
+ fprintf(file, "\n");
+}
+
+/**
+ * Possibly return the instruction that set up \p reg.
+ *
+ * Sometimes we want to take the result of some expression/variable
+ * dereference tree and rewrite the instruction generating the result of the
+ * tree.  While processing the tree we know that the instructions generated
+ * all write temporaries that are dead outside of the tree, so if one of them
+ * writes a temporary we are free to point that write somewhere else.
+ *
+ * Note that this doesn't guarantee that the instruction writes nothing but
+ * \p reg -- it might be the size=4 destination of a texture instruction.
+ *
+ * \return \p end if it differs from \p start, is not a partial write and
+ *         its destination equals \p reg; NULL otherwise.
+ */
+fs_inst *
+fs_visitor::get_instruction_generating_reg(fs_inst *start,
+                                           fs_inst *end,
+                                           const fs_reg &reg)
+{
+   /* Nothing was emitted between both points. */
+   if (end == start)
+      return NULL;
+
+   /* A partial write doesn't fully define the register. */
+   if (end->is_partial_write())
+      return NULL;
+
+   return reg.equals(end->dst) ? end : NULL;
+}
+
+/**
+ * Lay out the fragment shader thread payload for Gen6+.
+ *
+ * Walks the optional payload sections in hardware order, recording the
+ * first register of each enabled section in \c payload and the matching
+ * "uses" flags in the WM program data, and leaves the total register count
+ * in payload.num_regs.
+ */
+void
+fs_visitor::setup_fs_payload_gen6()
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
+
+   assert(devinfo->gen >= 6);
+
+   /* Per-channel payload sections take twice as many registers when the
+    * dispatch width is 16.
+    */
+   const unsigned width_factor = (dispatch_width == 16 ? 2 : 1);
+
+   /* R0-1: masks, pixel X/Y coordinates.
+    * R2: only for 32-pixel dispatch.
+    */
+   payload.num_regs = 2;
+
+   /* R3-26: barycentric interpolation coordinates.  These appear in the
+    * same order that they appear in the brw_barycentric_mode enum.  Each
+    * set of coordinates occupies 2 registers if dispatch width == 8 and 4
+    * registers if dispatch width == 16.  Coordinates only appear if they
+    * were enabled using the "Barycentric Interpolation Mode" bits in
+    * WM_STATE.
+    */
+   for (int mode = 0; mode < BRW_BARYCENTRIC_MODE_COUNT; ++mode) {
+      if (wm_prog_data->barycentric_interp_modes & (1 << mode)) {
+         payload.barycentric_coord_reg[mode] = payload.num_regs;
+         payload.num_regs += 2 * width_factor;
+      }
+   }
+
+   /* R27: interpolated depth if uses source depth.
+    * R28: interpolated depth if not SIMD8.
+    */
+   wm_prog_data->uses_src_depth =
+      (nir->info->inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+   if (wm_prog_data->uses_src_depth) {
+      payload.source_depth_reg = payload.num_regs;
+      payload.num_regs += width_factor;
+   }
+
+   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W.
+    * R30: interpolated W if not SIMD8.
+    */
+   wm_prog_data->uses_src_w =
+      (nir->info->inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+   if (wm_prog_data->uses_src_w) {
+      payload.source_w_reg = payload.num_regs;
+      payload.num_regs += width_factor;
+   }
+
+   /* R31: MSAA position offsets.
+    *
+    * From the Ivy Bridge PRM documentation for 3DSTATE_PS:
+    *
+    *    "MSDISPMODE_PERSAMPLE is required in order to select
+    *    POSOFFSET_SAMPLE"
+    *
+    * So we can only really get sample positions if we are doing real
+    * per-sample dispatch.  If we need gl_SamplePosition and we don't have
+    * persample dispatch, we hard-code it to 0.5.
+    */
+   if (wm_prog_data->persample_dispatch &&
+       (nir->info->system_values_read & SYSTEM_BIT_SAMPLE_POS)) {
+      wm_prog_data->uses_pos_offset = true;
+      payload.sample_pos_reg = payload.num_regs;
+      payload.num_regs++;
+   }
+
+   /* R32: MSAA input coverage mask.
+    * R33: input coverage mask if not SIMD8.
+    */
+   wm_prog_data->uses_sample_mask =
+      (nir->info->system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
+   if (wm_prog_data->uses_sample_mask) {
+      assert(devinfo->gen >= 7);
+      payload.sample_mask_in_reg = payload.num_regs;
+      payload.num_regs += width_factor;
+   }
+
+   /* R34-: bary for 32-pixel. */
+   /* R58-59: interp W for 32-pixel. */
+
+   if (nir->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
+      source_depth_to_render_target = true;
+}
+
+/**
+ * Set the size of the vertex shader thread payload.
+ */
+void
+fs_visitor::setup_vs_payload()
+{
+   /* Two fixed registers -- R0: thread header, R1: urb handles. */
+   const unsigned fixed_payload_regs = 2;
+
+   payload.num_regs = fixed_payload_regs;
+}
+
+/**
+ * Set the size of the geometry shader thread payload and decide whether
+ * inputs are pushed or have to be pulled through ICP handles.
+ */
+void
+fs_visitor::setup_gs_payload()
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   struct brw_gs_prog_data *gs_data = brw_gs_prog_data(prog_data);
+   struct brw_vue_prog_data *vue_data = brw_vue_prog_data(prog_data);
+
+   /* R0: thread header, R1: output URB handles, optionally followed by
+    * R2: Primitive ID 0..7.
+    */
+   payload.num_regs = gs_data->include_primitive_id ? 3 : 2;
+
+   /* Use a maximum of 24 registers for push-model inputs. */
+   const unsigned max_push_components = 24;
+
+   /* If pushing our inputs would take too many registers, reduce the URB
+    * read length (which is in HWords, or 8 registers), and resort to
+    * pulling.
+    *
+    * Note that the GS reads <URB Read Length> HWords for every vertex - so
+    * we have to multiply by VerticesIn to obtain the total storage
+    * requirement.
+    */
+   const bool push_too_large =
+      8 * vue_data->urb_read_length * nir->info->gs.vertices_in >
+      max_push_components;
+   const bool instanced = gs_data->invocations > 1;
+
+   if (!push_too_large && !instanced)
+      return;
+
+   gs_data->base.include_vue_handles = true;
+
+   /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
+   payload.num_regs += nir->info->gs.vertices_in;
+
+   vue_data->urb_read_length =
+      ROUND_DOWN_TO(max_push_components / nir->info->gs.vertices_in, 8) / 8;
+}
+
+/**
+ * Set the size of the compute shader thread payload to a single register.
+ * Only valid on Gen7+.
+ */
+void
+fs_visitor::setup_cs_payload()
+{
+   assert(devinfo->gen >= 7);
+
+   const unsigned fixed_payload_regs = 1;
+   payload.num_regs = fixed_payload_regs;
+}
+
+/**
+ * Fill regs_live_at_ip so that, for every instruction index, it holds the
+ * summed size of all virtual GRFs whose [start, end] range covers that index.
+ */
+void
+fs_visitor::calculate_register_pressure()
+{
+   /* Throw away any cached liveness and recompute it before reading the
+    * virtual_grf_start/end ranges below.
+    */
+   invalidate_live_intervals();
+   calculate_live_intervals();
+
+   unsigned total_insts = 0;
+   foreach_block(block, cfg) {
+      total_insts += block->instructions.length();
+   }
+
+   /* One counter per instruction, allocated out of mem_ctx. */
+   regs_live_at_ip = rzalloc_array(mem_ctx, int, total_insts);
+
+   for (unsigned grf = 0; grf < alloc.count; grf++) {
+      int ip = virtual_grf_start[grf];
+
+      while (ip <= virtual_grf_end[grf]) {
+         regs_live_at_ip[ip] += alloc.sizes[grf];
+         ip++;
+      }
+   }
+}
+
+/**
+ * Drop every FS_OPCODE_MOV_DISPATCH_TO_FLAGS that repeats an earlier one for
+ * the same flag subregister.
+ *
+ * The needs_unlit_centroid_workaround produces one of these per channel of
+ * centroid input, so cleaning them up is worthwhile.
+ *
+ * This assumes that nothing ever modifies the dispatched pixels value that
+ * FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from; the hardware dictates that
+ * anyway.
+ *
+ * \return true if at least one instruction was removed.
+ */
+bool
+fs_visitor::opt_drop_redundant_mov_to_flags()
+{
+   /* Instructions removed by this pass can only be added if this were true */
+   if (!devinfo->needs_unlit_centroid_workaround)
+      return false;
+
+   /* seen[n] is set while a MOV_DISPATCH_TO_FLAGS into flag subregister n is
+    * still known to be valid.
+    */
+   bool seen[2] = { false, false };
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      if (inst->is_control_flow()) {
+         /* Forget everything we know across control flow. */
+         seen[0] = seen[1] = false;
+         continue;
+      }
+
+      if (inst->opcode != FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
+         /* Any other flag write invalidates the earlier copy. */
+         if (inst->flags_written())
+            seen[inst->flag_subreg] = false;
+         continue;
+      }
+
+      if (seen[inst->flag_subreg]) {
+         inst->remove(block);
+         progress = true;
+      } else {
+         seen[inst->flag_subreg] = true;
+      }
+   }
+
+   return progress;
+}
+
+void
+fs_visitor::optimize()
+{
+ /* Run the optimization and lowering passes over the program. After some
+ * initial set-up, a loop repeats the optimization passes until a whole
+ * iteration reports no progress; a series of lowering passes follows, each
+ * with targeted clean-up. Passes are run through the OPT() helper defined
+ * below: it bumps pass_num, runs the pass, dumps the instructions to a
+ * file when DEBUG_OPTIMIZER is set and the pass made progress, re-validates
+ * the program, accumulates the result into 'progress' and evaluates to the
+ * pass's own result.
+ *
+ * Start by validating the shader we currently have. */
+ validate();
+
+ /* bld is the common builder object pointing at the end of the program we
+ * used to translate it into i965 IR. For the optimization and lowering
+ * passes coming next, any code added after the end of the program without
+ * having explicitly called fs_builder::at() clearly points at a mistake.
+ * Ideally optimization passes wouldn't be part of the visitor so they
+ * wouldn't have access to bld at all, but they do, so just in case some
+ * pass forgets to ask for a location explicitly set it to NULL here to
+ * make it trip. The dispatch width is initialized to a bogus value to
+ * make sure that optimizations set the execution controls explicitly to
+ * match the code they are manipulating instead of relying on the defaults.
+ */
+ bld = fs_builder(this, 64);
+
+ assign_constant_locations();
+ lower_constant_loads();
+
+ validate();
+
+ split_virtual_grfs();
+ validate();
+
+#define OPT(pass, args...) ({ \
+ pass_num++; \
+ bool this_progress = pass(args); \
+ \
+ if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
+ char filename[64]; \
+ snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass, \
+ stage_abbrev, dispatch_width, nir->info->name, iteration, pass_num); \
+ \
+ backend_shader::dump_instructions(filename); \
+ } \
+ \
+ validate(); \
+ \
+ progress = progress || this_progress; \
+ this_progress; \
+ })
+
+ if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
+ char filename[64];
+ snprintf(filename, 64, "%s%d-%s-00-00-start",
+ stage_abbrev, dispatch_width, nir->info->name);
+
+ backend_shader::dump_instructions(filename);
+ }
+
+ bool progress = false;
+ int iteration = 0;
+ int pass_num = 0;
+
+ OPT(opt_drop_redundant_mov_to_flags);
+
+ do {
+ progress = false;
+ pass_num = 0;
+ iteration++;
+
+ OPT(remove_duplicate_mrf_writes);
+
+ OPT(opt_algebraic);
+ OPT(opt_cse);
+ OPT(opt_copy_propagation);
+ OPT(opt_predicated_break, this);
+ OPT(opt_cmod_propagation);
+ OPT(dead_code_eliminate);
+ OPT(opt_peephole_sel);
+ OPT(dead_control_flow_eliminate, this);
+ OPT(opt_register_renaming);
+ OPT(opt_saturate_propagation);
+ OPT(register_coalesce);
+ OPT(compute_to_mrf);
+ OPT(eliminate_find_live_channel);
+
+ OPT(compact_virtual_grfs);
+ } while (progress); /* stop once a whole iteration made no progress */
+
+ progress = false; /* from here on 'progress' only reflects the passes below */
+ pass_num = 0;
+
+ if (OPT(lower_pack)) {
+ OPT(register_coalesce);
+ OPT(dead_code_eliminate);
+ }
+
+ if (OPT(lower_d2x)) {
+ OPT(opt_copy_propagation);
+ OPT(dead_code_eliminate);
+ }
+
+ OPT(lower_simd_width);
+
+ /* After SIMD lowering just in case we had to unroll the EOT send. */
+ OPT(opt_sampler_eot);
+
+ OPT(lower_logical_sends);
+
+ if (progress) {
+ OPT(opt_copy_propagation);
+ /* Only run after logical send lowering because it's easier to implement
+ * in terms of physical sends.
+ */
+ if (OPT(opt_zero_samples))
+ OPT(opt_copy_propagation);
+ /* Run after logical send lowering to give it a chance to CSE the
+ * LOAD_PAYLOAD instructions created to construct the payloads of
+ * e.g. texturing messages in cases where it wasn't possible to CSE the
+ * whole logical instruction.
+ */
+ OPT(opt_cse);
+ OPT(register_coalesce);
+ OPT(compute_to_mrf);
+ OPT(dead_code_eliminate);
+ OPT(remove_duplicate_mrf_writes);
+ OPT(opt_peephole_sel);
+ }
+
+ OPT(opt_redundant_discard_jumps);
+
+ if (OPT(lower_load_payload)) {
+ split_virtual_grfs();
+ OPT(register_coalesce);
+ OPT(compute_to_mrf);
+ OPT(dead_code_eliminate);
+ }
+
+ OPT(opt_combine_constants);
+ OPT(lower_integer_multiplication);
+
+ if (devinfo->gen <= 5 && OPT(lower_minmax)) {
+ OPT(opt_cmod_propagation);
+ OPT(opt_cse);
+ OPT(opt_copy_propagation);
+ OPT(dead_code_eliminate);
+ }
+
+ lower_uniform_pull_constant_loads();
+
+ validate();
+}
+
+/**
+ * Give every three-source instruction with a null destination a real one.
+ *
+ * Three source instructions must have a GRF/MRF destination register; ARF
+ * NULL is not allowed, so a temporary GRF is allocated in its place.
+ */
+void
+fs_visitor::fixup_3src_null_dest()
+{
+   bool changed = false;
+
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      if (!inst->is_3src(devinfo) || !inst->dst.is_null())
+         continue;
+
+      inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
+                         inst->dst.type);
+      changed = true;
+   }
+
+   /* Only invalidate the live intervals if a register was actually added. */
+   if (changed)
+      invalidate_live_intervals();
+}
+
+/**
+ * Schedule the program and assign registers to it.
+ *
+ * Each pre-allocation scheduling mode is tried in turn until one of them
+ * allocates without spilling.  If none does, the compile is failed when
+ * spilling is not allowed or when a narrower dispatch width is still
+ * available; otherwise registers are spilled until an allocation is found.
+ * On success the post-allocation scheduler runs and the scratch space
+ * requirement is stored in prog_data->total_scratch.
+ *
+ * \param allow_spilling  whether falling back to spilling is acceptable
+ */
+void
+fs_visitor::allocate_registers(bool allow_spilling)
+{
+   static const enum instruction_scheduler_mode pre_modes[] = {
+      SCHEDULE_PRE,
+      SCHEDULE_PRE_NON_LIFO,
+      SCHEDULE_PRE_LIFO,
+   };
+
+   const bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS);
+
+   /* Always overwritten by the first iteration of the loop below. */
+   bool allocated_without_spills = false;
+
+   /* Try each scheduling heuristic to see if it can successfully register
+    * allocate without spilling.  They should be ordered by decreasing
+    * performance but increasing likelihood of allocating.
+    */
+   for (unsigned mode = 0; mode < ARRAY_SIZE(pre_modes); mode++) {
+      schedule_instructions(pre_modes[mode]);
+
+      if (0) {
+         /* Debugging aid: flip the condition to use the trivial allocator. */
+         assign_regs_trivial();
+         allocated_without_spills = true;
+      } else {
+         allocated_without_spills = assign_regs(false, spill_all);
+      }
+
+      if (allocated_without_spills)
+         break;
+   }
+
+   if (!allocated_without_spills) {
+      if (!allow_spilling)
+         fail("Failure to register allocate and spilling is not allowed.");
+
+      /* We assume that any spilling is worse than just dropping back to
+       * SIMD8.  There's probably actually some intermediate point where
+       * SIMD16 with a couple of spills is still better.
+       */
+      if (dispatch_width > min_dispatch_width) {
+         fail("Failure to register allocate.  Reduce number of "
+              "live scalar values to avoid this.");
+      } else {
+         compiler->shader_perf_log(log_data,
+                                   "%s shader triggered register spilling.  "
+                                   "Try reducing the number of live scalar "
+                                   "values to improve performance.\n",
+                                   stage_name);
+      }
+
+      /* Since we're out of heuristics, just go spill registers until we
+       * get an allocation (or until the compile has failed).
+       */
+      bool allocated;
+      do {
+         allocated = assign_regs(true, spill_all);
+      } while (!allocated && !failed);
+   }
+
+   /* This must come after all optimization and register allocation, since
+    * it inserts dead code that happens to have side effects, and it does
+    * so based on the actual physical registers in use.
+    */
+   insert_gen4_send_dependency_workarounds();
+
+   if (failed)
+      return;
+
+   schedule_instructions(SCHEDULE_POST);
+
+   if (last_scratch > 0) {
+      MAYBE_UNUSED unsigned max_scratch_size = 2 * 1024 * 1024;
+
+      prog_data->total_scratch = brw_get_scratch_size(last_scratch);
+
+      if (stage == MESA_SHADER_COMPUTE && devinfo->is_haswell) {
+         /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
+          * field documentation, Haswell supports a minimum of 2kB of
+          * scratch space for compute shaders, unlike every other stage
+          * and platform.
+          */
+         prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
+      } else if (stage == MESA_SHADER_COMPUTE && devinfo->gen <= 7) {
+         /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
+          * field documentation, platforms prior to Haswell measure scratch
+          * size linearly with a range of [1kB, 12kB] and 1kB granularity.
+          */
+         prog_data->total_scratch = ALIGN(last_scratch, 1024);
+         max_scratch_size = 12 * 1024;
+      }
+
+      /* We currently only support up to 2MB of scratch space.  If we
+       * need to support more eventually, the documentation suggests
+       * that we could allocate a larger buffer, and partition it out
+       * ourselves.  We'd just have to undo the hardware's address
+       * calculation by subtracting (FFTID * Per Thread Scratch Space)
+       * and then add FFTID * (Larger Per Thread Scratch Space).
+       *
+       * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
+       * Thread Group Tracking > Local Memory/Scratch Space.
+       */
+      assert(prog_data->total_scratch < max_scratch_size);
+   }
+}
+
+/**
+ * Translate, optimize and register-allocate a vertex shader.
+ *
+ * \param clip_planes  clip planes forwarded to compute_clip_distance()
+ * \return true when the compile succeeded, false if the visitor failed
+ */
+bool
+fs_visitor::run_vs(gl_clip_plane *clip_planes)
+{
+   assert(stage == MESA_SHADER_VERTEX);
+
+   /* Payload layout first, then the optional shader-time prologue. */
+   setup_vs_payload();
+   if (shader_time_index >= 0) {
+      emit_shader_time_begin();
+   }
+
+   /* Translate the NIR; give up before emitting anything else on failure. */
+   emit_nir_code();
+   if (failed) {
+      return false;
+   }
+
+   compute_clip_distance(clip_planes);
+   emit_urb_writes();
+
+   if (shader_time_index >= 0) {
+      emit_shader_time_end();
+   }
+
+   /* Build the CFG, optimize, assign the CURB and URB setup and finally
+    * allocate registers with spilling allowed.
+    */
+   calculate_cfg();
+   optimize();
+
+   assign_curb_setup();
+   assign_vs_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers(true);
+
+   return !failed;
+}
+
+/**
+ * Translate, optimize and register-allocate a tessellation control shader
+ * in single-patch mode.
+ *
+ * gl_InvocationID is initialized up front, the shader body is wrapped in a
+ * conditional when the output vertex count is not a multiple of 8, and an
+ * EOT URB write terminates the thread.
+ *
+ * \return true when the compile succeeded, false if the visitor failed
+ */
+bool
+fs_visitor::run_tcs_single_patch()
+{
+   assert(stage == MESA_SHADER_TESS_CTRL);
+
+   struct brw_tcs_prog_data *tcs_data = brw_tcs_prog_data(prog_data);
+
+   /* r1-r4 contain the ICP handles. */
+   payload.num_regs = 5;
+
+   if (shader_time_index >= 0) {
+      emit_shader_time_begin();
+   }
+
+   /* Initialize gl_InvocationID: start from the per-channel index 0..7. */
+   fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
+   fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
+   bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
+   bld.MOV(channels_ud, channels_uw);
+
+   if (tcs_data->instances != 1) {
+      invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+      /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */
+      fs_reg instance_bits = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      bld.AND(instance_bits,
+              fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
+              brw_imm_ud(INTEL_MASK(23, 17)));
+      bld.SHR(instance_times_8, instance_bits, brw_imm_ud(17 - 3));
+
+      bld.ADD(invocation_id, instance_times_8, channels_ud);
+   } else {
+      /* A single instance: the channel index is the invocation ID. */
+      invocation_id = channels_ud;
+   }
+
+   /* Fix the dispatch mask: only invocations below the output vertex count
+    * run the shader body.
+    */
+   if (nir->info->tess.tcs_vertices_out % 8) {
+      bld.CMP(bld.null_reg_ud(), invocation_id,
+              brw_imm_ud(nir->info->tess.tcs_vertices_out),
+              BRW_CONDITIONAL_L);
+      bld.IF(BRW_PREDICATE_NORMAL);
+   }
+
+   emit_nir_code();
+
+   if (nir->info->tess.tcs_vertices_out % 8) {
+      bld.emit(BRW_OPCODE_ENDIF);
+   }
+
+   /* Emit EOT write; set TR DS Cache bit */
+   fs_reg eot_srcs[3] = {
+      fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+      fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
+      fs_reg(brw_imm_ud(0)),
+   };
+   fs_reg eot_payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
+   bld.LOAD_PAYLOAD(eot_payload, eot_srcs, 3, 2);
+
+   fs_inst *eot_write = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
+                                 bld.null_reg_ud(), eot_payload);
+   eot_write->mlen = 3;
+   eot_write->eot = true;
+
+   if (shader_time_index >= 0) {
+      emit_shader_time_end();
+   }
+
+   if (failed) {
+      return false;
+   }
+
+   calculate_cfg();
+   optimize();
+
+   assign_curb_setup();
+   assign_tcs_single_patch_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers(true);
+
+   return !failed;
+}
+
+/**
+ * Translate, optimize and register-allocate a tessellation evaluation
+ * shader.
+ *
+ * \return true when the compile succeeded, false if the visitor failed
+ */
+bool
+fs_visitor::run_tes()
+{
+   assert(stage == MESA_SHADER_TESS_EVAL);
+
+   /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */
+   this->payload.num_regs = 5;
+
+   if (shader_time_index >= 0) {
+      emit_shader_time_begin();
+   }
+
+   /* Translate the NIR; give up before emitting anything else on failure. */
+   emit_nir_code();
+   if (failed) {
+      return false;
+   }
+
+   emit_urb_writes();
+
+   if (shader_time_index >= 0) {
+      emit_shader_time_end();
+   }
+
+   calculate_cfg();
+   optimize();
+
+   assign_curb_setup();
+   assign_tes_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers(true);
+
+   return !failed;
+}
+
+/**
+ * Translate, optimize and register-allocate a geometry shader.
+ *
+ * \return true when the compile succeeded, false if the visitor failed
+ */
+bool
+fs_visitor::run_gs()
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   setup_gs_payload();
+
+   this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
+
+   const bool has_control_data =
+      gs_compile->control_data_header_size_bits > 0;
+   if (has_control_data) {
+      /* Create a VGRF to store accumulated control data bits. */
+      this->control_data_bits = vgrf(glsl_type::uint_type);
+
+      /* If we're outputting more than 32 control data bits, then EmitVertex()
+       * will set control_data_bits to 0 after emitting the first vertex.
+       * Otherwise, we need to initialize it to 0 here.
+       */
+      if (gs_compile->control_data_header_size_bits <= 32) {
+         bld.annotate("initialize control data bits")
+            .MOV(this->control_data_bits, brw_imm_ud(0u));
+      }
+   }
+
+   if (shader_time_index >= 0) {
+      emit_shader_time_begin();
+   }
+
+   emit_nir_code();
+   emit_gs_thread_end();
+
+   if (shader_time_index >= 0) {
+      emit_shader_time_end();
+   }
+
+   /* The failure check only happens after the thread end was emitted. */
+   if (failed) {
+      return false;
+   }
+
+   calculate_cfg();
+   optimize();
+
+   assign_curb_setup();
+   assign_gs_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers(true);
+
+   return !failed;
+}
+
+/**
+ * Translate, optimize and register-allocate a fragment shader.
+ *
+ * \param allow_spilling  forwarded to allocate_registers()
+ * \param do_rep_send     emit the repclear shader instead of translating the
+ *                        NIR; only valid at a dispatch width of 16
+ * \return true when the compile succeeded, false if the visitor failed
+ */
+bool
+fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
+{
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
+   brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
+
+   assert(stage == MESA_SHADER_FRAGMENT);
+
+   if (devinfo->gen < 6)
+      setup_fs_payload_gen4();
+   else
+      setup_fs_payload_gen6();
+
+   if (0) {
+      /* Debugging aid: flip the condition to emit a dummy shader. */
+      emit_dummy_fs();
+   } else if (do_rep_send) {
+      assert(dispatch_width == 16);
+      emit_repclear_shader();
+   } else {
+      if (shader_time_index >= 0) {
+         emit_shader_time_begin();
+      }
+
+      calculate_urb_setup();
+
+      /* Interpolation setup is needed when inputs are read, or when outputs
+       * are read back without coherent framebuffer fetch.
+       */
+      const bool needs_interp_setup =
+         nir->info->inputs_read > 0 ||
+         (nir->info->outputs_read > 0 && !wm_key->coherent_fb_fetch);
+      if (needs_interp_setup) {
+         if (devinfo->gen < 6)
+            emit_interpolation_setup_gen4();
+         else
+            emit_interpolation_setup_gen6();
+      }
+
+      /* We handle discards by keeping track of the still-live pixels in f0.1.
+       * Initialize it with the dispatched pixels.
+       */
+      if (wm_prog_data->uses_kill) {
+         bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS)->flag_subreg = 1;
+      }
+
+      /* Generate FS IR for main().  (the visitor only descends into
+       * functions called "main").
+       */
+      emit_nir_code();
+      if (failed) {
+         return false;
+      }
+
+      if (wm_prog_data->uses_kill) {
+         bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
+      }
+
+      if (wm_key->alpha_test_func) {
+         emit_alpha_test();
+      }
+
+      emit_fb_writes();
+
+      if (shader_time_index >= 0) {
+         emit_shader_time_end();
+      }
+
+      calculate_cfg();
+      optimize();
+
+      assign_curb_setup();
+      assign_urb_setup();
+
+      fixup_3src_null_dest();
+      allocate_registers(allow_spilling);
+   }
+
+   /* Covers every path above, including a failed register allocation. */
+   return !failed;
+}
+
+/**
+ * Translate, optimize and register-allocate a compute shader.
+ *
+ * \return true when the compile succeeded, false if the visitor failed
+ */
+bool
+fs_visitor::run_cs()
+{
+   assert(stage == MESA_SHADER_COMPUTE);
+
+   setup_cs_payload();
+
+   if (shader_time_index >= 0) {
+      emit_shader_time_begin();
+   }
+
+   const bool hsw_uses_slm =
+      devinfo->is_haswell && prog_data->total_shared > 0;
+   if (hsw_uses_slm) {
+      /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
+      const fs_builder abld = bld.exec_all().group(1, 0);
+      abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),
+               suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
+   }
+
+   /* Translate the NIR; give up before emitting anything else on failure. */
+   emit_nir_code();
+   if (failed) {
+      return false;
+   }
+
+   emit_cs_terminate();
+
+   if (shader_time_index >= 0) {
+      emit_shader_time_end();
+   }
+
+   calculate_cfg();
+   optimize();
+
+   assign_curb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers(true);
+
+   /* Also reports a failure raised by register allocation. */
+   return !failed;
+}
+
+/**
+ * Compute the set of barycentric interpolation modes a fragment shader needs.
+ *
+ * The result is a bitfield in which bit n is set when barycentric mode n
+ * (see enum brw_barycentric_mode) is required.
+ *
+ * The load_barycentric intrinsics feeding load_interpolated_input are
+ * examined rather than the input variables, so that interpolateAtCentroid()
+ * messages are caught too; those also need the
+ * BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
+ */
+static unsigned
+brw_compute_barycentric_interp_modes(const struct gen_device_info *devinfo,
+                                     const nir_shader *shader)
+{
+   unsigned modes = 0;
+
+   nir_foreach_function(f, shader) {
+      if (!f->impl)
+         continue;
+
+      nir_foreach_block(block, f->impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
+            if (load->intrinsic != nir_intrinsic_load_interpolated_input)
+               continue;
+
+            /* Ignore WPOS; it doesn't require interpolation. */
+            if (nir_intrinsic_base(load) == VARYING_SLOT_POS)
+               continue;
+
+            /* The first source is the barycentric intrinsic that says how
+             * this input is interpolated.
+             */
+            nir_intrinsic_instr *bary_intrin =
+               nir_instr_as_intrinsic(load->src[0].ssa->parent_instr);
+            enum glsl_interp_mode interp = (enum glsl_interp_mode)
+               nir_intrinsic_interp_mode(bary_intrin);
+            nir_intrinsic_op bary_op = bary_intrin->intrinsic;
+            enum brw_barycentric_mode bary =
+               brw_barycentric_mode(interp, bary_op);
+
+            modes |= 1 << bary;
+
+            /* With the unlit centroid workaround, centroid loads also need
+             * the matching pixel mode.
+             */
+            if (devinfo->needs_unlit_centroid_workaround &&
+                bary_op == nir_intrinsic_load_barycentric_centroid) {
+               modes |= 1 << centroid_to_pixel(bary);
+            }
+         }
+      }
+   }
+
+   return modes;
+}
+
+/**
+ * Rebuild prog_data->flat_inputs: one bit per URB setup slot whose input
+ * variable uses flat interpolation.
+ *
+ * Reads prog_data->urb_setup, so that table must be filled in beforehand.
+ */
+static void
+brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
+                        const nir_shader *shader)
+{
+   prog_data->flat_inputs = 0;
+
+   nir_foreach_variable(var, &shader->inputs) {
+      const int slot = prog_data->urb_setup[var->data.location];
+
+      /* A negative slot means the input has no URB setup entry. */
+      if (slot >= 0 && var->data.interpolation == INTERP_MODE_FLAT)
+         prog_data->flat_inputs |= (1 << slot);
+   }
+}
+
+/**
+ * Map the shader's depth output and depth layout qualifier to the
+ * BRW_PSCDEPTH_* value describing how depth is computed.
+ *
+ * Returns BRW_PSCDEPTH_OFF when the shader does not write FRAG_RESULT_DEPTH
+ * or declares its depth as unchanged.
+ */
+static uint8_t
+computed_depth_mode(const nir_shader *shader)
+{
+   const bool writes_depth =
+      shader->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH);
+
+   if (!writes_depth)
+      return BRW_PSCDEPTH_OFF;
+
+   switch (shader->info->fs.depth_layout) {
+   case FRAG_DEPTH_LAYOUT_NONE:
+   case FRAG_DEPTH_LAYOUT_ANY:
+      return BRW_PSCDEPTH_ON;
+   case FRAG_DEPTH_LAYOUT_GREATER:
+      return BRW_PSCDEPTH_ON_GE;
+   case FRAG_DEPTH_LAYOUT_LESS:
+      return BRW_PSCDEPTH_ON_LE;
+   case FRAG_DEPTH_LAYOUT_UNCHANGED:
+      return BRW_PSCDEPTH_OFF;
+   }
+
+   /* Any layout value not listed above is treated as "off". */
+   return BRW_PSCDEPTH_OFF;
+}
+
+/**
+ * Hoist load_interpolated_input instructions that use simple (payload-based)
+ * barycentric modes into the first block of the program, so we don't emit
+ * multiple PLNs for the same input.
+ *
+ * This works around CSE not being able to handle non-dominating cases
+ * such as:
+ *
+ *    if (...) {
+ *       interpolate input
+ *    } else {
+ *       interpolate the same exact input
+ *    }
+ *
+ * This should be replaced by global value numbering someday.
+ */
+void
+move_interpolation_to_top(nir_shader *nir)
+{
+   nir_foreach_function(f, nir) {
+      if (!f->impl)
+         continue;
+
+      nir_block *top = nir_start_block(f->impl);
+
+      /* Last node hoisted so far; NULL until the first one is moved. */
+      exec_node *insert_after = NULL;
+
+      nir_foreach_block(block, f->impl) {
+         if (block == top)
+            continue;
+
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
+            if (load->intrinsic != nir_intrinsic_load_interpolated_input)
+               continue;
+
+            nir_intrinsic_instr *bary =
+               nir_instr_as_intrinsic(load->src[0].ssa->parent_instr);
+            const nir_intrinsic_op op = bary->intrinsic;
+
+            /* Leave interpolateAtSample/Offset() where they are. */
+            const bool explicit_location =
+               op == nir_intrinsic_load_barycentric_at_sample ||
+               op == nir_intrinsic_load_barycentric_at_offset;
+            if (explicit_location)
+               continue;
+
+            /* Both sources are hoisted ahead of the load itself. */
+            nir_instr *to_move[3] = {
+               &bary->instr,
+               load->src[1].ssa->parent_instr,
+               instr
+            };
+
+            for (unsigned i = 0; i < ARRAY_SIZE(to_move); i++) {
+               nir_instr *moved = to_move[i];
+
+               if (moved->block == top)
+                  continue;
+
+               moved->block = top;
+               exec_node_remove(&moved->node);
+               if (insert_after) {
+                  exec_node_insert_after(insert_after, &moved->node);
+               } else {
+                  exec_list_push_head(&top->instr_list, &moved->node);
+               }
+               insert_after = &moved->node;
+            }
+         }
+      }
+
+      nir_metadata_preserve(f->impl, (nir_metadata)
+                            ((unsigned) nir_metadata_block_index |
+                             (unsigned) nir_metadata_dominance));
+   }
+}
+
+/**
+ * Replace per-sample barycentric intrinsics with centroid ones.
+ *
+ * Useful when rendering to a non-multisampled buffer.
+ */
+static void
+demote_sample_qualifiers(nir_shader *nir)
+{
+   nir_foreach_function(f, nir) {
+      if (!f->impl)
+         continue;
+
+      nir_builder b;
+      nir_builder_init(&b, f->impl);
+
+      nir_foreach_block(block, f->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            const bool per_sample =
+               intrin->intrinsic == nir_intrinsic_load_barycentric_sample ||
+               intrin->intrinsic == nir_intrinsic_load_barycentric_at_sample;
+            if (!per_sample)
+               continue;
+
+            /* Emit a centroid load with the same interpolation mode right
+             * before the old intrinsic, redirect every use to it and drop
+             * the original.
+             */
+            b.cursor = nir_before_instr(instr);
+            nir_ssa_def *centroid =
+               nir_load_barycentric(&b,
+                                    nir_intrinsic_load_barycentric_centroid,
+                                    nir_intrinsic_interp_mode(intrin));
+            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                                     nir_src_for_ssa(centroid));
+            nir_instr_remove(instr);
+         }
+      }
+
+      nir_metadata_preserve(f->impl, (nir_metadata)
+                            ((unsigned) nir_metadata_block_index |
+                             (unsigned) nir_metadata_dominance));
+   }
+}
+
+/**
+ * Convert a register count into the block count the unit states expect.
+ *
+ * Pre-gen6, the register file of the EUs was shared between threads, and
+ * each thread used some subset allocated on a 16-register block granularity.
+ * The value returned is the number of 16-register blocks needed to hold
+ * \p reg_count registers, minus one.
+ */
+static inline int
+brw_register_blocks(int reg_count)
+{
+   return (ALIGN(reg_count, 16) / 16) - 1;
+}
+
+/**
+ * Compile a fragment shader to native code.
+ *
+ * A private clone of \p src_shader is lowered and post-processed, the
+ * prog_data fields derived from the shader info are filled in, and the
+ * program is compiled at SIMD8 and, when possible, at SIMD16.  A SIMD8
+ * failure fails the whole compile; a SIMD16 failure is only reported through
+ * the performance log.
+ *
+ * \param error_str  if non-NULL, receives a copy of the failure message when
+ *                   the SIMD8 compile fails
+ * \return whatever fs_generator::get_assembly(final_assembly_size) returns,
+ *         or NULL when the SIMD8 compile failed
+ */
+const unsigned *
+brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_wm_prog_key *key,
+               struct brw_wm_prog_data *prog_data,
+               const nir_shader *src_shader,
+               struct gl_program *prog,
+               int shader_time_index8, int shader_time_index16,
+               bool allow_spilling,
+               bool use_rep_send, struct brw_vue_map *vue_map,
+               unsigned *final_assembly_size,
+               char **error_str)
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+
+   /* Work on a private copy so the caller's shader stays untouched. */
+   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
+   brw_nir_lower_fs_inputs(shader, devinfo, key);
+   brw_nir_lower_fs_outputs(shader);
+
+   if (devinfo->gen < 6) {
+      brw_setup_vue_interpolation(vue_map, shader, prog_data, devinfo);
+   }
+
+   if (!key->multisample_fbo) {
+      NIR_PASS_V(shader, demote_sample_qualifiers);
+   }
+   NIR_PASS_V(shader, move_interpolation_to_top);
+   shader = brw_postprocess_nir(shader, compiler, true);
+
+   /* key->alpha_test_func means simulating alpha testing via discards,
+    * so the shader definitely kills pixels.
+    */
+   prog_data->uses_kill = shader->info->fs.uses_discard ||
+                          key->alpha_test_func;
+   prog_data->uses_omask = key->multisample_fbo &&
+      (shader->info->outputs_written &
+       BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
+   prog_data->computed_depth_mode = computed_depth_mode(shader);
+   prog_data->computed_stencil =
+      shader->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
+
+   prog_data->persample_dispatch =
+      key->multisample_fbo &&
+      (key->persample_interp ||
+       (shader->info->system_values_read & (SYSTEM_BIT_SAMPLE_ID |
+                                            SYSTEM_BIT_SAMPLE_POS)) ||
+       shader->info->fs.uses_sample_qualifier ||
+       shader->info->outputs_read);
+
+   prog_data->early_fragment_tests = shader->info->fs.early_fragment_tests;
+   prog_data->post_depth_coverage = shader->info->fs.post_depth_coverage;
+   prog_data->inner_coverage = shader->info->fs.inner_coverage;
+
+   prog_data->barycentric_interp_modes =
+      brw_compute_barycentric_interp_modes(compiler->devinfo, shader);
+
+   cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL;
+   uint8_t simd8_grf_start = 0, simd16_grf_start = 0;
+   unsigned simd8_grf_used = 0, simd16_grf_used = 0;
+
+   /* The SIMD8 compile always runs, and its failure is fatal. */
+   fs_visitor v8(compiler, log_data, mem_ctx, key,
+                 &prog_data->base, prog, shader, 8,
+                 shader_time_index8);
+   if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) {
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, v8.fail_msg);
+
+      return NULL;
+   }
+
+   /* With DEBUG_NO8 the SIMD8 result is compiled but not used. */
+   if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
+      simd8_cfg = v8.cfg;
+      simd8_grf_start = v8.payload.num_regs;
+      simd8_grf_used = v8.grf_used;
+   }
+
+   if (v8.max_dispatch_width >= 16 &&
+       likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
+      /* Try a SIMD16 compile */
+      fs_visitor v16(compiler, log_data, mem_ctx, key,
+                     &prog_data->base, prog, shader, 16,
+                     shader_time_index16);
+      v16.import_uniforms(&v8);
+      if (v16.run_fs(allow_spilling, use_rep_send)) {
+         simd16_cfg = v16.cfg;
+         simd16_grf_start = v16.payload.num_regs;
+         simd16_grf_used = v16.grf_used;
+      } else {
+         compiler->shader_perf_log(log_data,
+                                   "SIMD16 shader failed to compile: %s",
+                                   v16.fail_msg);
+      }
+   }
+
+   /* When the caller requests a repclear shader, they want SIMD16-only */
+   if (use_rep_send)
+      simd8_cfg = NULL;
+
+   /* Prior to Iron Lake, the PS had a single shader offset with a jump table
+    * at the top to select the shader.  We've never implemented that.
+    * Instead, we just give them exactly one shader and we pick the widest one
+    * available.
+    */
+   if (compiler->devinfo->gen < 5 && simd16_cfg)
+      simd8_cfg = NULL;
+
+   if (prog_data->persample_dispatch) {
+      /* Starting with SandyBridge (where we first get MSAA), the different
+       * pixel dispatch combinations are grouped into classifications A
+       * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1).  On all hardware
+       * generations, the only configurations supporting persample dispatch
+       * are those in which only one dispatch width is enabled.
+       *
+       * If computed depth is enabled, SNB only allows SIMD8 while IVB+
+       * allow SIMD8 or SIMD16 so we choose SIMD16 if available.
+       */
+      if (compiler->devinfo->gen == 6 &&
+          prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) {
+         simd16_cfg = NULL;
+      } else if (simd16_cfg) {
+         simd8_cfg = NULL;
+      }
+   }
+
+   /* We have to compute the flat inputs after the visitor is finished running
+    * because it relies on prog_data->urb_setup which is computed in
+    * fs_visitor::calculate_urb_setup().
+    */
+   brw_compute_flat_inputs(prog_data, shader);
+
+   fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
+                  v8.promoted_constants, v8.runtime_check_aads_emit,
+                  MESA_SHADER_FRAGMENT);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+      g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
+                                     shader->info->label ?
+                                        shader->info->label : "unnamed",
+                                     shader->info->name));
+   }
+
+   /* Whichever program is generated first fills the primary dispatch
+    * fields; a SIMD16 program generated after a SIMD8 one fills the "_2"
+    * fields instead.
+    */
+   if (simd8_cfg) {
+      prog_data->dispatch_8 = true;
+      g.generate_code(simd8_cfg, 8);
+      prog_data->base.dispatch_grf_start_reg = simd8_grf_start;
+      prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used);
+
+      if (simd16_cfg) {
+         prog_data->dispatch_16 = true;
+         prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16);
+         prog_data->dispatch_grf_start_reg_2 = simd16_grf_start;
+         prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used);
+      }
+   } else if (simd16_cfg) {
+      prog_data->dispatch_16 = true;
+      g.generate_code(simd16_cfg, 16);
+      prog_data->base.dispatch_grf_start_reg = simd16_grf_start;
+      prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used);
+   }
+
+   return g.get_assembly(final_assembly_size);
+}
+
+/**
+ * Emit the moves that build a three-component register from r0.1, r0.6 and
+ * r0.7 (in that order) for the compute shader work group ID.
+ *
+ * \return the newly allocated register, owned by mem_ctx
+ */
+fs_reg *
+fs_visitor::emit_cs_work_group_id_setup()
+{
+   assert(stage == MESA_SHADER_COMPUTE);
+
+   fs_reg *group_id = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
+
+   const struct brw_reg r0_1 =
+      retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD);
+   const struct brw_reg r0_6 =
+      retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD);
+   const struct brw_reg r0_7 =
+      retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD);
+
+   /* One MOV per component of the destination. */
+   bld.MOV(*group_id, r0_1);
+   bld.MOV(offset(*group_id, bld, 1), r0_6);
+   bld.MOV(offset(*group_id, bld, 2), r0_7);
+
+   return group_id;
+}
+
+/**
+ * Describe a push constant block of \p dwords dwords: record the dword
+ * count, the number of 8-dword registers needed to hold it (rounded up) and
+ * the resulting size at 32 bytes per register.
+ */
+static void
+fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
+{
+   block->dwords = dwords;
+
+   /* Whole registers only: 8 dwords, i.e. 32 bytes, each. */
+   block->regs = DIV_ROUND_UP(dwords, 8);
+   block->size = block->regs * 32;
+}
+
+/**
+ * Split the compute shader's push constants into a cross-thread block and a
+ * per-thread block, and fill in cs_prog_data->push accordingly.
+ *
+ * Without cross-thread support (before Haswell) everything is per-thread.
+ * Otherwise, when a thread local ID is present, only the last register (the
+ * one containing the ID) is per-thread; without one, everything is
+ * cross-thread.
+ */
+static void
+cs_fill_push_const_info(const struct gen_device_info *devinfo,
+                        struct brw_cs_prog_data *cs_prog_data)
+{
+   const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
+   const bool cross_thread_supported =
+      devinfo->gen > 7 || devinfo->is_haswell;
+   const bool fill_thread_id =
+      cs_prog_data->thread_local_id_index >= 0 &&
+      cs_prog_data->thread_local_id_index < (int)prog_data->nr_params;
+
+   /* The thread ID should be stored in the last param dword */
+   assert(prog_data->nr_params > 0 || !fill_thread_id);
+   assert(!fill_thread_id ||
+          cs_prog_data->thread_local_id_index ==
+             (int)prog_data->nr_params - 1);
+
+   unsigned cross_dwords, per_dwords;
+   if (!cross_thread_supported) {
+      cross_dwords = 0u;
+      per_dwords = prog_data->nr_params;
+   } else if (fill_thread_id) {
+      /* Fill all but the last register with cross-thread payload */
+      cross_dwords = 8 * (cs_prog_data->thread_local_id_index / 8);
+      per_dwords = prog_data->nr_params - cross_dwords;
+      assert(per_dwords > 0 && per_dwords <= 8);
+   } else {
+      /* Fill all data using cross-thread payload */
+      cross_dwords = prog_data->nr_params;
+      per_dwords = 0u;
+   }
+
+   fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_dwords);
+   fill_push_const_block_info(&cs_prog_data->push.per_thread, per_dwords);
+
+   /* The total is one cross-thread block plus a per-thread block for every
+    * thread, converted from bytes to dwords.
+    */
+   const unsigned total_dwords =
+      (cs_prog_data->push.per_thread.size * cs_prog_data->threads +
+       cs_prog_data->push.cross_thread.size) / 4;
+   fill_push_const_block_info(&cs_prog_data->push.total, total_dwords);
+
+   assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
+          cs_prog_data->push.per_thread.size == 0);
+   assert(cs_prog_data->push.cross_thread.dwords +
+          cs_prog_data->push.per_thread.dwords ==
+             prog_data->nr_params);
+}
+
+/**
+ * Record the SIMD width chosen for a compute shader and derive the number
+ * of threads needed to cover its local workgroup at that width.
+ */
+static void
+cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size)
+{
+   unsigned group_size = cs_prog_data->local_size[0] *
+      cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
+
+   cs_prog_data->simd_size = size;
+
+   /* Round up: a partially filled thread still counts. */
+   cs_prog_data->threads = (group_size + size - 1) / size;
+}
+
+/**
+ * Compile a compute shader to native code.
+ *
+ * A private clone of \p src_shader is lowered and post-processed, then
+ * compiled at up to three SIMD widths.  simd_required is the local workgroup
+ * size divided by max_cs_threads, rounded up; a width is only attempted when
+ * it is at least that large (SIMD32 additionally runs when DEBUG_DO32 is
+ * set).  The widest successful compile provides the generated code and
+ * prog_data->simd_size.
+ *
+ * \param error_str  if non-NULL, receives a copy of the failure message when
+ *                   no program could be generated
+ * \return whatever fs_generator::get_assembly(final_assembly_size) returns,
+ *         or NULL when no program could be generated
+ */
+const unsigned *
+brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_cs_prog_key *key,
+               struct brw_cs_prog_data *prog_data,
+               const nir_shader *src_shader,
+               int shader_time_index,
+               unsigned *final_assembly_size,
+               char **error_str)
+{
+   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
+   brw_nir_lower_cs_shared(shader);
+   prog_data->base.total_shared += shader->num_shared;
+
+   /* Now that we cloned the nir_shader, we can update num_uniforms based on
+    * the thread_local_id_index.
+    */
+   assert(prog_data->thread_local_id_index >= 0);
+   shader->num_uniforms =
+      MAX2(shader->num_uniforms,
+           (unsigned)4 * (prog_data->thread_local_id_index + 1));
+
+   brw_nir_lower_intrinsics(shader, &prog_data->base);
+   shader = brw_postprocess_nir(shader, compiler, true);
+
+   prog_data->local_size[0] = shader->info->cs.local_size[0];
+   prog_data->local_size[1] = shader->info->cs.local_size[1];
+   prog_data->local_size[2] = shader->info->cs.local_size[2];
+   unsigned local_workgroup_size =
+      shader->info->cs.local_size[0] * shader->info->cs.local_size[1] *
+      shader->info->cs.local_size[2];
+
+   /* Smallest number of channels per thread for which the whole workgroup
+    * still fits in max_cs_threads threads.
+    */
+   unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
+   unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);
+
+   cfg_t *cfg = NULL;
+   const char *fail_msg = NULL;
+
+   /* Now the main event: Visit the shader IR and generate our CS IR for it.
+    */
+   fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
+                 NULL, /* Never used in core profile */
+                 shader, 8, shader_time_index);
+   if (simd_required <= 8) {
+      if (!v8.run_cs()) {
+         fail_msg = v8.fail_msg;
+      } else {
+         cfg = v8.cfg;
+         cs_set_simd_size(prog_data, 8);
+         cs_fill_push_const_info(compiler->devinfo, prog_data);
+         prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
+      }
+   }
+
+   fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
+                  NULL, /* Never used in core profile */
+                  shader, 16, shader_time_index);
+   if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
+       !fail_msg && v8.max_dispatch_width >= 16 &&
+       simd_required <= 16) {
+      /* Try a SIMD16 compile */
+      if (simd_required <= 8)
+         v16.import_uniforms(&v8);
+      if (!v16.run_cs()) {
+         compiler->shader_perf_log(log_data,
+                                   "SIMD16 shader failed to compile: %s",
+                                   v16.fail_msg);
+         /* Only fatal when there is no narrower program to fall back to. */
+         if (!cfg) {
+            fail_msg =
+               "Couldn't generate SIMD16 program and not "
+               "enough threads for SIMD8";
+         }
+      } else {
+         cfg = v16.cfg;
+         cs_set_simd_size(prog_data, 16);
+         cs_fill_push_const_info(compiler->devinfo, prog_data);
+         prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
+      }
+   }
+
+   fs_visitor v32(compiler, log_data, mem_ctx, key, &prog_data->base,
+                  NULL, /* Never used in core profile */
+                  shader, 32, shader_time_index);
+   if (!fail_msg && v8.max_dispatch_width >= 32 &&
+       (simd_required > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
+      /* Try a SIMD32 compile */
+      if (simd_required <= 8)
+         v32.import_uniforms(&v8);
+      else if (simd_required <= 16)
+         v32.import_uniforms(&v16);
+
+      if (!v32.run_cs()) {
+         /* Report the SIMD32 visitor's own failure message; the SIMD16
+          * visitor's message is unrelated to this failure and may not even
+          * be set.
+          */
+         compiler->shader_perf_log(log_data,
+                                   "SIMD32 shader failed to compile: %s",
+                                   v32.fail_msg);
+         /* Only fatal when there is no narrower program to fall back to. */
+         if (!cfg) {
+            fail_msg =
+               "Couldn't generate SIMD32 program and not "
+               "enough threads for SIMD16";
+         }
+      } else {
+         cfg = v32.cfg;
+         cs_set_simd_size(prog_data, 32);
+         cs_fill_push_const_info(compiler->devinfo, prog_data);
+      }
+   }
+
+   if (unlikely(cfg == NULL)) {
+      assert(fail_msg);
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, fail_msg);
+
+      return NULL;
+   }
+
+   fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
+                  v8.promoted_constants, v8.runtime_check_aads_emit,
+                  MESA_SHADER_COMPUTE);
+   if (INTEL_DEBUG & DEBUG_CS) {
+      char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
+                                   shader->info->label ? shader->info->label :
+                                                         "unnamed",
+                                   shader->info->name);
+      g.enable_debug(name);
+   }
+
+   g.generate_code(cfg, prog_data->simd_size);
+
+   return g.get_assembly(final_assembly_size);
+}
+
+/**
+ * Debugging helper that checks the dispatch mask packing assumptions of
+ * brw_stage_has_packed_dispatch().  Call this from e.g. the top of
+ * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
+ * executed with an unexpected dispatch mask.
+ */
+static UNUSED void
+brw_fs_test_dispatch_packing(const fs_builder &bld)
+{
+   const gl_shader_stage stage = bld.shader->stage;
+
+   /* Nothing to check for stages that don't promise packed dispatch. */
+   if (!brw_stage_has_packed_dispatch(bld.shader->devinfo, stage,
+                                      bld.shader->stage_prog_data))
+      return;
+
+   const fs_builder ubld = bld.exec_all().group(1, 0);
+   const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
+   const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
+                        brw_dmask_reg());
+
+   /* tmp = mask & (mask + 1) */
+   ubld.ADD(tmp, mask, brw_imm_ud(1));
+   ubld.AND(tmp, mask, tmp);
+
+   /* This will loop forever if the dispatch mask doesn't have the expected
+    * form '2^n-1', in which case tmp will be non-zero.
+    */
+   bld.emit(BRW_OPCODE_DO);
+   bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
+   set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
+}
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
new file mode 100644
index 00000000000..00861ce5dad
--- /dev/null
+++ b/src/intel/compiler/brw_fs.h
@@ -0,0 +1,500 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Eric Anholt <[email protected]>
+ *
+ */
+
+#pragma once
+
+#include "brw_shader.h"
+#include "brw_ir_fs.h"
+#include "brw_fs_builder.h"
+#include "compiler/nir/nir.h"
+
+struct bblock_t;
+namespace {
+ struct acp_entry;
+}
+
+namespace brw {
+ class fs_live_variables;
+}
+
+struct brw_gs_compile;
+
+static inline fs_reg
+offset(const fs_reg &reg, const brw::fs_builder &bld, unsigned delta)
+{
+ return offset(reg, bld.dispatch_width(), delta);
+}
+
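+/**
+ * The FS IR front-end.
+ *
+ * Translates a NIR shader into FS IR (see emit_nir_code()). Despite the
+ * "fs" prefix this is not limited to fragment shaders: the run_vs(),
+ * run_tcs_single_patch(), run_tes(), run_gs() and run_cs() entry points cover
+ * the other stages as well.
+ */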
+class fs_visitor : public backend_shader
+{
+public:
+ fs_visitor(const struct brw_compiler *compiler, void *log_data,
+ void *mem_ctx,
+ const void *key,
+ struct brw_stage_prog_data *prog_data,
+ struct gl_program *prog,
+ const nir_shader *shader,
+ unsigned dispatch_width,
+ int shader_time_index,
+ const struct brw_vue_map *input_vue_map = NULL);
+ fs_visitor(const struct brw_compiler *compiler, void *log_data,
+ void *mem_ctx,
+ struct brw_gs_compile *gs_compile,
+ struct brw_gs_prog_data *prog_data,
+ const nir_shader *shader,
+ int shader_time_index);
+ void init();
+ ~fs_visitor();
+
+ fs_reg vgrf(const glsl_type *const type);
+ void import_uniforms(fs_visitor *v);
+ void setup_uniform_clipplane_values(gl_clip_plane *clip_planes);
+ void compute_clip_distance(gl_clip_plane *clip_planes);
+
+ fs_inst *get_instruction_generating_reg(fs_inst *start,
+ fs_inst *end,
+ const fs_reg &reg);
+
+ void VARYING_PULL_CONSTANT_LOAD(const brw::fs_builder &bld,
+ const fs_reg &dst,
+ const fs_reg &surf_index,
+ const fs_reg &varying_offset,
+ uint32_t const_offset);
+ void DEP_RESOLVE_MOV(const brw::fs_builder &bld, int grf);
+
+ bool run_fs(bool allow_spilling, bool do_rep_send);
+ bool run_vs(gl_clip_plane *clip_planes);
+ bool run_tcs_single_patch();
+ bool run_tes();
+ bool run_gs();
+ bool run_cs();
+ void optimize();
+ void allocate_registers(bool allow_spilling);
+ void setup_fs_payload_gen4();
+ void setup_fs_payload_gen6();
+ void setup_vs_payload();
+ void setup_gs_payload();
+ void setup_cs_payload();
+ void fixup_3src_null_dest();
+ void assign_curb_setup();
+ void calculate_urb_setup();
+ void assign_urb_setup();
+ void convert_attr_sources_to_hw_regs(fs_inst *inst);
+ void assign_vs_urb_setup();
+ void assign_tcs_single_patch_urb_setup();
+ void assign_tes_urb_setup();
+ void assign_gs_urb_setup();
+ bool assign_regs(bool allow_spilling, bool spill_all);
+ void assign_regs_trivial();
+ void calculate_payload_ranges(int payload_node_count,
+ int *payload_last_use_ip);
+ void setup_payload_interference(struct ra_graph *g, int payload_reg_count,
+ int first_payload_node);
+ int choose_spill_reg(struct ra_graph *g);
+ void spill_reg(int spill_reg);
+ void split_virtual_grfs();
+ bool compact_virtual_grfs();
+ void assign_constant_locations();
+ void lower_constant_loads();
+ void invalidate_live_intervals();
+ void calculate_live_intervals();
+ void calculate_register_pressure();
+ void validate();
+ bool opt_algebraic();
+ bool opt_redundant_discard_jumps();
+ bool opt_cse();
+ bool opt_cse_local(bblock_t *block);
+ bool opt_copy_propagation();
+ bool try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry);
+ bool try_constant_propagate(fs_inst *inst, acp_entry *entry);
+ bool opt_copy_propagation_local(void *mem_ctx, bblock_t *block,
+ exec_list *acp);
+ bool opt_drop_redundant_mov_to_flags();
+ bool opt_register_renaming();
+ bool register_coalesce();
+ bool compute_to_mrf();
+ bool eliminate_find_live_channel();
+ bool dead_code_eliminate();
+ bool remove_duplicate_mrf_writes();
+
+ bool opt_sampler_eot();
+ bool virtual_grf_interferes(int a, int b);
+ void schedule_instructions(instruction_scheduler_mode mode);
+ void insert_gen4_send_dependency_workarounds();
+ void insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
+ fs_inst *inst);
+ void insert_gen4_post_send_dependency_workarounds(bblock_t *block,
+ fs_inst *inst);
+ void vfail(const char *msg, va_list args);
+ void fail(const char *msg, ...);
+ void limit_dispatch_width(unsigned n, const char *msg);
+ void lower_uniform_pull_constant_loads();
+ bool lower_load_payload();
+ bool lower_pack();
+ bool lower_d2x();
+ bool lower_logical_sends();
+ bool lower_integer_multiplication();
+ bool lower_minmax();
+ bool lower_simd_width();
+ bool opt_combine_constants();
+
+ void emit_dummy_fs();
+ void emit_repclear_shader();
+ void emit_fragcoord_interpolation(fs_reg wpos);
+ fs_reg *emit_frontfacing_interpolation();
+ fs_reg *emit_samplepos_setup();
+ fs_reg *emit_sampleid_setup();
+ fs_reg *emit_samplemaskin_setup();
+ fs_reg *emit_vs_system_value(int location);
+ void emit_interpolation_setup_gen4();
+ void emit_interpolation_setup_gen6();
+ void compute_sample_position(fs_reg dst, fs_reg int_sample_pos);
+ fs_reg emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
+ const fs_reg &sampler);
+ void emit_gen6_gather_wa(uint8_t wa, fs_reg dst);
+ fs_reg resolve_source_modifiers(const fs_reg &src);
+ void emit_discard_jump();
+ bool opt_peephole_sel();
+ bool opt_peephole_predicated_break();
+ bool opt_saturate_propagation();
+ bool opt_cmod_propagation();
+ bool opt_zero_samples();
+
+ void emit_nir_code();
+ void nir_setup_outputs();
+ void nir_setup_uniforms();
+ void nir_emit_system_values();
+ void nir_emit_impl(nir_function_impl *impl);
+ void nir_emit_cf_list(exec_list *list);
+ void nir_emit_if(nir_if *if_stmt);
+ void nir_emit_loop(nir_loop *loop);
+ void nir_emit_block(nir_block *block);
+ void nir_emit_instr(nir_instr *instr);
+ void nir_emit_alu(const brw::fs_builder &bld, nir_alu_instr *instr);
+ void nir_emit_load_const(const brw::fs_builder &bld,
+ nir_load_const_instr *instr);
+ void nir_emit_vs_intrinsic(const brw::fs_builder &bld,
+ nir_intrinsic_instr *instr);
+ void nir_emit_tcs_intrinsic(const brw::fs_builder &bld,
+ nir_intrinsic_instr *instr);
+ void nir_emit_gs_intrinsic(const brw::fs_builder &bld,
+ nir_intrinsic_instr *instr);
+ void nir_emit_fs_intrinsic(const brw::fs_builder &bld,
+ nir_intrinsic_instr *instr);
+ void nir_emit_cs_intrinsic(const brw::fs_builder &bld,
+ nir_intrinsic_instr *instr);
+ void nir_emit_intrinsic(const brw::fs_builder &bld,
+ nir_intrinsic_instr *instr);
+ void nir_emit_tes_intrinsic(const brw::fs_builder &bld,
+ nir_intrinsic_instr *instr);
+ void nir_emit_ssbo_atomic(const brw::fs_builder &bld,
+ int op, nir_intrinsic_instr *instr);
+ void nir_emit_shared_atomic(const brw::fs_builder &bld,
+ int op, nir_intrinsic_instr *instr);
+ void nir_emit_texture(const brw::fs_builder &bld,
+ nir_tex_instr *instr);
+ void nir_emit_jump(const brw::fs_builder &bld,
+ nir_jump_instr *instr);
+ fs_reg get_nir_src(const nir_src &src);
+ fs_reg get_nir_src_imm(const nir_src &src);
+ fs_reg get_nir_dest(const nir_dest &dest);
+ fs_reg get_nir_image_deref(const nir_deref_var *deref);
+ fs_reg get_indirect_offset(nir_intrinsic_instr *instr);
+ void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
+ unsigned wr_mask);
+
+ bool optimize_extract_to_float(nir_alu_instr *instr,
+ const fs_reg &result);
+ bool optimize_frontfacing_ternary(nir_alu_instr *instr,
+ const fs_reg &result);
+
+ void emit_alpha_test();
+ fs_inst *emit_single_fb_write(const brw::fs_builder &bld,
+ fs_reg color1, fs_reg color2,
+ fs_reg src0_alpha, unsigned components);
+ void emit_fb_writes();
+ fs_inst *emit_non_coherent_fb_read(const brw::fs_builder &bld,
+ const fs_reg &dst, unsigned target);
+ void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg());
+ void set_gs_stream_control_data_bits(const fs_reg &vertex_count,
+ unsigned stream_id);
+ void emit_gs_control_data_bits(const fs_reg &vertex_count);
+ void emit_gs_end_primitive(const nir_src &vertex_count_nir_src);
+ void emit_gs_vertex(const nir_src &vertex_count_nir_src,
+ unsigned stream_id);
+ void emit_gs_thread_end();
+ void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src,
+ unsigned base_offset, const nir_src &offset_src,
+ unsigned num_components, unsigned first_component);
+ void emit_cs_terminate();
+ fs_reg *emit_cs_work_group_id_setup();
+
+ void emit_barrier();
+
+ void emit_shader_time_begin();
+ void emit_shader_time_end();
+ void SHADER_TIME_ADD(const brw::fs_builder &bld,
+ int shader_time_subindex,
+ fs_reg value);
+
+ fs_reg get_timestamp(const brw::fs_builder &bld);
+
+ struct brw_reg interp_reg(int location, int channel);
+
+ int implied_mrf_writes(fs_inst *inst);
+
+ virtual void dump_instructions();
+ virtual void dump_instructions(const char *name);
+ void dump_instruction(backend_instruction *inst);
+ void dump_instruction(backend_instruction *inst, FILE *file);
+
+ const void *const key;
+ const struct brw_sampler_prog_key_data *key_tex;
+
+ struct brw_gs_compile *gs_compile;
+
+ struct brw_stage_prog_data *prog_data;
+ struct gl_program *prog;
+
+ const struct brw_vue_map *input_vue_map;
+
+ int *virtual_grf_start;
+ int *virtual_grf_end;
+ brw::fs_live_variables *live_intervals;
+
+ int *regs_live_at_ip;
+
+ /** Number of uniform variable components visited. */
+ unsigned uniforms;
+
+ /** Byte-offset for the next available spot in the scratch space buffer. */
+ unsigned last_scratch;
+
+ /**
+ * Array mapping UNIFORM register numbers to the pull parameter index,
+ * or -1 if this uniform register isn't being uploaded as a pull constant.
+ */
+ int *pull_constant_loc;
+
+ /**
+ * Array mapping UNIFORM register numbers to the push parameter index,
+ * or -1 if this uniform register isn't being uploaded as a push constant.
+ */
+ int *push_constant_loc;
+
+ fs_reg frag_depth;
+ fs_reg frag_stencil;
+ fs_reg sample_mask;
+ fs_reg outputs[VARYING_SLOT_MAX];
+ fs_reg dual_src_output;
+ int first_non_payload_grf;
+ /** Either BRW_MAX_GRF or GEN7_MRF_HACK_START */
+ unsigned max_grf;
+
+ fs_reg *nir_locals;
+ fs_reg *nir_ssa_values;
+ fs_reg *nir_system_values;
+
+ bool failed;
+ char *fail_msg;
+
+ /** Register numbers for thread payload fields. */
+ struct thread_payload {
+ uint8_t source_depth_reg;
+ uint8_t source_w_reg;
+ uint8_t aa_dest_stencil_reg;
+ uint8_t dest_depth_reg;
+ uint8_t sample_pos_reg;
+ uint8_t sample_mask_in_reg;
+ uint8_t barycentric_coord_reg[BRW_BARYCENTRIC_MODE_COUNT];
+ uint8_t local_invocation_id_reg;
+
+ /** The number of thread payload registers the hardware will supply. */
+ uint8_t num_regs;
+ } payload;
+
+ bool source_depth_to_render_target;
+ bool runtime_check_aads_emit;
+
+ fs_reg pixel_x;
+ fs_reg pixel_y;
+ fs_reg wpos_w;
+ fs_reg pixel_w;
+ fs_reg delta_xy[BRW_BARYCENTRIC_MODE_COUNT];
+ fs_reg shader_start_time;
+ fs_reg userplane[MAX_CLIP_PLANES];
+ fs_reg final_gs_vertex_count;
+ fs_reg control_data_bits;
+ fs_reg invocation_id;
+
+ unsigned grf_used;
+ bool spilled_any_registers;
+
+ const unsigned dispatch_width; /**< 8, 16 or 32 */
+ unsigned min_dispatch_width;
+ unsigned max_dispatch_width;
+
+ int shader_time_index;
+
+ unsigned promoted_constants;
+ brw::fs_builder bld;
+};
+
+/**
+ * The fragment shader code generator.
+ *
+ * Translates FS IR to actual i965 assembly code.
+ */
+class fs_generator
+{
+public:
+ fs_generator(const struct brw_compiler *compiler, void *log_data,
+ void *mem_ctx,
+ const void *key,
+ struct brw_stage_prog_data *prog_data,
+ unsigned promoted_constants,
+ bool runtime_check_aads_emit,
+ gl_shader_stage stage);
+ ~fs_generator();
+
+ void enable_debug(const char *shader_name);
+ int generate_code(const cfg_t *cfg, int dispatch_width);
+ const unsigned *get_assembly(unsigned int *assembly_size);
+
+private:
+ void fire_fb_write(fs_inst *inst,
+ struct brw_reg payload,
+ struct brw_reg implied_header,
+ GLuint nr);
+ void generate_fb_write(fs_inst *inst, struct brw_reg payload);
+ void generate_fb_read(fs_inst *inst, struct brw_reg dst,
+ struct brw_reg payload);
+ void generate_urb_read(fs_inst *inst, struct brw_reg dst, struct brw_reg payload);
+ void generate_urb_write(fs_inst *inst, struct brw_reg payload);
+ void generate_cs_terminate(fs_inst *inst, struct brw_reg payload);
+ void generate_barrier(fs_inst *inst, struct brw_reg src);
+ void generate_linterp(fs_inst *inst, struct brw_reg dst,
+ struct brw_reg *src);
+ void generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
+ struct brw_reg surface_index,
+ struct brw_reg sampler_index);
+ void generate_get_buffer_size(fs_inst *inst, struct brw_reg dst,
+ struct brw_reg src,
+ struct brw_reg surf_index);
+ void generate_ddx(enum opcode op, struct brw_reg dst, struct brw_reg src);
+ void generate_ddy(enum opcode op, struct brw_reg dst, struct brw_reg src);
+ void generate_scratch_write(fs_inst *inst, struct brw_reg src);
+ void generate_scratch_read(fs_inst *inst, struct brw_reg dst);
+ void generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst);
+ void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
+ struct brw_reg index,
+ struct brw_reg offset);
+ void generate_uniform_pull_constant_load_gen7(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg surf_index,
+ struct brw_reg payload);
+ void generate_varying_pull_constant_load_gen4(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg index);
+ void generate_varying_pull_constant_load_gen7(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg index,
+ struct brw_reg offset);
+ void generate_mov_dispatch_to_flags(fs_inst *inst);
+
+ void generate_pixel_interpolator_query(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg src,
+ struct brw_reg msg_data,
+ unsigned msg_type);
+
+ void generate_set_sample_id(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg src0,
+ struct brw_reg src1);
+
+ void generate_discard_jump(fs_inst *inst);
+
+ void generate_pack_half_2x16_split(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg x,
+ struct brw_reg y);
+ void generate_unpack_half_2x16_split(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg src);
+
+ void generate_shader_time_add(fs_inst *inst,
+ struct brw_reg payload,
+ struct brw_reg offset,
+ struct brw_reg value);
+
+ void generate_mov_indirect(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg reg,
+ struct brw_reg indirect_byte_offset);
+
+ bool patch_discard_jumps_to_fb_writes();
+
+ const struct brw_compiler *compiler;
+ void *log_data; /* Passed to compiler->*_log functions */
+
+ const struct gen_device_info *devinfo;
+
+ struct brw_codegen *p;
+ const void * const key;
+ struct brw_stage_prog_data * const prog_data;
+
+ unsigned dispatch_width; /**< 8, 16 or 32 */
+
+ exec_list discard_halt_patches;
+ unsigned promoted_constants;
+ bool runtime_check_aads_emit;
+ bool debug_flag;
+ const char *shader_name;
+ gl_shader_stage stage;
+ void *mem_ctx;
+};
+
+void shuffle_32bit_load_result_to_64bit_data(const brw::fs_builder &bld,
+ const fs_reg &dst,
+ const fs_reg &src,
+ uint32_t components);
+
+void shuffle_64bit_data_for_32bit_write(const brw::fs_builder &bld,
+ const fs_reg &dst,
+ const fs_reg &src,
+ uint32_t components);
+fs_reg setup_imm_df(const brw::fs_builder &bld,
+ double v);
+
+enum brw_barycentric_mode brw_barycentric_mode(enum glsl_interp_mode mode,
+ nir_intrinsic_op op);
diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h
new file mode 100644
index 00000000000..87394bc17b3
--- /dev/null
+++ b/src/intel/compiler/brw_fs_builder.h
@@ -0,0 +1,662 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2010-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_FS_BUILDER_H
+#define BRW_FS_BUILDER_H
+
+#include "brw_ir_fs.h"
+#include "brw_shader.h"
+
+namespace brw {
+ /**
+ * Toolbox to assemble an FS IR program out of individual instructions.
+ *
+ * This object is meant to have an interface consistent with
+ * brw::vec4_builder. They cannot be fully interchangeable because
+ * brw::fs_builder generates scalar code while brw::vec4_builder generates
+ * vector code.
+ */
+ class fs_builder {
+ public:
+ /** Type used in this IR to represent a source of an instruction. */
+ typedef fs_reg src_reg;
+
+ /** Type used in this IR to represent the destination of an instruction. */
+ typedef fs_reg dst_reg;
+
+ /** Type used in this IR to represent an instruction. */
+ typedef fs_inst instruction;
+
+ /**
+ * Construct an fs_builder that inserts instructions into \p shader.
+ * \p dispatch_width gives the native execution width of the program.
+ */
+ fs_builder(backend_shader *shader,
+ unsigned dispatch_width) :
+ shader(shader), block(NULL), cursor(NULL),
+ _dispatch_width(dispatch_width),
+ _group(0),
+ force_writemask_all(false),
+ annotation()
+ {
+ }
+
+ /**
+ * Construct an fs_builder that inserts instructions into \p shader
+ * before instruction \p inst in basic block \p block. The default
+ * execution controls and debug annotation are initialized from the
+ * instruction passed as argument.
+ */
+ fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
+ shader(shader), block(block), cursor(inst),
+ _dispatch_width(inst->exec_size),
+ _group(inst->group),
+ force_writemask_all(inst->force_writemask_all)
+ {
+ annotation.str = inst->annotation;
+ annotation.ir = inst->ir;
+ }
+
+ /**
+ * Construct an fs_builder that inserts instructions before \p cursor in
+ * basic block \p block, inheriting other code generation parameters
+ * from this.
+ */
+ fs_builder
+ at(bblock_t *block, exec_node *cursor) const
+ {
+ fs_builder bld = *this;
+ bld.block = block;
+ bld.cursor = cursor;
+ return bld;
+ }
+
+ /**
+ * Construct an fs_builder appending instructions at the end of the
+ * instruction list of the shader, inheriting other code generation
+ * parameters from this.
+ *
+ * The returned builder has no basic block and its cursor is the tail
+ * sentinel of shader->instructions, so emit() links new instructions
+ * directly before that sentinel.
+ */
+ fs_builder
+ at_end() const
+ {
+ return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
+ }
+
+ /**
+ * Construct a builder specifying the default SIMD width and group of
+ * channel enable signals, inheriting other code generation parameters
+ * from this.
+ *
+ * \p n gives the default SIMD width, \p i gives the slot group used for
+ * predication and control flow masking in multiples of \p n channels.
+ *
+ * The new group is relative to the current one: \p i * \p n channels are
+ * added to the group already selected by this builder. Unless execution
+ * masking is disabled (see exec_all()), the requested group is asserted
+ * to fit within the current dispatch width.
+ */
+ fs_builder
+ group(unsigned n, unsigned i) const
+ {
+ assert(force_writemask_all ||
+ (n <= dispatch_width() && i < dispatch_width() / n));
+ fs_builder bld = *this;
+ bld._dispatch_width = n;
+ bld._group += i * n;
+ return bld;
+ }
+
+ /**
+ * Alias for group() with width equal to eight.
+ */
+ fs_builder
+ half(unsigned i) const
+ {
+ return group(8, i);
+ }
+
+ /**
+ * Construct a builder with per-channel control flow execution masking
+ * disabled if \p b is true. If control flow execution masking is
+ * already disabled this has no effect.
+ *
+ * Passing false never re-enables masking; the returned builder is then
+ * simply a copy of this one.
+ */
+ fs_builder
+ exec_all(bool b = true) const
+ {
+ fs_builder bld = *this;
+ if (b)
+ bld.force_writemask_all = true;
+ return bld;
+ }
+
+ /**
+ * Construct a builder with the given debug annotation info.
+ */
+ fs_builder
+ annotate(const char *str, const void *ir = NULL) const
+ {
+ fs_builder bld = *this;
+ bld.annotation.str = str;
+ bld.annotation.ir = ir;
+ return bld;
+ }
+
+ /**
+ * Get the SIMD width in use.
+ */
+ unsigned
+ dispatch_width() const
+ {
+ return _dispatch_width;
+ }
+
+ /**
+ * Get the channel group in use.
+ */
+ unsigned
+ group() const
+ {
+ return _group;
+ }
+
+ /**
+ * Allocate a virtual register of natural vector size (one for this IR)
+ * and SIMD width. \p n gives the amount of space to allocate in
+ * dispatch_width units (which is just enough space for one logical
+ * component in this IR).
+ *
+ * The size handed to the allocator is
+ * DIV_ROUND_UP(n * type_sz(type) * dispatch_width(), REG_SIZE), i.e. a
+ * count of REG_SIZE-sized units rounded up. If \p n is zero nothing is
+ * allocated and a null register retyped to \p type is returned instead.
+ */
+ dst_reg
+ vgrf(enum brw_reg_type type, unsigned n = 1) const
+ {
+ assert(dispatch_width() <= 32);
+
+ if (n > 0)
+ return dst_reg(VGRF, shader->alloc.allocate(
+ DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
+ REG_SIZE)),
+ type);
+ else
+ return retype(null_reg_ud(), type);
+ }
+
+ /**
+ * Create a null register of floating type.
+ */
+ dst_reg
+ null_reg_f() const
+ {
+ return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
+ }
+
+ dst_reg
+ null_reg_df() const
+ {
+ return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
+ }
+
+ /**
+ * Create a null register of signed integer type.
+ */
+ dst_reg
+ null_reg_d() const
+ {
+ return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+ }
+
+ /**
+ * Create a null register of unsigned integer type.
+ */
+ dst_reg
+ null_reg_ud() const
+ {
+ return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
+ }
+
+ /**
+ * Get the mask of SIMD channels enabled by dispatch and not yet
+ * disabled by discard.
+ *
+ * Stages other than fragment get the immediate 0xffffffff. For fragment
+ * shaders the mask is brw_flag_reg(0, 1) when the program's wm prog data
+ * has uses_kill set, and brw_vec1_grf(1, 7) retyped to UD otherwise. In
+ * the fragment case the builder's channels (group() + dispatch_width())
+ * are asserted to lie within the first 16.
+ */
+ src_reg
+ sample_mask_reg() const
+ {
+ assert(shader->stage != MESA_SHADER_FRAGMENT ||
+ group() + dispatch_width() <= 16);
+ if (shader->stage != MESA_SHADER_FRAGMENT) {
+ return brw_imm_d(0xffffffff);
+ } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
+ return brw_flag_reg(0, 1);
+ } else {
+ return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD);
+ }
+ }
+
+ /**
+ * Insert an instruction into the program.
+ */
+ instruction *
+ emit(const instruction &inst) const
+ {
+ return emit(new(shader->mem_ctx) instruction(inst));
+ }
+
+ /**
+ * Create and insert a nullary control instruction into the program.
+ */
+ instruction *
+ emit(enum opcode opcode) const
+ {
+ return emit(instruction(opcode, dispatch_width()));
+ }
+
+ /**
+ * Create and insert a nullary instruction into the program.
+ */
+ instruction *
+ emit(enum opcode opcode, const dst_reg &dst) const
+ {
+ return emit(instruction(opcode, dispatch_width(), dst));
+ }
+
+ /**
+ * Create and insert a unary instruction into the program.
+ */
+ instruction *
+ emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
+ {
+ switch (opcode) {
+ case SHADER_OPCODE_RCP:
+ case SHADER_OPCODE_RSQ:
+ case SHADER_OPCODE_SQRT:
+ case SHADER_OPCODE_EXP2:
+ case SHADER_OPCODE_LOG2:
+ case SHADER_OPCODE_SIN:
+ case SHADER_OPCODE_COS:
+ return emit(instruction(opcode, dispatch_width(), dst,
+ fix_math_operand(src0)));
+
+ default:
+ return emit(instruction(opcode, dispatch_width(), dst, src0));
+ }
+ }
+
+ /**
+ * Create and insert a binary instruction into the program.
+ */
+ instruction *
+ emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+ const src_reg &src1) const
+ {
+ switch (opcode) {
+ case SHADER_OPCODE_POW:
+ case SHADER_OPCODE_INT_QUOTIENT:
+ case SHADER_OPCODE_INT_REMAINDER:
+ return emit(instruction(opcode, dispatch_width(), dst,
+ fix_math_operand(src0),
+ fix_math_operand(src1)));
+
+ default:
+ return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
+
+ }
+ }
+
+ /**
+ * Create and insert a ternary instruction into the program.
+ */
+ instruction *
+ emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+ const src_reg &src1, const src_reg &src2) const
+ {
+ switch (opcode) {
+ case BRW_OPCODE_BFE:
+ case BRW_OPCODE_BFI2:
+ case BRW_OPCODE_MAD:
+ case BRW_OPCODE_LRP:
+ return emit(instruction(opcode, dispatch_width(), dst,
+ fix_3src_operand(src0),
+ fix_3src_operand(src1),
+ fix_3src_operand(src2)));
+
+ default:
+ return emit(instruction(opcode, dispatch_width(), dst,
+ src0, src1, src2));
+ }
+ }
+
+ /**
+ * Create and insert an instruction with a variable number of sources
+ * into the program.
+ */
+ instruction *
+ emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
+ unsigned n) const
+ {
+ return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
+ }
+
+ /**
+ * Insert a preallocated instruction into the program.
+ *
+ * The group, force_writemask_all, annotation and ir fields of \p inst
+ * are overwritten with this builder's settings. The exec_size of
+ * \p inst must equal the builder's dispatch width unless
+ * force_writemask_all is set. The instruction is linked in before the
+ * cursor; when the builder has a basic block, the block-aware
+ * insert_before(block, inst) overload is used instead of the plain
+ * list insertion. Returns \p inst.
+ */
+ instruction *
+ emit(instruction *inst) const
+ {
+ assert(inst->exec_size <= 32);
+ assert(inst->exec_size == dispatch_width() ||
+ force_writemask_all);
+
+ inst->group = _group;
+ inst->force_writemask_all = force_writemask_all;
+ inst->annotation = annotation.str;
+ inst->ir = annotation.ir;
+
+ if (block)
+ static_cast<instruction *>(cursor)->insert_before(block, inst);
+ else
+ cursor->insert_before(inst);
+
+ return inst;
+ }
+
+ /**
+ * Select \p src0 if the comparison of both sources with the given
+ * conditional mod evaluates to true, otherwise select \p src1.
+ *
+ * Generally useful to get the minimum or maximum of two values.
+ */
+ instruction *
+ emit_minmax(const dst_reg &dst, const src_reg &src0,
+ const src_reg &src1, brw_conditional_mod mod) const
+ {
+ assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
+
+ return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
+ fix_unsigned_negate(src1)));
+ }
+
+ /**
+ * Copy any live channel from \p src to the first channel of the result.
+ *
+ * Emits SHADER_OPCODE_FIND_LIVE_CHANNEL followed by
+ * SHADER_OPCODE_BROADCAST with execution masking disabled, and returns
+ * component 0 of the broadcast destination.
+ */
+ src_reg
+ emit_uniformize(const src_reg &src) const
+ {
+ /* FIXME: We use a vector chan_index and dst to allow constant and
+ * copy propagation to move result all the way into the consuming
+ * instruction (typically a surface index or sampler index for a
+ * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
+ * dispatch. Once we teach const/copy propagation about scalars we
+ * should go back to scalar destinations here.
+ */
+ const fs_builder ubld = exec_all();
+ const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
+ const dst_reg dst = vgrf(src.type);
+
+ ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
+ ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
+
+ return src_reg(component(dst, 0));
+ }
+
+ /**
+ * Assorted arithmetic ops.
+ * @{
+ */
+#define ALU1(op) \
+ instruction * \
+ op(const dst_reg &dst, const src_reg &src0) const \
+ { \
+ return emit(BRW_OPCODE_##op, dst, src0); \
+ }
+
+#define ALU2(op) \
+ instruction * \
+ op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
+ { \
+ return emit(BRW_OPCODE_##op, dst, src0, src1); \
+ }
+
+#define ALU2_ACC(op) \
+ instruction * \
+ op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
+ { \
+ instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \
+ inst->writes_accumulator = true; \
+ return inst; \
+ }
+
+#define ALU3(op) \
+ instruction * \
+ op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
+ const src_reg &src2) const \
+ { \
+ return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \
+ }
+
+ ALU2(ADD)
+ ALU2_ACC(ADDC)
+ ALU2(AND)
+ ALU2(ASR)
+ ALU2(AVG)
+ ALU3(BFE)
+ ALU2(BFI1)
+ ALU3(BFI2)
+ ALU1(BFREV)
+ ALU1(CBIT)
+ ALU2(CMPN)
+ ALU3(CSEL)
+ ALU1(DIM)
+ ALU2(DP2)
+ ALU2(DP3)
+ ALU2(DP4)
+ ALU2(DPH)
+ ALU1(F16TO32)
+ ALU1(F32TO16)
+ ALU1(FBH)
+ ALU1(FBL)
+ ALU1(FRC)
+ ALU2(LINE)
+ ALU1(LZD)
+ ALU2(MAC)
+ ALU2_ACC(MACH)
+ ALU3(MAD)
+ ALU1(MOV)
+ ALU2(MUL)
+ ALU1(NOT)
+ ALU2(OR)
+ ALU2(PLN)
+ ALU1(RNDD)
+ ALU1(RNDE)
+ ALU1(RNDU)
+ ALU1(RNDZ)
+ ALU2(SAD2)
+ ALU2_ACC(SADA2)
+ ALU2(SEL)
+ ALU2(SHL)
+ ALU2(SHR)
+ ALU2_ACC(SUBB)
+ ALU2(XOR)
+
+#undef ALU3
+#undef ALU2_ACC
+#undef ALU2
+#undef ALU1
+ /** @} */
+
+ /**
+ * CMP: Sets the low bit of the destination channels with the result
+ * of the comparison, while the upper bits are undefined, and updates
+ * the flag register with the packed 16 bits of the result.
+ *
+ * The destination is retyped to the type of \p src0 and both sources
+ * are passed through fix_unsigned_negate() before the instruction is
+ * emitted; \p condition is attached as its conditional modifier.
+ */
+ instruction *
+ CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
+ brw_conditional_mod condition) const
+ {
+ /* Take the instruction:
+ *
+ * CMP null<d> src0<f> src1<f>
+ *
+ * Original gen4 does type conversion to the destination type
+ * before comparison, producing garbage results for floating
+ * point comparisons.
+ *
+ * The destination type doesn't matter on newer generations,
+ * so we set the type to match src0 so we can compact the
+ * instruction.
+ */
+ return set_condmod(condition,
+ emit(BRW_OPCODE_CMP, retype(dst, src0.type),
+ fix_unsigned_negate(src0),
+ fix_unsigned_negate(src1)));
+ }
+
+ /**
+ * Gen4 predicated IF.
+ */
+ instruction *
+ IF(brw_predicate predicate) const
+ {
+ return set_predicate(predicate, emit(BRW_OPCODE_IF));
+ }
+
+ /**
+ * Emit a linear interpolation instruction.
+ *
+ * The result is x*(1-a) + y*a. On gen6 and later a single LRP is
+ * emitted with the operands reordered to match the hardware's operand
+ * order. On earlier generations the expression is expanded into two MULs
+ * and two ADDs using freshly allocated temporaries of the destination
+ * type, and the final ADD writing \p dst is the instruction returned.
+ */
+ instruction *
+ LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
+ const src_reg &a) const
+ {
+ if (shader->devinfo->gen >= 6) {
+ /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
+ * we need to reorder the operands.
+ */
+ return emit(BRW_OPCODE_LRP, dst, a, y, x);
+
+ } else {
+ /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
+ const dst_reg y_times_a = vgrf(dst.type);
+ const dst_reg one_minus_a = vgrf(dst.type);
+ const dst_reg x_times_one_minus_a = vgrf(dst.type);
+
+ MUL(y_times_a, y, a);
+ ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
+ MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
+ return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
+ }
+ }
+
+ /**
+ * Collect a number of registers in a contiguous range of registers.
+ *
+ * \p src points to \p sources source registers. The first
+ * \p header_size of them are accounted as one REG_SIZE each in the
+ * instruction's size_written; every remaining source adds
+ * dispatch_width() * type_sz(src[i].type) * dst.stride, aligned up to
+ * REG_SIZE. header_size is also recorded on the returned instruction.
+ */
+ instruction *
+ LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
+ unsigned sources, unsigned header_size) const
+ {
+ instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
+ inst->header_size = header_size;
+ inst->size_written = header_size * REG_SIZE;
+ for (unsigned i = header_size; i < sources; i++) {
+ inst->size_written +=
+ ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
+ REG_SIZE);
+ }
+
+ return inst;
+ }
+
+ backend_shader *shader;
+
+ private:
+ /**
+ * Workaround for negation of UD registers. See comment in
+ * fs_generator::generate_code() for more details.
+ *
+ * A source of type UD with the negate modifier set is first MOVed into
+ * a freshly allocated UD temporary, and that temporary is returned. Any
+ * other source is returned unchanged.
+ */
+ src_reg
+ fix_unsigned_negate(const src_reg &src) const
+ {
+ if (src.type == BRW_REGISTER_TYPE_UD &&
+ src.negate) {
+ dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
+ MOV(temp, src);
+ return src_reg(temp);
+ } else {
+ return src;
+ }
+ }
+
+ /**
+ * Workaround for source register modes not supported by the ternary
+ * instruction encoding.
+ *
+ * Sources in the VGRF or UNIFORM register files, or with a stride
+ * greater than one, are returned as-is. Anything else is first MOVed
+ * into a newly allocated VGRF of the same type, which is returned
+ * instead.
+ */
+ src_reg
+ fix_3src_operand(const src_reg &src) const
+ {
+ if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
+ return src;
+ } else {
+ dst_reg expanded = vgrf(src.type);
+ MOV(expanded, src);
+ return expanded;
+ }
+ }
+
+ /**
+ * Workaround for source register modes not supported by the math
+ * instruction.
+ *
+ * On gen6, sources in the IMM or UNIFORM files and sources carrying an
+ * abs or negate modifier are MOVed into a temporary of the same type,
+ * which is returned. On gen7 only IMM sources get this treatment. In
+ * every other case \p src is returned unchanged.
+ */
+ src_reg
+ fix_math_operand(const src_reg &src) const
+ {
+ /* Can't do hstride == 0 args on gen6 math, so expand it out. We
+ * might be able to do better by doing execsize = 1 math and then
+ * expanding that result out, but we would need to be careful with
+ * masking.
+ *
+ * Gen6 hardware ignores source modifiers (negate and abs) on math
+ * instructions, so we also move to a temp to set those up.
+ *
+ * Gen7 relaxes most of the above restrictions, but still can't use IMM
+ * operands to math
+ */
+ if ((shader->devinfo->gen == 6 &&
+ (src.file == IMM || src.file == UNIFORM ||
+ src.abs || src.negate)) ||
+ (shader->devinfo->gen == 7 && src.file == IMM)) {
+ const dst_reg tmp = vgrf(src.type);
+ MOV(tmp, src);
+ return tmp;
+ } else {
+ return src;
+ }
+ }
+
+ bblock_t *block;
+ exec_node *cursor;
+
+ unsigned _dispatch_width;
+ unsigned _group;
+ bool force_writemask_all;
+
+ /** Debug annotation info. */
+ struct {
+ const char *str;
+ const void *ir;
+ } annotation;
+ };
+}
+
+#endif
diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp
new file mode 100644
index 00000000000..2d50c92e9e3
--- /dev/null
+++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_eu.h"
+
+/** @file brw_fs_cmod_propagation.cpp
+ *
+ * Implements a pass that propagates the conditional modifier from a CMP x 0.0
+ * instruction into the instruction that generated x. For instance, in this
+ * sequence
+ *
+ * add(8) g70<1>F g69<8,8,1>F 4096F
+ * cmp.ge.f0(8) null g70<8,8,1>F 0F
+ *
+ * we can do the comparison as part of the ADD instruction directly:
+ *
+ * add.ge.f0(8) g70<1>F g69<8,8,1>F 4096F
+ *
+ * If there had been a use of the flag register and another CMP using g70
+ *
+ * add.ge.f0(8) g70<1>F g69<8,8,1>F 4096F
+ * (+f0) sel(8) g71<F> g72<8,8,1>F g73<8,8,1>F
+ * cmp.ge.f0(8) null g70<8,8,1>F 0F
+ *
+ * we can recognize that the CMP is generating the flag value that already
+ * exists and therefore remove the instruction.
+ */
+
+ /**
+  * Run conditional-modifier propagation over a single basic block.
+  *
+  * The block is walked bottom-up looking for candidate instructions: an
+  * unpredicated AND, CMP or MOV with a null destination whose first source is
+  * a VGRF without an abs modifier.  For each candidate the preceding
+  * instructions of the block are scanned backwards for the one writing that
+  * source.  The candidate is then removed either because the flag value it
+  * would produce already exists, or after its conditional modifier has been
+  * folded into the generating instruction.
+  *
+  * \param devinfo  device description, forwarded to fs_inst::flags_read()
+  * \param block    basic block to optimize in place
+  * \return true if at least one instruction was removed
+  */
+ static bool
+ opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
+ {
+    bool progress = false;
+
+    foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
+       if ((inst->opcode != BRW_OPCODE_AND &&
+            inst->opcode != BRW_OPCODE_CMP &&
+            inst->opcode != BRW_OPCODE_MOV) ||
+           inst->predicate != BRW_PREDICATE_NONE ||
+           !inst->dst.is_null() ||
+           inst->src[0].file != VGRF ||
+           inst->src[0].abs)
+          continue;
+
+       /* Only an AND.NZ can be propagated.  Many AND.Z instructions are
+        * generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code).
+        * Propagating those would require inverting the condition on the CMP.
+        * This changes both the flag value and the register destination of the
+        * CMP.  That result may be used elsewhere, so we can't change its value
+        * on a whim.
+        */
+       if (inst->opcode == BRW_OPCODE_AND &&
+           !(inst->src[1].is_one() &&
+             inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+             !inst->src[0].negate))
+          continue;
+
+       /* Only comparisons against zero are candidates. */
+       if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero())
+          continue;
+
+       /* A flag-setting MOV is only handled for the .nz condition. */
+       if (inst->opcode == BRW_OPCODE_MOV &&
+           inst->conditional_mod != BRW_CONDITIONAL_NZ)
+          continue;
+
+       /* Becomes true once an instruction between the candidate and the
+        * generator of its source reads the flag.  From then on the generator
+        * may only be reused if it already carries the very same conditional
+        * modifier.
+        */
+       bool read_flag = false;
+       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+                              inst->src[0], inst->size_read(0))) {
+             /* The generator has to write exactly the region the candidate
+              * reads, with the same execution size.
+              */
+             if (scan_inst->is_partial_write() ||
+                 scan_inst->dst.offset != inst->src[0].offset ||
+                 scan_inst->exec_size != inst->exec_size)
+                break;
+
+             /* CMP's result is the same regardless of dest type. */
+             if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+                 scan_inst->opcode == BRW_OPCODE_CMP &&
+                 (inst->dst.type == BRW_REGISTER_TYPE_D ||
+                  inst->dst.type == BRW_REGISTER_TYPE_UD)) {
+                inst->remove(block);
+                progress = true;
+                break;
+             }
+
+             /* If the AND wasn't handled by the previous case, it isn't safe
+              * to remove it.
+              */
+             if (inst->opcode == BRW_OPCODE_AND)
+                break;
+
+             /* Comparisons operate differently for ints and floats */
+             if (scan_inst->dst.type != inst->dst.type &&
+                 (scan_inst->dst.type == BRW_REGISTER_TYPE_F ||
+                  inst->dst.type == BRW_REGISTER_TYPE_F))
+                break;
+
+             /* If the instruction generating inst's source also wrote the
+              * flag, and inst is doing a simple .nz comparison, then inst
+              * is redundant - the appropriate value is already in the flag
+              * register.  Delete inst.
+              */
+             if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+                 !inst->src[0].negate &&
+                 scan_inst->flags_written()) {
+                inst->remove(block);
+                progress = true;
+                break;
+             }
+
+             /* The conditional mod of the CMP/CMPN instructions behaves
+              * specially because the flag output is not calculated from the
+              * result of the instruction, but the other way around, which
+              * means that even if the condmod to propagate and the condmod
+              * from the CMP instruction are the same they will in general give
+              * different results because they are evaluated based on different
+              * inputs.
+              */
+             if (scan_inst->opcode == BRW_OPCODE_CMP ||
+                 scan_inst->opcode == BRW_OPCODE_CMPN)
+                break;
+
+             /* Otherwise, try propagating the conditional. */
+             enum brw_conditional_mod cond =
+                inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
+                                    : inst->conditional_mod;
+
+             if (scan_inst->can_do_cmod() &&
+                 ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
+                  scan_inst->conditional_mod == cond)) {
+                scan_inst->conditional_mod = cond;
+                inst->remove(block);
+                progress = true;
+             }
+             break;
+          }
+
+          /* An unrelated flag write in between ends the search. */
+          if (scan_inst->flags_written())
+             break;
+
+          read_flag = read_flag || scan_inst->flags_read(devinfo);
+       }
+    }
+
+    return progress;
+ }
+
+ /**
+  * Apply opt_cmod_propagation_local() to every block of the CFG, visiting
+  * the blocks in reverse order, and invalidate the live intervals if any
+  * block reported a change.
+  *
+  * \return true if any block was modified
+  */
+ bool
+ fs_visitor::opt_cmod_propagation()
+ {
+    bool changed = false;
+
+    foreach_block_reverse(block, cfg) {
+       if (opt_cmod_propagation_local(devinfo, block))
+          changed = true;
+    }
+
+    if (!changed)
+       return false;
+
+    invalidate_live_intervals();
+    return true;
+ }
diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp
new file mode 100644
index 00000000000..e0c95d379b8
--- /dev/null
+++ b/src/intel/compiler/brw_fs_combine_constants.cpp
@@ -0,0 +1,329 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_combine_constants.cpp
+ *
+ * This file contains the opt_combine_constants() pass that runs after the
+ * regular optimization loop. It passes over the instruction list and
+ * selectively promotes immediate values to registers by emitting a mov(1)
+ * instruction.
+ *
+ * This is useful on Gen 7 particularly, because a few instructions can be
+ * coissued (i.e., issued in the same cycle as another thread on the same EU
+ * issues an instruction) under some circumstances, one of which is that they
+ * cannot use immediate values.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+
+using namespace brw;
+
+static const bool debug = false;
+
+ /**
+  * Tell whether \p inst is one of the opcodes (MOV, CMP, ADD, MUL) that could
+  * co-issue if its immediate source were replaced with a GRF source.  Only
+  * Gen 7 is considered; every other generation yields false.
+  */
+ static bool
+ could_coissue(const struct gen_device_info *devinfo, const fs_inst *inst)
+ {
+    if (devinfo->gen != 7)
+       return false;
+
+    return inst->opcode == BRW_OPCODE_MOV ||
+           inst->opcode == BRW_OPCODE_CMP ||
+           inst->opcode == BRW_OPCODE_ADD ||
+           inst->opcode == BRW_OPCODE_MUL;
+ }
+
+/**
+ * Returns true for instructions that don't support immediate sources.
+ */
+static bool
+must_promote_imm(const struct gen_device_info *devinfo, const fs_inst *inst)
+{
+ switch (inst->opcode) {
+ case SHADER_OPCODE_POW:
+ return devinfo->gen < 8;
+ case BRW_OPCODE_MAD:
+ case BRW_OPCODE_LRP:
+ return true;
+ default:
+ return false;
+ }
+}
+
+ /**
+ * A box for putting fs_regs in a linked list.
+ *
+ * Each node stores a pointer to an fs_reg together with the exec_node that
+ * chains the node into an exec_list.
+ */
+ struct reg_link {
+ DECLARE_RALLOC_CXX_OPERATORS(reg_link)
+
+ reg_link(fs_reg *reg) : reg(reg) {}
+
+ struct exec_node link; /**< List linkage; link() returns its address. */
+ fs_reg *reg; /**< The register operand this node refers to. */
+ };
+
+ /**
+  * Allocate a reg_link for \p reg out of \p mem_ctx and return a pointer to
+  * its embedded list node, ready to be pushed onto an exec_list.
+  */
+ static struct exec_node *
+ link(void *mem_ctx, fs_reg *reg)
+ {
+    reg_link *const box = new(mem_ctx) reg_link(reg);
+
+    return &box->link;
+ }
+
+/**
+ * Information about an immediate value.
+ */
+struct imm {
+ /** The common ancestor of all blocks using this immediate value. */
+ bblock_t *block;
+
+ /**
+ * The instruction generating the immediate value, if all uses are contained
+ * within a single basic block. Otherwise, NULL.
+ */
+ fs_inst *inst;
+
+ /**
+ * A list of fs_regs that refer to this immediate. If we promote it, we'll
+ * have to patch these up to refer to the new GRF.
+ */
+ exec_list *uses;
+
+ /** The immediate value. We currently only handle floats. */
+ float val;
+
+ /**
+ * The GRF register and subregister number where we've decided to store the
+ * constant value.
+ */
+ uint8_t subreg_offset;
+ uint16_t nr;
+
+ /** The number of coissuable instructions using this immediate. */
+ uint16_t uses_by_coissue;
+
+ /**
+ * Whether this constant is used by an instruction that can't handle an
+ * immediate source (and already has to be promoted to a GRF).
+ */
+ bool must_promote;
+
+ uint16_t first_use_ip;
+ uint16_t last_use_ip;
+};
+
+ /**
+ * The working set of information about immediates.
+ *
+ * A growable array: new_imm() doubles \c size whenever \c len reaches it.
+ */
+ struct table {
+ struct imm *imm; /**< Storage for the entries. */
+ int size; /**< Number of entries the storage can currently hold. */
+ int len; /**< Number of entries in use. */
+ };
+
+ /**
+  * Linearly search \p table for an entry whose value compares equal to
+  * \p val.
+  *
+  * \return the first matching entry, or NULL when there is none
+  */
+ static struct imm *
+ find_imm(struct table *table, float val)
+ {
+    struct imm *const end = table->imm + table->len;
+
+    for (struct imm *entry = table->imm; entry < end; entry++) {
+       if (entry->val == val)
+          return entry;
+    }
+
+    return NULL;
+ }
+
+ /**
+  * Reserve the next free entry of \p table, doubling the storage (allocated
+  * from \p mem_ctx) first when it is full.
+  *
+  * The returned entry is not initialized by this function.
+  */
+ static struct imm *
+ new_imm(struct table *table, void *mem_ctx)
+ {
+    const bool full = table->len == table->size;
+
+    if (full) {
+       table->size *= 2;
+       table->imm = reralloc(mem_ctx, table->imm, struct imm, table->size);
+    }
+
+    struct imm *const slot = &table->imm[table->len];
+    table->len++;
+    return slot;
+ }
+
+/**
+ * Comparator used for sorting an array of imm structures.
+ *
+ * We sort by basic block number, then last use IP, then first use IP (least
+ * to greatest). This sorting causes immediates live in the same area to be
+ * allocated to the same register in the hopes that all values will be dead
+ * about the same time and the register can be reused.
+ */
+static int
+compare(const void *_a, const void *_b)
+{
+ const struct imm *a = (const struct imm *)_a,
+ *b = (const struct imm *)_b;
+
+ int block_diff = a->block->num - b->block->num;
+ if (block_diff)
+ return block_diff;
+
+ int end_diff = a->last_use_ip - b->last_use_ip;
+ if (end_diff)
+ return end_diff;
+
+ return a->first_use_ip - b->first_use_ip;
+}
+ /**
+ * Promote selected float immediates to registers.
+ *
+ * The pass records every float immediate used by an instruction for which
+ * could_coissue() or must_promote_imm() holds, discards the values that
+ * neither must be promoted nor have at least four co-issuable uses, loads
+ * each remaining value into a VGRF with a MOV and finally rewrites all
+ * recorded uses to read that register instead of the immediate.
+ *
+ * Returns true if at least one immediate was promoted, false otherwise.
+ */
+ bool
+ fs_visitor::opt_combine_constants()
+ {
+ void *const_ctx = ralloc_context(NULL);
+
+ struct table table;
+ table.size = 8;
+ table.len = 0;
+ table.imm = ralloc_array(const_ctx, struct imm, table.size);
+
+ cfg->calculate_idom();
+ unsigned ip = -1; /* incremented before use, so the first instruction gets 0 */
+
+ /* Make a pass through all instructions and count the number of times each
+ * constant is used by coissueable instructions or instructions that cannot
+ * take immediate arguments.
+ */
+ foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ ip++;
+
+ if (!could_coissue(devinfo, inst) && !must_promote_imm(devinfo, inst))
+ continue;
+
+ for (int i = 0; i < inst->sources; i++) {
+ if (inst->src[i].file != IMM ||
+ inst->src[i].type != BRW_REGISTER_TYPE_F)
+ continue;
+
+ float val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
+ fabs(inst->src[i].f); /* sign is restored through the negate bit when uses are rewritten */
+ struct imm *imm = find_imm(&table, val);
+
+ if (imm) {
+ bblock_t *intersection = cfg_t::intersect(block, imm->block);
+ if (intersection != imm->block)
+ imm->inst = NULL; /* uses no longer confined to the recorded block */
+ imm->block = intersection;
+ imm->uses->push_tail(link(const_ctx, &inst->src[i]));
+ imm->uses_by_coissue += could_coissue(devinfo, inst);
+ imm->must_promote = imm->must_promote || must_promote_imm(devinfo, inst);
+ imm->last_use_ip = ip;
+ } else {
+ imm = new_imm(&table, const_ctx);
+ imm->block = block;
+ imm->inst = inst;
+ imm->uses = new(const_ctx) exec_list();
+ imm->uses->push_tail(link(const_ctx, &inst->src[i]));
+ imm->val = val;
+ imm->uses_by_coissue = could_coissue(devinfo, inst);
+ imm->must_promote = must_promote_imm(devinfo, inst);
+ imm->first_use_ip = ip;
+ imm->last_use_ip = ip;
+ }
+ }
+ }
+
+ /* Remove constants from the table that don't have enough uses to make them
+ * profitable to store in a register.
+ *
+ * A removed entry is overwritten with the last one, so the index is only
+ * advanced when the current entry is kept.
+ */
+ for (int i = 0; i < table.len;) {
+ struct imm *imm = &table.imm[i];
+
+ if (!imm->must_promote && imm->uses_by_coissue < 4) {
+ table.imm[i] = table.imm[table.len - 1];
+ table.len--;
+ continue;
+ }
+ i++;
+ }
+ if (table.len == 0) {
+ ralloc_free(const_ctx);
+ return false;
+ }
+ if (cfg->num_blocks != 1)
+ qsort(table.imm, table.len, sizeof(struct imm), compare);
+
+ /* Insert MOVs to load the constant values into GRFs. */
+ fs_reg reg(VGRF, alloc.allocate(1));
+ reg.stride = 0;
+ for (int i = 0; i < table.len; i++) {
+ struct imm *imm = &table.imm[i];
+ /* Insert it either before the instruction that generated the immediate
+ * or after the last non-control flow instruction of the common ancestor.
+ */
+ exec_node *n = (imm->inst ? imm->inst :
+ imm->block->last_non_control_flow_inst()->next);
+ const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0);
+
+ ibld.MOV(reg, brw_imm_f(imm->val));
+ imm->nr = reg.nr;
+ imm->subreg_offset = reg.offset;
+
+ reg.offset += sizeof(float); /* move on to the next float-sized slot */
+ if (reg.offset == 8 * sizeof(float)) { /* eight floats stored: start a new VGRF */
+ reg.nr = alloc.allocate(1);
+ reg.offset = 0;
+ }
+ }
+ promoted_constants = table.len;
+
+ /* Rewrite the immediate sources to refer to the new GRFs. */
+ for (int i = 0; i < table.len; i++) {
+ foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
+ fs_reg *reg = link->reg;
+ reg->file = VGRF;
+ reg->nr = table.imm[i].nr;
+ reg->offset = table.imm[i].subreg_offset;
+ reg->stride = 0;
+ reg->negate = signbit(reg->f) != signbit(table.imm[i].val);
+ assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
+ fabsf(reg->f) == fabs(table.imm[i].val));
+ }
+ }
+
+ if (debug) {
+ for (int i = 0; i < table.len; i++) {
+ struct imm *imm = &table.imm[i];
+
+ printf("%.3fF - block %3d, reg %3d sub %2d, Uses: (%2d, %2d), "
+ "IP: %4d to %4d, length %4d\n",
+ imm->val,
+ imm->block->num,
+ imm->nr,
+ imm->subreg_offset,
+ imm->must_promote,
+ imm->uses_by_coissue,
+ imm->first_use_ip,
+ imm->last_use_ip,
+ imm->last_use_ip - imm->first_use_ip);
+ }
+ }
+
+ ralloc_free(const_ctx);
+ invalidate_live_intervals();
+
+ return true;
+ }
diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp
new file mode 100644
index 00000000000..cb117396089
--- /dev/null
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -0,0 +1,869 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_copy_propagation.cpp
+ *
+ * Support for global copy propagation in two passes: A local pass that does
+ * intra-block copy (and constant) propagation, and a global pass that uses
+ * dataflow analysis on the copies available at the end of each block to re-do
+ * local copy propagation with more copies available.
+ *
+ * See Muchnick's Advanced Compiler Design and Implementation, section
+ * 12.5 (p356).
+ */
+
+#define ACP_HASH_SIZE 16
+
+#include "util/bitset.h"
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_eu.h"
+
+ namespace { /* avoid conflict with opt_copy_propagation_elements */
+ struct acp_entry : public exec_node {
+ fs_reg dst; /**< Region written by the copy. */
+ fs_reg src; /**< Value the copy reads; substituted for dst when propagating. */
+ uint8_t size_written; /**< Extent of dst, as passed to regions_overlap(). */
+ uint8_t size_read; /**< Extent of src, as passed to regions_overlap(). */
+ enum opcode opcode; /**< Opcode of the copying instruction (presumably; it is filled in outside this view). */
+ bool saturate; /**< Presumably whether the copy saturates; try_copy_propagate() ORs it into the user's saturate. */
+ };
+
+ struct block_data {
+ /**
+ * Which entries in the fs_copy_prop_dataflow acp table are live at the
+ * start of this block. This is the useful output of the analysis, since
+ * it lets us plug those into the local copy propagation on the second
+ * pass.
+ */
+ BITSET_WORD *livein;
+
+ /**
+ * Which entries in the fs_copy_prop_dataflow acp table are live at the end
+ * of this block. This is done in initial setup from the per-block acps
+ * returned by the first local copy prop pass.
+ */
+ BITSET_WORD *liveout;
+
+ /**
+ * Which entries in the fs_copy_prop_dataflow acp table are generated by
+ * instructions in this block which reach the end of the block without
+ * being killed.
+ */
+ BITSET_WORD *copy;
+
+ /**
+ * Which entries in the fs_copy_prop_dataflow acp table are killed over the
+ * course of this block.
+ */
+ BITSET_WORD *kill;
+ };
+ /**
+ * Dataflow analysis over the copies available at the end of each block.
+ *
+ * The constructor gathers all per-block acp entries into one table, builds
+ * the per-block bitsets and runs the analysis to completion; afterwards
+ * bd[block].livein tells which entries are available on entry to a block.
+ */
+ class fs_copy_prop_dataflow
+ {
+ public:
+ fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
+ exec_list *out_acp[ACP_HASH_SIZE]);
+
+ void setup_initial_values();
+ void run();
+
+ void dump_block_data() const UNUSED;
+
+ void *mem_ctx;
+ cfg_t *cfg;
+
+ acp_entry **acp; /**< Flat table of all entries; bit i of every bitset refers to acp[i]. */
+ int num_acp; /**< Number of entries in acp. */
+ int bitset_words; /**< Number of BITSET_WORDs in each per-block bitset. */
+
+ struct block_data *bd; /**< Per-block sets, indexed by block number. */
+ };
+ } /* anonymous namespace */
+
+ /**
+  * Build the dataflow state from the per-block output ACPs and solve it.
+  *
+  * All entries found in \p out_acp (one hash table of ACP_HASH_SIZE lists per
+  * block) are collected into the flat acp table, the four bitsets of every
+  * block are allocated, and the COPY set of each block is seeded with the
+  * entries that came from that block.  setup_initial_values() and run() are
+  * then invoked, so the analysis is complete when the constructor returns.
+  */
+ fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
+                                              exec_list *out_acp[ACP_HASH_SIZE])
+    : mem_ctx(mem_ctx), cfg(cfg)
+ {
+    bd = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks);
+
+    /* First pass: count the entries over all blocks and hash buckets. */
+    num_acp = 0;
+    foreach_block (block, cfg) {
+       exec_list *const buckets = out_acp[block->num];
+
+       for (int bucket = 0; bucket < ACP_HASH_SIZE; bucket++)
+          num_acp += buckets[bucket].length();
+    }
+
+    acp = rzalloc_array(mem_ctx, struct acp_entry *, num_acp);
+
+    bitset_words = BITSET_WORDS(num_acp);
+
+    /* Second pass: allocate the per-block sets and fill in the table. */
+    int filled = 0;
+    foreach_block (block, cfg) {
+       struct block_data &sets = bd[block->num];
+
+       sets.livein = rzalloc_array(bd, BITSET_WORD, bitset_words);
+       sets.liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
+       sets.copy = rzalloc_array(bd, BITSET_WORD, bitset_words);
+       sets.kill = rzalloc_array(bd, BITSET_WORD, bitset_words);
+
+       for (int bucket = 0; bucket < ACP_HASH_SIZE; bucket++) {
+          foreach_in_list(acp_entry, entry, &out_acp[block->num][bucket]) {
+             acp[filled] = entry;
+
+             /* out_acp is populated by opt_copy_propagation_local with the
+              * copies created in a block that are still live at its end,
+              * which is exactly the definition of the COPY set.
+              */
+             BITSET_SET(sets.copy, filled);
+
+             filled++;
+          }
+       }
+    }
+
+    assert(filled == num_acp);
+
+    setup_initial_values();
+    run();
+ }
+
+/**
+ * Set up initial values for each of the data flow sets, prior to running
+ * the fixed-point algorithm.
+ */
+void
+fs_copy_prop_dataflow::setup_initial_values()
+{
+ /* Initialize the COPY and KILL sets. */
+ foreach_block (block, cfg) {
+ foreach_inst_in_block(fs_inst, inst, block) {
+ if (inst->dst.file != VGRF)
+ continue;
+
+ /* Mark ACP entries which are killed by this instruction. */
+ for (int i = 0; i < num_acp; i++) {
+ if (regions_overlap(inst->dst, inst->size_written,
+ acp[i]->dst, acp[i]->size_written) ||
+ regions_overlap(inst->dst, inst->size_written,
+ acp[i]->src, acp[i]->size_read)) {
+ BITSET_SET(bd[block->num].kill, i);
+ }
+ }
+ }
+ }
+
+ /* Populate the initial values for the livein and liveout sets. For the
+ * block at the start of the program, livein = 0 and liveout = copy.
+ * For the others, set liveout to 0 (the empty set) and livein to ~0
+ * (the universal set).
+ */
+ foreach_block (block, cfg) {
+ if (block->parents.is_empty()) {
+ for (int i = 0; i < bitset_words; i++) {
+ bd[block->num].livein[i] = 0u;
+ bd[block->num].liveout[i] = bd[block->num].copy[i];
+ }
+ } else {
+ for (int i = 0; i < bitset_words; i++) {
+ bd[block->num].liveout[i] = 0u;
+ bd[block->num].livein[i] = ~0u;
+ }
+ }
+ }
+}
+
+/**
+ * Walk the set of instructions in the block, marking which entries in the acp
+ * are killed by the block.
+ */
+void
+fs_copy_prop_dataflow::run()
+{
+ bool progress;
+
+ do {
+ progress = false;
+
+ /* Update liveout for all blocks. */
+ foreach_block (block, cfg) {
+ if (block->parents.is_empty())
+ continue;
+
+ for (int i = 0; i < bitset_words; i++) {
+ const BITSET_WORD old_liveout = bd[block->num].liveout[i];
+
+ bd[block->num].liveout[i] =
+ bd[block->num].copy[i] | (bd[block->num].livein[i] &
+ ~bd[block->num].kill[i]);
+
+ if (old_liveout != bd[block->num].liveout[i])
+ progress = true;
+ }
+ }
+
+ /* Update livein for all blocks. If a copy is live out of all parent
+ * blocks, it's live coming in to this block.
+ */
+ foreach_block (block, cfg) {
+ if (block->parents.is_empty())
+ continue;
+
+ for (int i = 0; i < bitset_words; i++) {
+ const BITSET_WORD old_livein = bd[block->num].livein[i];
+
+ bd[block->num].livein[i] = ~0u;
+ foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
+ bblock_t *parent = parent_link->block;
+ bd[block->num].livein[i] &= bd[parent->num].liveout[i];
+ }
+
+ if (old_livein != bd[block->num].livein[i])
+ progress = true;
+ }
+ }
+ } while (progress);
+}
+ /**
+ * Debugging aid: print to stderr, for every block, its number, its IP
+ * range, the numbers of its parents and the livein, liveout, copy and kill
+ * bitsets as hexadecimal words.
+ */
+ void
+ fs_copy_prop_dataflow::dump_block_data() const
+ {
+ foreach_block (block, cfg) {
+ fprintf(stderr, "Block %d [%d, %d] (parents ", block->num,
+ block->start_ip, block->end_ip);
+ foreach_list_typed(bblock_link, link, link, &block->parents) {
+ bblock_t *parent = link->block;
+ fprintf(stderr, "%d ", parent->num);
+ }
+ fprintf(stderr, "):\n");
+ fprintf(stderr, " livein = 0x");
+ for (int i = 0; i < bitset_words; i++)
+ fprintf(stderr, "%08x", bd[block->num].livein[i]);
+ fprintf(stderr, ", liveout = 0x");
+ for (int i = 0; i < bitset_words; i++)
+ fprintf(stderr, "%08x", bd[block->num].liveout[i]);
+ fprintf(stderr, ",\n copy = 0x");
+ for (int i = 0; i < bitset_words; i++)
+ fprintf(stderr, "%08x", bd[block->num].copy[i]);
+ fprintf(stderr, ", kill = 0x");
+ for (int i = 0; i < bitset_words; i++)
+ fprintf(stderr, "%08x", bd[block->num].kill[i]);
+ fprintf(stderr, "\n");
+ }
+ }
+
+ /** Tell whether \p opcode is one of the bitwise logic opcodes AND, OR, XOR or NOT. */
+ static bool
+ is_logic_op(enum opcode opcode)
+ {
+    switch (opcode) {
+    case BRW_OPCODE_AND:
+    case BRW_OPCODE_OR:
+    case BRW_OPCODE_XOR:
+    case BRW_OPCODE_NOT:
+       return true;
+    default:
+       return false;
+    }
+ }
+
+ /**
+  * Tell whether source \p arg of \p inst may be given the horizontal stride
+  * \p stride on the device described by \p devinfo.
+  *
+  * Strides above 4 are never accepted.  Three-source and math instructions
+  * are restricted further as explained in the body; everything else accepts
+  * any remaining stride.
+  */
+ static bool
+ can_take_stride(fs_inst *inst, unsigned arg, unsigned stride,
+                 const gen_device_info *devinfo)
+ {
+    if (stride > 4)
+       return false;
+
+    const bool scalar_or_packed = stride == 0 || stride == 1;
+
+    /* 3-source instructions can only be Align16, which restricts what strides
+     * they can take.  They can only take a stride of 1 (the usual case), or 0
+     * with a special "repctrl" bit.  But the repctrl bit doesn't work for
+     * 64-bit datatypes, so if the source type is 64-bit then only a stride of
+     * 1 is allowed.  From the Broadwell PRM, Volume 7 "3D Media GPGPU", page
+     * 944:
+     *
+     *    This is applicable to 32b datatypes and 16b datatype. 64b datatypes
+     *    cannot use the replicate control.
+     */
+    if (inst->is_3src(devinfo)) {
+       const bool wider_than_32b = type_sz(inst->src[arg].type) > 4;
+
+       return wider_than_32b ? stride == 1 : scalar_or_packed;
+    }
+
+    /* From the Broadwell PRM, Volume 2a "Command Reference - Instructions",
+     * page 391 ("Extended Math Function"):
+     *
+     *    The following restrictions apply for align1 mode: Scalar source is
+     *    supported. Source and destination horizontal stride must be the
+     *    same.
+     *
+     * From the Haswell PRM Volume 2b "Command Reference - Instructions", page
+     * 134 ("Extended Math Function"):
+     *
+     *    Scalar source is supported. Source and destination horizontal stride
+     *    must be 1.
+     *
+     * and similar language exists for IVB and SNB.  Pre-SNB, math instructions
+     * are sends, so the sources are moved to MRF's and there are no
+     * restrictions.
+     */
+    if (inst->is_math() && devinfo->gen >= 6) {
+       if (devinfo->gen >= 8)
+          return stride == inst->dst.stride || stride == 0;
+
+       /* Gen 6 and Gen 7. */
+       assert(inst->dst.stride == 1);
+       return scalar_or_packed;
+    }
+
+    return true;
+ }
+ /**
+ * Try to replace source \p arg of \p inst with the source of the copy
+ * described by \p entry.
+ *
+ * The source has to be a VGRF read that is fully contained in the region
+ * written by the copy. Copies of immediates are rejected here (see
+ * try_constant_propagate() for those), as is every case where the checks
+ * below cannot show that the rewritten instruction keeps its meaning.
+ *
+ * On success the register file, number, stride, offset and source modifiers
+ * of inst->src[arg] are rewritten in place (possibly together with the
+ * instruction's types and saturate flag) and true is returned. Otherwise
+ * the instruction is left untouched and false is returned.
+ */
+ bool
+ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
+ {
+ if (inst->src[arg].file != VGRF)
+ return false;
+
+ if (entry->src.file == IMM)
+ return false;
+ assert(entry->src.file == VGRF || entry->src.file == UNIFORM ||
+ entry->src.file == ATTR);
+
+ if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
+ inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD)
+ return false;
+
+ assert(entry->dst.file == VGRF);
+ if (inst->src[arg].nr != entry->dst.nr)
+ return false;
+
+ /* Bail if inst is reading a range that isn't contained in the range
+ * that entry is writing.
+ */
+ if (!region_contained_in(inst->src[arg], inst->size_read(arg),
+ entry->dst, entry->size_written))
+ return false;
+
+ /* we can't generally copy-propagate UD negations because we
+ * can end up accessing the resulting values as signed integers
+ * instead. See also resolve_ud_negate() and comment in
+ * fs_generator::generate_code.
+ */
+ if (entry->src.type == BRW_REGISTER_TYPE_UD &&
+ entry->src.negate)
+ return false;
+
+ bool has_source_modifiers = entry->src.abs || entry->src.negate;
+
+ if ((has_source_modifiers || entry->src.file == UNIFORM ||
+ !entry->src.is_contiguous()) &&
+ !inst->can_do_source_mods(devinfo))
+ return false;
+
+ if (has_source_modifiers &&
+ inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE)
+ return false;
+
+ /* Bail if the result of composing both strides would exceed the
+ * hardware limit.
+ */
+ if (!can_take_stride(inst, arg, entry->src.stride * inst->src[arg].stride,
+ devinfo))
+ return false;
+
+ /* Bail if the instruction type is larger than the execution type of the
+ * copy, which implies that each channel is reading multiple channels of the
+ * destination of the copy, and simply replacing the sources would give a
+ * program with different semantics.
+ */
+ if (type_sz(entry->dst.type) < type_sz(inst->src[arg].type))
+ return false;
+
+ /* Bail if the result of composing both strides cannot be expressed
+ * as another stride. This avoids, for example, trying to transform
+ * this:
+ *
+ * MOV (8) rX<1>UD rY<0;1,0>UD
+ * FOO (8) ... rX<8;8,1>UW
+ *
+ * into this:
+ *
+ * FOO (8) ... rY<0;1,0>UW
+ *
+ * Which would have different semantics.
+ */
+ if (entry->src.stride != 1 &&
+ (inst->src[arg].stride *
+ type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0)
+ return false;
+
+ /* Since semantics of source modifiers are type-dependent we need to
+ * ensure that the meaning of the instruction remains the same if we
+ * change the type. If the sizes of the types are different the new
+ * instruction will read a different amount of data than the original
+ * and the semantics will always be different.
+ */
+ if (has_source_modifiers &&
+ entry->dst.type != inst->src[arg].type &&
+ (!inst->can_change_types() ||
+ type_sz(entry->dst.type) != type_sz(inst->src[arg].type)))
+ return false;
+
+ if (devinfo->gen >= 8 && (entry->src.negate || entry->src.abs) &&
+ is_logic_op(inst->opcode)) {
+ return false;
+ }
+
+ if (entry->saturate) {
+ switch(inst->opcode) {
+ case BRW_OPCODE_SEL:
+ if ((inst->conditional_mod != BRW_CONDITIONAL_GE &&
+ inst->conditional_mod != BRW_CONDITIONAL_L) ||
+ inst->src[1].file != IMM ||
+ inst->src[1].f < 0.0 ||
+ inst->src[1].f > 1.0) {
+ return false;
+ }
+ break;
+ default:
+ return false;
+ }
+ }
+
+ /* All checks passed: from here on the instruction is modified. */
+ inst->src[arg].file = entry->src.file;
+ inst->src[arg].nr = entry->src.nr;
+ inst->src[arg].stride *= entry->src.stride;
+ inst->saturate = inst->saturate || entry->saturate;
+
+ /* Compute the offset of inst->src[arg] relative to entry->dst */
+ const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset;
+
+ /* Compute the first component of the copy that the instruction is
+ * reading, and the base byte offset within that component.
+ */
+ assert(entry->dst.offset % REG_SIZE == 0 && entry->dst.stride == 1);
+ const unsigned component = rel_offset / type_sz(entry->dst.type);
+ const unsigned suboffset = rel_offset % type_sz(entry->dst.type);
+
+ /* Calculate the byte offset at the origin of the copy of the given
+ * component and suboffset.
+ */
+ inst->src[arg].offset = suboffset +
+ component * entry->src.stride * type_sz(entry->src.type) +
+ entry->src.offset;
+
+ if (has_source_modifiers) {
+ if (entry->dst.type != inst->src[arg].type) {
+ /* We are propagating source modifiers from a MOV with a different
+ * type. If we got here, then we can just change the source and
+ * destination types of the instruction and keep going.
+ */
+ assert(inst->can_change_types());
+ for (int i = 0; i < inst->sources; i++) {
+ inst->src[i].type = entry->dst.type;
+ }
+ inst->dst.type = entry->dst.type;
+ }
+
+ if (!inst->src[arg].abs) { /* an abs already on the use hides the copy's abs/negate */
+ inst->src[arg].abs = entry->src.abs;
+ inst->src[arg].negate ^= entry->src.negate;
+ }
+ }
+
+ return true;
+ }
+
+
+bool
+fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
+{ /* Tries to replace the VGRF sources of inst that read the value written
+ * by the ACP entry with that entry's immediate.
+ *
+ * Only entries whose source is an IMM of at most 4 bytes and that do not
+ * saturate are considered.  The sources of inst are visited from the last
+ * one down to the first.  Which source slots may receive the immediate
+ * depends on the opcode of inst (see the switch below); for some opcodes
+ * the operands are swapped so that the immediate ends up in src[1].
+ *
+ * Returns true if at least one source of inst was changed.
+ */
+ bool progress = false;
+
+ if (entry->src.file != IMM)
+ return false;
+ if (type_sz(entry->src.type) > 4)
+ return false;
+ if (entry->saturate)
+ return false;
+
+ for (int i = inst->sources - 1; i >= 0; i--) {
+ if (inst->src[i].file != VGRF)
+ continue;
+
+ assert(entry->dst.file == VGRF);
+ if (inst->src[i].nr != entry->dst.nr)
+ continue;
+
+ /* Bail if inst is reading a range that isn't contained in the range
+ * that entry is writing.
+ */
+ if (!region_contained_in(inst->src[i], inst->size_read(i),
+ entry->dst, entry->size_written))
+ continue;
+
+ /* If the type sizes don't match each channel of the instruction is
+ * either extracting a portion of the constant (which could be handled
+ * with some effort but the code below doesn't) or reading multiple
+ * channels of the source at once.
+ */
+ if (type_sz(inst->src[i].type) != type_sz(entry->dst.type))
+ continue;
+
+ fs_reg val = entry->src;
+ val.type = inst->src[i].type; /* same size as entry->dst.type, checked above */
+
+ if (inst->src[i].abs) { /* skip this source on Gen8+ logic ops or if the helper reports failure */
+ if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
+ !brw_abs_immediate(val.type, &val.as_brw_reg())) {
+ continue;
+ }
+ }
+
+ if (inst->src[i].negate) { /* same bail-out rule as for abs above */
+ if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
+ !brw_negate_immediate(val.type, &val.as_brw_reg())) {
+ continue;
+ }
+ }
+
+ switch (inst->opcode) {
+ case BRW_OPCODE_MOV:
+ case SHADER_OPCODE_LOAD_PAYLOAD:
+ case FS_OPCODE_PACK:
+ inst->src[i] = val;
+ progress = true;
+ break;
+
+ case SHADER_OPCODE_INT_QUOTIENT:
+ case SHADER_OPCODE_INT_REMAINDER:
+ /* FINISHME: Promote non-float constants and remove this. */
+ if (devinfo->gen < 8)
+ break;
+ /* fallthrough */
+ case SHADER_OPCODE_POW:
+ /* Allow constant propagation into src1 (except on Gen 6 which
+ * doesn't support scalar source math), and let constant combining
+ * promote the constant on Gen < 8.
+ */
+ if (devinfo->gen == 6)
+ break;
+ /* fallthrough */
+ case BRW_OPCODE_BFI1:
+ case BRW_OPCODE_ASR:
+ case BRW_OPCODE_SHL:
+ case BRW_OPCODE_SHR:
+ case BRW_OPCODE_SUBB:
+ if (i == 1) { /* only src[1] is replaced for these opcodes */
+ inst->src[i] = val;
+ progress = true;
+ }
+ break;
+
+ case BRW_OPCODE_MACH:
+ case BRW_OPCODE_MUL:
+ case SHADER_OPCODE_MULH:
+ case BRW_OPCODE_ADD:
+ case BRW_OPCODE_OR:
+ case BRW_OPCODE_AND:
+ case BRW_OPCODE_XOR:
+ case BRW_OPCODE_ADDC:
+ if (i == 1) {
+ inst->src[i] = val;
+ progress = true;
+ } else if (i == 0 && inst->src[1].file != IMM) {
+ /* Fit this constant in by commuting the operands.
+ * Exception: we can't do this for 32-bit integer MUL/MACH
+ * because it's asymmetric.
+ *
+ * The BSpec says for Broadwell that
+ *
+ * "When multiplying DW x DW, the dst cannot be accumulator."
+ *
+ * Integer MUL with a non-accumulator destination will be lowered
+ * by lower_integer_multiplication(), so don't restrict it.
+ */
+ if (((inst->opcode == BRW_OPCODE_MUL &&
+ inst->dst.is_accumulator()) ||
+ inst->opcode == BRW_OPCODE_MACH) &&
+ (inst->src[1].type == BRW_REGISTER_TYPE_D ||
+ inst->src[1].type == BRW_REGISTER_TYPE_UD))
+ break;
+ inst->src[0] = inst->src[1];
+ inst->src[1] = val;
+ progress = true;
+ }
+ break;
+
+ case BRW_OPCODE_CMP:
+ case BRW_OPCODE_IF:
+ if (i == 1) {
+ inst->src[i] = val;
+ progress = true;
+ } else if (i == 0 && inst->src[1].file != IMM) {
+ enum brw_conditional_mod new_cmod;
+
+ new_cmod = brw_swap_cmod(inst->conditional_mod);
+ if (new_cmod != BRW_CONDITIONAL_NONE) {
+ /* Fit this constant in by swapping the operands and
+ * flipping the test
+ */
+ inst->src[0] = inst->src[1];
+ inst->src[1] = val;
+ inst->conditional_mod = new_cmod;
+ progress = true;
+ }
+ }
+ break;
+
+ case BRW_OPCODE_SEL:
+ if (i == 1) {
+ inst->src[i] = val;
+ progress = true;
+ } else if (i == 0 && inst->src[1].file != IMM) {
+ inst->src[0] = inst->src[1];
+ inst->src[1] = val;
+
+ /* If this was predicated, flipping operands means
+ * we also need to flip the predicate.
+ */
+ if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
+ inst->predicate_inverse =
+ !inst->predicate_inverse;
+ }
+ progress = true;
+ }
+ break;
+
+ case SHADER_OPCODE_UNTYPED_ATOMIC:
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+ case SHADER_OPCODE_TYPED_ATOMIC:
+ case SHADER_OPCODE_TYPED_SURFACE_READ:
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+ /* We only propagate into the surface argument of the
+ * instruction. Everything else goes through LOAD_PAYLOAD.
+ */
+ if (i == 1) {
+ inst->src[i] = val;
+ progress = true;
+ }
+ break;
+
+ case FS_OPCODE_FB_WRITE_LOGICAL:
+ /* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are
+ * bit-cast using a strided region so they cannot be immediates.
+ */
+ if (i != FB_WRITE_LOGICAL_SRC_SRC_STENCIL &&
+ i != FB_WRITE_LOGICAL_SRC_OMASK) {
+ inst->src[i] = val;
+ progress = true;
+ }
+ break;
+
+ case SHADER_OPCODE_TEX_LOGICAL:
+ case SHADER_OPCODE_TXD_LOGICAL:
+ case SHADER_OPCODE_TXF_LOGICAL:
+ case SHADER_OPCODE_TXL_LOGICAL:
+ case SHADER_OPCODE_TXS_LOGICAL:
+ case FS_OPCODE_TXB_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+ case SHADER_OPCODE_TXF_UMS_LOGICAL:
+ case SHADER_OPCODE_TXF_MCS_LOGICAL:
+ case SHADER_OPCODE_LOD_LOGICAL:
+ case SHADER_OPCODE_TG4_LOGICAL:
+ case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+ case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+ case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+ inst->src[i] = val; /* any source slot of these logical opcodes may take the immediate */
+ progress = true;
+ break;
+
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+ case SHADER_OPCODE_BROADCAST:
+ inst->src[i] = val;
+ progress = true;
+ break;
+
+ case BRW_OPCODE_MAD:
+ case BRW_OPCODE_LRP:
+ inst->src[i] = val;
+ progress = true;
+ break;
+
+ default: /* every other opcode is left untouched */
+ break;
+ }
+ }
+
+ return progress;
+}
+
+/**
+ * Tells whether the source of inst may be recorded as a propagation
+ * candidate.
+ *
+ * That requires a MOV into a VGRF that is not a partial write, whose
+ * source has the same type as the destination and is either an ATTR, a
+ * UNIFORM, an IMM, or a VGRF region that does not overlap the region
+ * being written.
+ */
+static bool
+can_propagate_from(fs_inst *inst)
+{
+   if (inst->opcode != BRW_OPCODE_MOV)
+      return false;
+
+   if (inst->dst.file != VGRF)
+      return false;
+
+   switch (inst->src[0].file) {
+   case VGRF:
+      /* A VGRF source is only acceptable when it is disjoint from the
+       * destination region.
+       */
+      if (regions_overlap(inst->dst, inst->size_written,
+                          inst->src[0], inst->size_read(0)))
+         return false;
+      break;
+   case ATTR:
+   case UNIFORM:
+   case IMM:
+      break;
+   default:
+      return false;
+   }
+
+   if (inst->src[0].type != inst->dst.type)
+      return false;
+
+   return !inst->is_partial_write();
+}
+
+/* Walks a basic block and does copy propagation on it using the acp
+ * list.
+ *
+ * acp is an array of ACP_HASH_SIZE lists; an entry lives in the list
+ * selected by its destination VGRF number modulo ACP_HASH_SIZE.  For each
+ * instruction of the block this function
+ *
+ *  1. tries constant propagation and, failing that, copy propagation from
+ *     every entry hashed under the number of each VGRF source,
+ *  2. if the instruction writes a VGRF, removes every entry whose
+ *     destination or source region overlaps the region written, and
+ *  3. records a new entry for a MOV accepted by can_propagate_from(), or
+ *     one entry per VGRF source of a LOAD_PAYLOAD that writes a VGRF.
+ *
+ * New entries are allocated out of copy_prop_ctx.  Returns true if any
+ * propagation was performed.
+ */
+bool
+fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
+ exec_list *acp)
+{
+ bool progress = false;
+
+ foreach_inst_in_block(fs_inst, inst, block) {
+ /* Try propagating into this instruction. */
+ for (int i = 0; i < inst->sources; i++) {
+ if (inst->src[i].file != VGRF)
+ continue;
+
+ foreach_in_list(acp_entry, entry, &acp[inst->src[i].nr % ACP_HASH_SIZE]) {
+ if (try_constant_propagate(inst, entry))
+ progress = true;
+ else if (try_copy_propagate(inst, i, entry))
+ progress = true;
+ }
+ }
+
+ /* kill the destination from the ACP */
+ if (inst->dst.file == VGRF) {
+ foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) {
+ if (regions_overlap(entry->dst, entry->size_written,
+ inst->dst, inst->size_written))
+ entry->remove();
+ }
+
+ /* Oops, we only have the chaining hash based on the destination, not
+ * the source, so walk across the entire table.
+ */
+ for (int i = 0; i < ACP_HASH_SIZE; i++) {
+ foreach_in_list_safe(acp_entry, entry, &acp[i]) {
+ /* Make sure we kill the entry if this instruction overwrites
+ * _any_ of the registers that it reads
+ */
+ if (regions_overlap(entry->src, entry->size_read,
+ inst->dst, inst->size_written))
+ entry->remove();
+ }
+ }
+ }
+
+ /* If this instruction's source could potentially be folded into the
+ * operand of another instruction, add it to the ACP.
+ */
+ if (can_propagate_from(inst)) {
+ acp_entry *entry = ralloc(copy_prop_ctx, acp_entry); /* not zeroed: every data field is assigned below */
+ entry->dst = inst->dst;
+ entry->src = inst->src[0];
+ entry->size_written = inst->size_written;
+ entry->size_read = inst->size_read(0);
+ entry->opcode = inst->opcode;
+ entry->saturate = inst->saturate;
+ acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
+ } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
+ inst->dst.file == VGRF) {
+ int offset = 0; /* running byte offset of source i within inst->dst */
+ for (int i = 0; i < inst->sources; i++) {
+ int effective_width = i < inst->header_size ? 8 : inst->exec_size; /* header sources use a width of 8 */
+ assert(effective_width * type_sz(inst->src[i].type) % REG_SIZE == 0);
+ const unsigned size_written = effective_width *
+ type_sz(inst->src[i].type);
+ if (inst->src[i].file == VGRF) {
+ acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry); /* zeroed, so saturate stays false */
+ entry->dst = byte_offset(inst->dst, offset);
+ entry->src = inst->src[i];
+ entry->size_written = size_written;
+ entry->size_read = inst->size_read(i);
+ entry->opcode = inst->opcode;
+ if (!entry->dst.equals(inst->src[i])) {
+ acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
+ } else { /* source and destination slot are the same register: nothing to record */
+ ralloc_free(entry);
+ }
+ }
+ offset += size_written;
+ }
+ }
+ }
+
+ return progress;
+}
+
+/**
+ * Runs copy propagation over every block of the CFG.
+ *
+ * A first local pass over each block collects, per block, the copies that
+ * are available at its end.  Those are handed to fs_copy_prop_dataflow,
+ * and a second local pass is then run on each block, seeded with the
+ * entries the dataflow result marks as live-in for that block.
+ *
+ * Returns true if either pass made progress; live intervals are
+ * invalidated in that case.
+ */
+bool
+fs_visitor::opt_copy_propagation()
+{
+   void *mem_ctx = ralloc_context(NULL);
+   exec_list *block_out_acp[cfg->num_blocks];
+   bool made_progress = false;
+
+   for (int b = 0; b < cfg->num_blocks; b++)
+      block_out_acp[b] = new exec_list [ACP_HASH_SIZE];
+
+   /* Pass 1: purely local propagation, filling in the per-block tables of
+    * copies that survive to the end of each block.
+    */
+   foreach_block (block, cfg) {
+      if (opt_copy_propagation_local(mem_ctx, block,
+                                     block_out_acp[block->num]))
+         made_progress = true;
+   }
+
+   /* Combine the per-block results across the CFG. */
+   fs_copy_prop_dataflow dataflow(mem_ctx, cfg, block_out_acp);
+
+   /* Pass 2: local propagation again, now starting each block from the
+    * copies that the dataflow analysis reports as live on entry.
+    */
+   foreach_block (block, cfg) {
+      exec_list in_acp[ACP_HASH_SIZE];
+
+      for (int idx = 0; idx < dataflow.num_acp; idx++) {
+         if (!BITSET_TEST(dataflow.bd[block->num].livein, idx))
+            continue;
+
+         struct acp_entry *live_entry = dataflow.acp[idx];
+         in_acp[live_entry->dst.nr % ACP_HASH_SIZE].push_tail(live_entry);
+      }
+
+      if (opt_copy_propagation_local(mem_ctx, block, in_acp))
+         made_progress = true;
+   }
+
+   for (int b = 0; b < cfg->num_blocks; b++)
+      delete [] block_out_acp[b];
+   ralloc_free(mem_ctx);
+
+   if (made_progress)
+      invalidate_live_intervals();
+
+   return made_progress;
+}
diff --git a/src/intel/compiler/brw_fs_cse.cpp b/src/intel/compiler/brw_fs_cse.cpp
new file mode 100644
index 00000000000..48220efd730
--- /dev/null
+++ b/src/intel/compiler/brw_fs_cse.cpp
@@ -0,0 +1,380 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+
+/** @file brw_fs_cse.cpp
+ *
+ * Support for local common subexpression elimination.
+ *
+ * See Muchnick's Advanced Compiler Design and Implementation, section
+ * 13.1 (p378).
+ */
+
+using namespace brw;
+
+namespace {
+struct aeb_entry : public exec_node { /* one expression remembered by local CSE; kept in an exec_list */
+ /** The instruction that generates the expression value. */
+ fs_inst *generator;
+
+ /** The temporary where the value is stored.
+ *
+ * Set to reg_undef when the entry is created; a register is only
+ * allocated once a second occurrence of the expression is found.
+ */
+ fs_reg tmp;
+};
+}
+
+static bool
+is_expression(const fs_visitor *v, const fs_inst *const inst)
+{ /* Tells whether inst is a candidate for common subexpression
+ * elimination, based on its opcode:
+ *
+ *  - the opcodes in the first group always qualify,
+ *  - the math opcodes in the second group qualify only when mlen < 2,
+ *  - LOAD_PAYLOAD qualifies unless is_copy_payload() reports it as a
+ *    plain copy,
+ *  - anything else qualifies only if it is a send from the GRF that has
+ *    no side effects and is not volatile.
+ */
+ switch (inst->opcode) {
+ case BRW_OPCODE_MOV:
+ case BRW_OPCODE_SEL:
+ case BRW_OPCODE_NOT:
+ case BRW_OPCODE_AND:
+ case BRW_OPCODE_OR:
+ case BRW_OPCODE_XOR:
+ case BRW_OPCODE_SHR:
+ case BRW_OPCODE_SHL:
+ case BRW_OPCODE_ASR:
+ case BRW_OPCODE_CMP:
+ case BRW_OPCODE_CMPN:
+ case BRW_OPCODE_ADD:
+ case BRW_OPCODE_MUL:
+ case SHADER_OPCODE_MULH:
+ case BRW_OPCODE_FRC:
+ case BRW_OPCODE_RNDU:
+ case BRW_OPCODE_RNDD:
+ case BRW_OPCODE_RNDE:
+ case BRW_OPCODE_RNDZ:
+ case BRW_OPCODE_LINE:
+ case BRW_OPCODE_PLN:
+ case BRW_OPCODE_MAD:
+ case BRW_OPCODE_LRP:
+ case FS_OPCODE_FB_READ_LOGICAL:
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+ case FS_OPCODE_CINTERP:
+ case FS_OPCODE_LINTERP:
+ case SHADER_OPCODE_FIND_LIVE_CHANNEL:
+ case SHADER_OPCODE_BROADCAST:
+ case SHADER_OPCODE_MOV_INDIRECT:
+ case SHADER_OPCODE_TEX_LOGICAL:
+ case SHADER_OPCODE_TXD_LOGICAL:
+ case SHADER_OPCODE_TXF_LOGICAL:
+ case SHADER_OPCODE_TXL_LOGICAL:
+ case SHADER_OPCODE_TXS_LOGICAL:
+ case FS_OPCODE_TXB_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+ case SHADER_OPCODE_TXF_UMS_LOGICAL:
+ case SHADER_OPCODE_TXF_MCS_LOGICAL:
+ case SHADER_OPCODE_LOD_LOGICAL:
+ case SHADER_OPCODE_TG4_LOGICAL:
+ case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+ case FS_OPCODE_PACK:
+ return true;
+ case SHADER_OPCODE_RCP:
+ case SHADER_OPCODE_RSQ:
+ case SHADER_OPCODE_SQRT:
+ case SHADER_OPCODE_EXP2:
+ case SHADER_OPCODE_LOG2:
+ case SHADER_OPCODE_POW:
+ case SHADER_OPCODE_INT_QUOTIENT:
+ case SHADER_OPCODE_INT_REMAINDER:
+ case SHADER_OPCODE_SIN:
+ case SHADER_OPCODE_COS:
+ return inst->mlen < 2; /* NOTE(review): presumably excludes the message-based form of math; confirm */
+ case SHADER_OPCODE_LOAD_PAYLOAD:
+ return !inst->is_copy_payload(v->alloc);
+ default:
+ return inst->is_send_from_grf() && !inst->has_side_effects() &&
+ !inst->is_volatile();
+ }
+}
+
+static bool
+operands_match(const fs_inst *a, const fs_inst *b, bool *negate)
+{ /* Compares the sources of a and b; the rule depends on a's opcode:
+ *
+ *  - MAD: src[0] must be equal, src[1] and src[2] may match in either
+ *    order.
+ *  - MUL with a float destination: the first two sources are compared in
+ *    either order while ignoring their negate modifiers and the sign of
+ *    an immediate src[1].  *negate is set to whether the two products
+ *    differ in sign, and a match that needs negation is rejected if
+ *    either instruction saturates.  The sources are modified for the
+ *    comparison and restored before returning.
+ *  - not commutative: every source must be equal, position by position.
+ *  - otherwise: the first two sources may match in either order.
+ *
+ * *negate is written only in the float MUL case.
+ */
+ fs_reg *xs = a->src;
+ fs_reg *ys = b->src;
+
+ if (a->opcode == BRW_OPCODE_MAD) {
+ return xs[0].equals(ys[0]) &&
+ ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) ||
+ (xs[2].equals(ys[1]) && xs[1].equals(ys[2])));
+ } else if (a->opcode == BRW_OPCODE_MUL && a->dst.type == BRW_REGISTER_TYPE_F) {
+ bool xs0_negate = xs[0].negate; /* save everything that is about to be overwritten */
+ bool xs1_negate = xs[1].file == IMM ? xs[1].f < 0.0f
+ : xs[1].negate;
+ bool ys0_negate = ys[0].negate;
+ bool ys1_negate = ys[1].file == IMM ? ys[1].f < 0.0f
+ : ys[1].negate;
+ float xs1_imm = xs[1].f;
+ float ys1_imm = ys[1].f;
+
+ xs[0].negate = false; /* strip the signs so only magnitudes are compared */
+ xs[1].negate = false;
+ ys[0].negate = false;
+ ys[1].negate = false;
+ xs[1].f = fabsf(xs[1].f);
+ ys[1].f = fabsf(ys[1].f);
+
+ bool ret = (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
+ (xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
+
+ xs[0].negate = xs0_negate; /* restore the operands */
+ xs[1].negate = xs[1].file == IMM ? false : xs1_negate;
+ ys[0].negate = ys0_negate;
+ ys[1].negate = ys[1].file == IMM ? false : ys1_negate;
+ xs[1].f = xs1_imm;
+ ys[1].f = ys1_imm;
+
+ *negate = (xs0_negate != xs1_negate) != (ys0_negate != ys1_negate);
+ if (*negate && (a->saturate || b->saturate))
+ return false;
+ return ret;
+ } else if (!a->is_commutative()) {
+ bool match = true;
+ for (int i = 0; i < a->sources; i++) {
+ if (!xs[i].equals(ys[i])) {
+ match = false;
+ break;
+ }
+ }
+ return match;
+ } else {
+ return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
+ (xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
+ }
+}
+
+/**
+ * Decides whether a and b compute the same expression.
+ *
+ * All of the instruction fields compared below have to be equal, and the
+ * sources have to agree according to operands_match().  The operand
+ * comparison is done last, and only if every field matched; it is the
+ * only step that may write *negate.
+ */
+static bool
+instructions_match(fs_inst *a, fs_inst *b, bool *negate)
+{
+   /* Opcode and execution controls. */
+   if (a->opcode != b->opcode ||
+       a->force_writemask_all != b->force_writemask_all ||
+       a->exec_size != b->exec_size ||
+       a->group != b->group ||
+       a->saturate != b->saturate)
+      return false;
+
+   /* Predication and flag register usage. */
+   if (a->predicate != b->predicate ||
+       a->predicate_inverse != b->predicate_inverse ||
+       a->conditional_mod != b->conditional_mod ||
+       a->flag_subreg != b->flag_subreg)
+      return false;
+
+   /* Destination type and message-related fields. */
+   if (a->dst.type != b->dst.type ||
+       a->offset != b->offset ||
+       a->mlen != b->mlen ||
+       a->size_written != b->size_written ||
+       a->base_mrf != b->base_mrf ||
+       a->eot != b->eot ||
+       a->header_size != b->header_size)
+      return false;
+
+   /* Remaining per-instruction state and the source count. */
+   if (a->shadow_compare != b->shadow_compare ||
+       a->pi_noperspective != b->pi_noperspective ||
+       a->target != b->target ||
+       a->sources != b->sources)
+      return false;
+
+   return operands_match(a, b, negate);
+}
+
+static void
+create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
+{ /* Emits, through bld, an instruction that copies src into inst->dst.
+ *
+ * A LOAD_PAYLOAD is used when inst itself is a LOAD_PAYLOAD or when the
+ * number of registers inst writes differs from the number of registers
+ * taken by one component of its destination; src must be a VGRF then and
+ * the negate argument is not used.  Otherwise a MOV is emitted that takes
+ * over group and force_writemask_all from inst and has its source negate
+ * flag set to the negate argument.
+ */
+ unsigned written = regs_written(inst);
+ unsigned dst_width =
+ DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
+ fs_inst *copy;
+
+ if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD ||
+ written != dst_width) {
+ fs_reg *payload;
+ int sources, header_size;
+ if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
+ sources = inst->sources;
+ header_size = inst->header_size;
+ } else { /* one payload source per destination component, no header */
+ assert(written % dst_width == 0);
+ sources = written / dst_width;
+ header_size = 0;
+ }
+
+ assert(src.file == VGRF);
+ payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources);
+ for (int i = 0; i < header_size; i++) { /* header sources advance by one register each */
+ payload[i] = src;
+ src.offset += REG_SIZE;
+ }
+ for (int i = header_size; i < sources; i++) { /* the rest advance by one component of bld's width */
+ payload[i] = src;
+ src = offset(src, bld, 1);
+ }
+ copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, header_size);
+ } else {
+ copy = bld.MOV(inst->dst, src);
+ copy->group = inst->group;
+ copy->force_writemask_all = inst->force_writemask_all;
+ copy->src[0].negate = negate;
+ }
+ assert(regs_written(copy) == written);
+}
+
+bool
+fs_visitor::opt_cse_local(bblock_t *block)
+{ /* Performs common subexpression elimination within one basic block.
+ *
+ * aeb is the list of expressions seen so far in the block.  For every
+ * instruction that is_expression() accepts, that is not a partial write
+ * and whose destination is either null or not an ARF/FIXED_GRF:
+ *
+ *  - if an entry computes the same expression, the generator's result is
+ *    redirected into a temporary (allocated on the first reuse) and this
+ *    instruction is removed, after a copy from the temporary into its
+ *    destination has been emitted when that destination is not null;
+ *  - otherwise a new entry is recorded, except for MOVs other than MOVs
+ *    of an immediate of type VF.
+ *
+ * After each instruction, entries invalidated by a flag write or by a
+ * write to one of their sources are dropped, as are entries that read a
+ * VGRF whose virtual_grf_end lies before the current ip.
+ *
+ * Returns true if at least one matching expression was found.
+ */
+ bool progress = false;
+ exec_list aeb;
+
+ void *cse_ctx = ralloc_context(NULL);
+
+ int ip = block->start_ip;
+ foreach_inst_in_block(fs_inst, inst, block) {
+ /* Skip some cases. */
+ if (is_expression(this, inst) && !inst->is_partial_write() &&
+ ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
+ inst->dst.is_null()))
+ {
+ bool found = false;
+ bool negate = false;
+
+ foreach_in_list_use_after(aeb_entry, entry, &aeb) {
+ /* Match current instruction's expression against those in AEB. */
+ if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) &&
+ instructions_match(inst, entry->generator, &negate)) {
+ found = true;
+ progress = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ if (inst->opcode != BRW_OPCODE_MOV ||
+ (inst->opcode == BRW_OPCODE_MOV &&
+ inst->src[0].file == IMM &&
+ inst->src[0].type == BRW_REGISTER_TYPE_VF)) {
+ /* Our first sighting of this expression. Create an entry. */
+ aeb_entry *entry = ralloc(cse_ctx, aeb_entry);
+ entry->tmp = reg_undef;
+ entry->generator = inst;
+ aeb.push_tail(entry);
+ }
+ } else {
+ /* This is at least our second sighting of this expression.
+ * If we don't have a temporary already, make one.
+ */
+ bool no_existing_temp = entry->tmp.file == BAD_FILE;
+ if (no_existing_temp && !entry->generator->dst.is_null()) {
+ const fs_builder ibld = fs_builder(this, block, entry->generator)
+ .at(block, entry->generator->next);
+ int written = regs_written(entry->generator);
+
+ entry->tmp = fs_reg(VGRF, alloc.allocate(written),
+ entry->generator->dst.type);
+
+ create_copy_instr(ibld, entry->generator, entry->tmp, false);
+
+ entry->generator->dst = entry->tmp; /* the generator now writes the temporary; the copy emitted above fills its old destination */
+ }
+
+ /* dest <- temp */
+ if (!inst->dst.is_null()) {
+ assert(inst->size_written == entry->generator->size_written);
+ assert(inst->dst.type == entry->tmp.type);
+ const fs_builder ibld(this, block, inst);
+
+ create_copy_instr(ibld, inst, entry->tmp, negate);
+ }
+
+ /* Set our iterator so that next time through the loop inst->next
+ * will get the instruction in the basic block after the one we've
+ * removed.
+ */
+ fs_inst *prev = (fs_inst *)inst->prev;
+
+ inst->remove(block);
+ inst = prev;
+ }
+ }
+
+ foreach_in_list_safe(aeb_entry, entry, &aeb) {
+ /* Kill all AEB entries that write a different value to or read from
+ * the flag register if we just wrote it.
+ */
+ if (inst->flags_written()) {
+ bool negate; /* dummy */
+ if (entry->generator->flags_read(devinfo) ||
+ (entry->generator->flags_written() &&
+ !instructions_match(inst, entry->generator, &negate))) {
+ entry->remove();
+ ralloc_free(entry);
+ continue;
+ }
+ }
+
+ for (int i = 0; i < entry->generator->sources; i++) {
+ fs_reg *src_reg = &entry->generator->src[i];
+
+ /* Kill all AEB entries that use the destination we just
+ * overwrote.
+ */
+ if (regions_overlap(inst->dst, inst->size_written,
+ entry->generator->src[i],
+ entry->generator->size_read(i))) {
+ entry->remove();
+ ralloc_free(entry);
+ break;
+ }
+
+ /* Kill any AEB entries using registers that don't get reused any
+ * more -- a sure sign they'll fail operands_match().
+ */
+ if (src_reg->file == VGRF && virtual_grf_end[src_reg->nr] < ip) {
+ entry->remove();
+ ralloc_free(entry);
+ break;
+ }
+ }
+ }
+
+ ip++;
+ }
+
+ ralloc_free(cse_ctx); /* releases every entry still allocated out of cse_ctx */
+
+ return progress;
+}
+
+/**
+ * Runs local common subexpression elimination on every block of the CFG.
+ *
+ * Live intervals are computed up front and invalidated again if any block
+ * reported progress.  Returns whether any block reported progress.
+ */
+bool
+fs_visitor::opt_cse()
+{
+   calculate_live_intervals();
+
+   bool any_progress = false;
+   foreach_block (block, cfg) {
+      if (opt_cse_local(block))
+         any_progress = true;
+   }
+
+   if (!any_progress)
+      return false;
+
+   invalidate_live_intervals();
+   return true;
+}
diff --git a/src/intel/compiler/brw_fs_dead_code_eliminate.cpp b/src/intel/compiler/brw_fs_dead_code_eliminate.cpp
new file mode 100644
index 00000000000..7adb4278919
--- /dev/null
+++ b/src/intel/compiler/brw_fs_dead_code_eliminate.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_fs_live_variables.h"
+#include "brw_cfg.h"
+
+/** @file brw_fs_dead_code_eliminate.cpp
+ *
+ * Dataflow-aware dead code elimination.
+ *
+ * Walks the instruction list from the bottom, removing instructions that
+ * have results that both aren't used in later blocks and haven't been read
+ * yet in the tail end of this block.
+ */
+
+/**
+ * Reports whether inst may be removed altogether.
+ *
+ * That is the case when it is not control flow, has no side effects, does
+ * not write any flag bit that is set in flag_live[0], and does not write
+ * the accumulator.
+ */
+static bool
+can_eliminate(const fs_inst *inst, BITSET_WORD *flag_live)
+{
+   if (inst->is_control_flow())
+      return false;
+
+   if (inst->has_side_effects())
+      return false;
+
+   /* A flag write that something later still reads has to stay. */
+   if (flag_live[0] & inst->flags_written())
+      return false;
+
+   return !inst->writes_accumulator;
+}
+
+/**
+ * Reports whether the destination of inst may be replaced by the null
+ * register, i.e. whether the write itself can be dropped while the
+ * instruction is kept.
+ */
+static bool
+can_omit_write(const fs_inst *inst)
+{
+   /* The atomic opcodes are always accepted. */
+   if (inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC ||
+       inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL ||
+       inst->opcode == SHADER_OPCODE_TYPED_ATOMIC ||
+       inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL)
+      return true;
+
+   /* Ordinary instructions (opcode below 128) that carry no message can
+    * lose their destination write, but most SENDs cannot, and it might
+    * not be safe for other virtual opcodes either.
+    */
+   return inst->opcode < 128 && inst->mlen == 0;
+}
+
+bool
+fs_visitor::dead_code_eliminate()
+{ /* Removes instructions whose results are never used.
+ *
+ * Blocks are visited in reverse order and the instructions of each block
+ * from last to first, while tracking which variables (live) and which
+ * flag bits (flag_live) are still needed; both sets start out as the
+ * block's liveout / flag_liveout.  An instruction whose VGRF destination
+ * is not live gets a null destination if can_omit_write() or
+ * can_eliminate() allows it, and an instruction with a null destination
+ * that can_eliminate() accepts is turned into a NOP and removed.
+ *
+ * Returns true if anything was changed, in which case the live intervals
+ * are invalidated.
+ */
+ bool progress = false;
+
+ calculate_live_intervals();
+
+ int num_vars = live_intervals->num_vars;
+ BITSET_WORD *live = rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(num_vars));
+ BITSET_WORD *flag_live = rzalloc_array(NULL, BITSET_WORD, 1);
+
+ foreach_block_reverse_safe(block, cfg) {
+ memcpy(live, live_intervals->block_data[block->num].liveout,
+ sizeof(BITSET_WORD) * BITSET_WORDS(num_vars));
+ memcpy(flag_live, live_intervals->block_data[block->num].flag_liveout,
+ sizeof(BITSET_WORD));
+
+ foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
+ if (inst->dst.file == VGRF) {
+ const unsigned var = live_intervals->var_from_reg(inst->dst);
+ bool result_live = false;
+
+ for (unsigned i = 0; i < regs_written(inst); i++) /* live if any written register is live */
+ result_live |= BITSET_TEST(live, var + i);
+
+ if (!result_live &&
+ (can_omit_write(inst) || can_eliminate(inst, flag_live))) {
+ inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
+ progress = true;
+ }
+ }
+
+ if (inst->dst.is_null() && can_eliminate(inst, flag_live)) {
+ inst->opcode = BRW_OPCODE_NOP; /* only marked here; removed further down */
+ progress = true;
+ }
+
+ if (inst->dst.file == VGRF) {
+ if (!inst->is_partial_write()) { /* a complete write ends the liveness of what it overwrites */
+ int var = live_intervals->var_from_reg(inst->dst);
+ for (unsigned i = 0; i < regs_written(inst); i++) {
+ BITSET_CLEAR(live, var + i);
+ }
+ }
+ }
+
+ if (!inst->predicate && inst->exec_size >= 8)
+ flag_live[0] &= ~inst->flags_written();
+
+ if (inst->opcode == BRW_OPCODE_NOP) {
+ inst->remove(block);
+ continue; /* a removed instruction contributes no reads */
+ }
+
+ for (int i = 0; i < inst->sources; i++) {
+ if (inst->src[i].file == VGRF) {
+ int var = live_intervals->var_from_reg(inst->src[i]);
+
+ for (unsigned j = 0; j < regs_read(inst, i); j++) {
+ BITSET_SET(live, var + j);
+ }
+ }
+ }
+
+ flag_live[0] |= inst->flags_read(devinfo);
+ }
+ }
+
+ ralloc_free(live);
+ ralloc_free(flag_live);
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
new file mode 100644
index 00000000000..aeed6a11977
--- /dev/null
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -0,0 +1,2126 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_generator.cpp
+ *
+ * This file supports generating code from the FS LIR to the actual
+ * native instructions.
+ */
+
+#include "brw_eu.h"
+#include "brw_fs.h"
+#include "brw_cfg.h"
+
+static enum brw_reg_file
+brw_file_from_reg(fs_reg *reg)
+{ /* Maps the register file of an fs_reg to the corresponding
+ * brw_reg_file value.  FIXED_GRF and VGRF both map to the general
+ * register file.  BAD_FILE, ATTR and UNIFORM are not expected here and
+ * hit unreachable().
+ */
+ switch (reg->file) {
+ case ARF:
+ return BRW_ARCHITECTURE_REGISTER_FILE;
+ case FIXED_GRF:
+ case VGRF:
+ return BRW_GENERAL_REGISTER_FILE;
+ case MRF:
+ return BRW_MESSAGE_REGISTER_FILE;
+ case IMM:
+ return BRW_IMMEDIATE_VALUE;
+ case BAD_FILE:
+ case ATTR:
+ case UNIFORM:
+ unreachable("not reached");
+ }
+ return BRW_ARCHITECTURE_REGISTER_FILE; /* fallback return after the switch */
+}
+
+static struct brw_reg
+brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen, bool compressed)
+{ /* Translates an fs_reg operand of inst into a struct brw_reg.
+ *
+ *  - MRF / VGRF: builds a one-element register when the stride is 0,
+ *    otherwise a region whose width is limited both by the size of a GRF
+ *    and by the execution size (halved when compressed is set); type,
+ *    byte offset, abs and negate are then taken over from reg.
+ *  - ARF / FIXED_GRF / IMM: uses reg->as_brw_reg() as is; the offset
+ *    must be 0.
+ *  - BAD_FILE: yields the null register.
+ *  - ATTR / UNIFORM: not expected here, hits unreachable().
+ *
+ * gen is only used to bound-check the MRF number.
+ */
+ struct brw_reg brw_reg;
+
+ switch (reg->file) {
+ case MRF:
+ assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(gen));
+ /* Fallthrough */
+ case VGRF:
+ if (reg->stride == 0) {
+ brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
+ } else {
+ /* From the Haswell PRM:
+ *
+ * "VertStride must be used to cross GRF register boundaries. This
+ * rule implies that elements within a 'Width' cannot cross GRF
+ * boundaries."
+ *
+ * The maximum width value that could satisfy this restriction is:
+ */
+ const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
+
+ /* Because the hardware can only split source regions at a whole
+ * multiple of width during decompression (i.e. vertically), clamp
+ * the value obtained above to the physical execution size of a
+ * single decompressed chunk of the instruction:
+ */
+ const unsigned phys_width = compressed ? inst->exec_size / 2 :
+ inst->exec_size;
+
+ /* XXX - The equation above is strictly speaking not correct on
+ * hardware that supports unbalanced GRF writes -- On Gen9+
+ * each decompressed chunk of the instruction may have a
+ * different execution size when the number of components
+ * written to each destination GRF is not the same.
+ */
+ const unsigned width = MIN2(reg_width, phys_width);
+ brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
+ brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
+ }
+
+ brw_reg = retype(brw_reg, reg->type);
+ brw_reg = byte_offset(brw_reg, reg->offset);
+ brw_reg.abs = reg->abs;
+ brw_reg.negate = reg->negate;
+ break;
+ case ARF:
+ case FIXED_GRF:
+ case IMM:
+ assert(reg->offset == 0);
+ brw_reg = reg->as_brw_reg();
+ break;
+ case BAD_FILE:
+ /* Probably unused. */
+ brw_reg = brw_null_reg();
+ break;
+ case ATTR:
+ case UNIFORM:
+ unreachable("not reached");
+ }
+
+ return brw_reg;
+}
+
+fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
+ void *mem_ctx,
+ const void *key,
+ struct brw_stage_prog_data *prog_data,
+ unsigned promoted_constants,
+ bool runtime_check_aads_emit,
+ gl_shader_stage stage)
+
+ : compiler(compiler), log_data(log_data),
+ devinfo(compiler->devinfo), key(key),
+ prog_data(prog_data),
+ promoted_constants(promoted_constants),
+ runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
+ stage(stage), mem_ctx(mem_ctx)
+{ /* Stores the arguments in the members of the same name, takes devinfo
+ * from the compiler, clears debug_flag, and then allocates the zeroed
+ * brw_codegen state p out of mem_ctx and initializes it with
+ * brw_init_codegen().
+ */
+ p = rzalloc(mem_ctx, struct brw_codegen);
+ brw_init_codegen(devinfo, p, mem_ctx);
+}
+
+fs_generator::~fs_generator()
+{ /* Intentionally empty: the destructor body releases nothing itself. */
+}
+
+/**
+ * List node that remembers one instruction index, so that the
+ * instruction can be looked up and patched later on.
+ */
+class ip_record : public exec_node {
+public:
+   DECLARE_RALLOC_CXX_OPERATORS(ip_record)
+
+   ip_record(int ip) : ip(ip)
+   {
+   }
+
+   /** Index of the recorded instruction. */
+   int ip;
+};
+
+bool
+fs_generator::patch_discard_jumps_to_fb_writes()
+{ /* Emits one final HALT and patches the UIP of every HALT recorded in
+ * discard_halt_patches.
+ *
+ * Nothing is emitted and false is returned on gen < 6 or when no HALT
+ * has been recorded.  Otherwise each recorded HALT gets as UIP its
+ * distance to the instruction count sampled after the final HALT was
+ * emitted, scaled by brw_jump_scale(); the list is then emptied and
+ * true is returned.
+ */
+ if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
+ return false;
+
+ int scale = brw_jump_scale(p->devinfo);
+
+ /* There is a somewhat strange undocumented requirement of using
+ * HALT, according to the simulator. If some channel has HALTed to
+ * a particular UIP, then by the end of the program, every channel
+ * must have HALTed to that UIP. Furthermore, the tracking is a
+ * stack, so you can't do the final halt of a UIP after starting
+ * halting to a new UIP.
+ *
+ * Symptoms of not emitting this instruction on actual hardware
+ * included GPU hangs and sparkly rendering on the piglit discard
+ * tests.
+ */
+ brw_inst *last_halt = gen6_HALT(p);
+ brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
+ brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
+
+ int ip = p->nr_insn; /* sampled after the final HALT has been emitted */
+
+ foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
+ brw_inst *patch = &p->store[patch_ip->ip];
+
+ assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
+ /* HALT takes a half-instruction distance from the pre-incremented IP. */
+ brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
+ }
+
+ this->discard_halt_patches.make_empty();
+ return true;
+}
+
+void
+fs_generator::fire_fb_write(fs_inst *inst,
+ struct brw_reg payload,
+ struct brw_reg implied_header,
+ GLuint nr)
+{ /* Emits the render target write for inst through brw_fb_WRITE().
+ *
+ * On gen < 6 a SIMD8 MOV with masking, predication and compression
+ * disabled first copies g1 to offset(payload, 1).  The message control
+ * is then chosen from the opcode (replicated write), the dual source
+ * blend state together with inst->group, or the execution size.  The
+ * surface index is the render target start of the binding table plus
+ * inst->target, and that surface is marked as used afterwards.
+ */
+ uint32_t msg_control;
+
+ struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
+
+ if (devinfo->gen < 6) {
+ brw_push_insn_state(p);
+ brw_set_default_exec_size(p, BRW_EXECUTE_8);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
+ brw_pop_insn_state(p);
+ }
+
+ if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
+ msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
+ else if (prog_data->dual_src_blend) {
+ if (!inst->group)
+ msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
+ else
+ msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
+ } else if (inst->exec_size == 16)
+ msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
+ else
+ msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
+
+ uint32_t surf_index =
+ prog_data->binding_table.render_target_start + inst->target;
+
+ bool last_render_target = inst->eot ||
+ (prog_data->dual_src_blend && dispatch_width == 16);
+
+
+ brw_fb_WRITE(p,
+ payload,
+ implied_header,
+ msg_control,
+ surf_index,
+ nr,
+ 0,
+ inst->eot,
+ last_render_target,
+ inst->header_size != 0);
+
+ brw_mark_surface_used(&prog_data->base, surf_index);
+}
+
+/**
+ * Emit the render target write(s) for an FB write instruction.
+ *
+ * \param inst     IR instruction; supplies base_mrf, header_size, target,
+ *                 mlen and the other fields consumed by fire_fb_write().
+ * \param payload  First register of the message payload.  Replaced by the
+ *                 MRF at inst->base_mrf when base_mrf >= 0.
+ *
+ * When inst->header_size is non-zero the message header is set up here
+ * (explicitly on gen >= 6, through an implied move from g0 otherwise).  The
+ * write itself is emitted by fire_fb_write().  If runtime_check_aads_emit is
+ * set (gen < 6 only), two writes are emitted: the first one omits the leading
+ * payload register and is jumped over when bit 26 of g1.6 is set.
+ */
+void
+fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
+{
+   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
+   const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
+   struct brw_reg implied_header;
+
+   if (devinfo->gen < 8 && !devinfo->is_haswell) {
+      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+   }
+
+   if (inst->base_mrf >= 0)
+      payload = brw_message_reg(inst->base_mrf);
+
+   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
+    * move, here's g1.
+    */
+   if (inst->header_size != 0) {
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_set_default_flag_reg(p, 0, 0);
+
+      /* On HSW, the GPU will use the predicate on SENDC, unless the header is
+       * present.
+       */
+      if (prog_data->uses_kill) {
+         struct brw_reg pixel_mask;
+
+         if (devinfo->gen >= 6)
+            pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
+         else
+            pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+
+         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
+      }
+
+      if (devinfo->gen >= 6) {
+         /* Copy g0..g1 into the two header registers. */
+         brw_push_insn_state(p);
+         brw_set_default_exec_size(p, BRW_EXECUTE_16);
+         brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+         brw_MOV(p,
+                 retype(payload, BRW_REGISTER_TYPE_UD),
+                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+         brw_pop_insn_state(p);
+
+         /* Collect every extra bit that has to be set in header DWord 0 and
+          * apply them with a single OR.  Each bit used to get its own
+          * "header.0 = g0.0 | bit" instruction, so when both were needed the
+          * second OR overwrote the result of the first and dropped its bit.
+          * Reading g0 (rather than the header) as the source keeps this valid
+          * when the payload lives in a MRF.
+          */
+         uint32_t g00_bits = 0;
+
+         /* "Source0 Alpha Present to RenderTarget" bit in message header. */
+         if (inst->target > 0 && key->replicate_alpha)
+            g00_bits |= 0x1 << 11;
+
+         /* Set computes stencil to render target */
+         if (prog_data->computed_stencil)
+            g00_bits |= 0x1 << 14;
+
+         if (g00_bits != 0) {
+            brw_OR(p,
+                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
+                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+                   brw_imm_ud(g00_bits));
+         }
+
+         if (inst->target > 0) {
+            /* Set the render target index for choosing BLEND_STATE. */
+            brw_MOV(p, retype(vec1(suboffset(payload, 2)),
+                              BRW_REGISTER_TYPE_UD),
+                    brw_imm_ud(inst->target));
+         }
+
+         implied_header = brw_null_reg();
+      } else {
+         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
+      }
+
+      brw_pop_insn_state(p);
+   } else {
+      implied_header = brw_null_reg();
+   }
+
+   if (!runtime_check_aads_emit) {
+      fire_fb_write(inst, payload, implied_header, inst->mlen);
+   } else {
+      /* This can only happen in gen < 6 */
+      assert(devinfo->gen < 6);
+
+      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
+
+      /* Check runtime bit to detect if we have to send AA data or not */
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_AND(p,
+              v1_null_ud,
+              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
+              brw_imm_ud(1<<26));
+      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
+
+      /* Remember the JMPI as an index into the instruction store so it can
+       * be patched once the jump target is known.
+       */
+      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
+      brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
+      {
+         /* Don't send AA data */
+         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
+      }
+      brw_land_fwd_jump(p, jmp);
+      fire_fb_write(inst, payload, implied_header, inst->mlen);
+   }
+}
+
+/**
+ * Emit a render target read through gen9_fb_READ().
+ *
+ * The surface is the render target selected by inst->target, and the
+ * response length is inst->size_written expressed in whole registers.
+ */
+void
+fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
+                               struct brw_reg payload)
+{
+   /* The response length below is counted in full registers. */
+   assert(inst->size_written % REG_SIZE == 0);
+
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
+   const unsigned response_regs = inst->size_written / REG_SIZE;
+   const unsigned rt_surface =
+      wm_prog_data->binding_table.render_target_start + inst->target;
+
+   gen9_fb_READ(p, dst, payload, rt_surface,
+                inst->header_size, response_regs,
+                wm_prog_data->persample_dispatch);
+
+   brw_mark_surface_used(&wm_prog_data->base, rt_surface);
+}
+
+/**
+ * Emit a MOV whose source is \p reg displaced by a byte offset.
+ *
+ * \param inst                  IR instruction being generated.
+ * \param dst                   Destination of the MOV.
+ * \param reg                   Base of the region being indexed.
+ * \param indirect_byte_offset  UD byte offset, either an immediate or a GRF.
+ *
+ * An immediate offset is folded into the register number/sub-register and
+ * emitted as a plain MOV.  A GRF offset is added to the base offset into the
+ * address register and read back with VxH indirect addressing.
+ */
+void
+fs_generator::generate_mov_indirect(fs_inst *inst,
+                                    struct brw_reg dst,
+                                    struct brw_reg reg,
+                                    struct brw_reg indirect_byte_offset)
+{
+   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
+   /* Both files are handled below, so accept both.  Asserting on the GRF
+    * file alone made the immediate path unreachable in debug builds while
+    * leaving it live whenever asserts are compiled out.
+    */
+   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE ||
+          indirect_byte_offset.file == BRW_IMMEDIATE_VALUE);
+
+   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
+
+   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
+      imm_byte_offset += indirect_byte_offset.ud;
+
+      reg.nr = imm_byte_offset / REG_SIZE;
+      reg.subnr = imm_byte_offset % REG_SIZE;
+      brw_MOV(p, dst, reg);
+   } else {
+      /* Prior to Broadwell, there are only 8 address registers. */
+      assert(inst->exec_size == 8 || devinfo->gen >= 8);
+
+      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
+      struct brw_reg addr = vec8(brw_address_reg(0));
+
+      /* The destination stride of an instruction (in bytes) must be greater
+       * than or equal to the size of the rest of the instruction.  Since the
+       * address register is of type UW, we can't use a D-type instruction.
+       * In order to get around this, we retype to UW and use a stride.
+       */
+      indirect_byte_offset =
+         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
+
+      /* There are a number of reasons why we don't use the base offset here.
+       * One reason is that the field is only 9 bits which means we can only
+       * use it to access the first 16 GRFs.  Also, from the Haswell PRM
+       * section "Register Region Restrictions":
+       *
+       *    "The lower bits of the AddressImmediate must not overflow to
+       *    change the register address.  The lower 5 bits of Address
+       *    Immediate when added to lower 5 bits of address register gives
+       *    the sub-register offset. The upper bits of Address Immediate
+       *    when added to upper bits of address register gives the register
+       *    address. Any overflow from sub-register offset is dropped."
+       *
+       * Since the indirect may cause us to cross a register boundary, this
+       * makes the base offset almost useless.  We could try and do something
+       * clever where we use a actual base offset if base_offset % 32 == 0 but
+       * that would mean we were generating different code depending on the
+       * base offset.  Instead, for the sake of consistency, we'll just do the
+       * add ourselves.  This restriction is only listed in the Haswell PRM
+       * but empirical testing indicates that it applies on all older
+       * generations and is lifted on Broadwell.
+       *
+       * In the end, while base_offset is nice to look at in the generated
+       * code, using it saves us 0 instructions and would require quite a bit
+       * of case-by-case work.  It's just not worth it.
+       */
+      brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
+      struct brw_reg ind_src = brw_VxH_indirect(0, 0);
+
+      brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));
+
+      if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
+          !inst->get_next()->is_tail_sentinel() &&
+          ((fs_inst *)inst->get_next())->mlen > 0) {
+         /* From the Sandybridge PRM:
+          *
+          *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
+          *    instruction that “indexed/indirect” source AND is followed by a
+          *    send, the instruction requires a “Switch”. This is to avoid
+          *    race condition where send may dispatch before MRF is updated."
+          */
+         brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
+      }
+   }
+}
+
+/**
+ * Emit a SIMD8 URB read SEND.
+ *
+ * \p header is the message header (a UD GRF) and \p dst receives
+ * inst->size_written / REG_SIZE registers.  The per-slot offset bit is set
+ * for SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT.
+ */
+void
+fs_generator::generate_urb_read(fs_inst *inst,
+                                struct brw_reg dst,
+                                struct brw_reg header)
+{
+   assert(inst->size_written % REG_SIZE == 0);
+   assert(header.file == BRW_GENERAL_REGISTER_FILE);
+   assert(header.type == BRW_REGISTER_TYPE_UD);
+
+   const bool per_slot =
+      inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT;
+   const unsigned response_regs = inst->size_written / REG_SIZE;
+
+   brw_inst *read = brw_next_insn(p, BRW_OPCODE_SEND);
+
+   /* Operands. */
+   brw_set_dest(p, read, retype(dst, BRW_REGISTER_TYPE_UD));
+   brw_set_src0(p, read, header);
+   brw_set_src1(p, read, brw_imm_ud(0u));
+
+   /* Message descriptor. */
+   brw_inst_set_sfid(p->devinfo, read, BRW_SFID_URB);
+   brw_inst_set_urb_opcode(p->devinfo, read, GEN8_URB_OPCODE_SIMD8_READ);
+   if (per_slot)
+      brw_inst_set_urb_per_slot_offset(p->devinfo, read, true);
+
+   brw_inst_set_mlen(p->devinfo, read, inst->mlen);
+   brw_inst_set_rlen(p->devinfo, read, response_regs);
+   brw_inst_set_header_present(p->devinfo, read, true);
+   brw_inst_set_urb_global_offset(p->devinfo, read, inst->offset);
+}
+
+/**
+ * Emit a SIMD8 URB write SEND with a null destination.
+ *
+ * The per-slot offset and channel mask bits are derived from the opcode;
+ * message length, EOT and global offset come straight from \p inst.
+ */
+void
+fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
+{
+   const bool per_slot =
+      inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
+      inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
+   const bool masked =
+      inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
+      inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
+
+   brw_inst *write = brw_next_insn(p, BRW_OPCODE_SEND);
+
+   /* Operands: nothing is returned, so the destination is null. */
+   brw_set_dest(p, write, brw_null_reg());
+   brw_set_src0(p, write, payload);
+   brw_set_src1(p, write, brw_imm_d(0));
+
+   /* Message descriptor. */
+   brw_inst_set_sfid(p->devinfo, write, BRW_SFID_URB);
+   brw_inst_set_urb_opcode(p->devinfo, write, GEN8_URB_OPCODE_SIMD8_WRITE);
+
+   if (per_slot)
+      brw_inst_set_urb_per_slot_offset(p->devinfo, write, true);
+
+   if (masked)
+      brw_inst_set_urb_channel_mask_present(p->devinfo, write, true);
+
+   brw_inst_set_mlen(p->devinfo, write, inst->mlen);
+   brw_inst_set_rlen(p->devinfo, write, 0);
+   brw_inst_set_eot(p->devinfo, write, inst->eot);
+   brw_inst_set_header_present(p->devinfo, write, true);
+   brw_inst_set_urb_global_offset(p->devinfo, write, inst->offset);
+}
+
+/**
+ * Terminate a compute shader by sending a one-register, header-less message
+ * to the thread spawner.  EOT is taken from \p inst.
+ */
+void
+fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
+{
+   const struct brw_reg null_uw =
+      retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
+   const struct brw_reg payload_uw = retype(payload, BRW_REGISTER_TYPE_UW);
+
+   struct brw_inst *term = brw_next_insn(p, BRW_OPCODE_SEND);
+
+   brw_set_dest(p, term, null_uw);
+   brw_set_src0(p, term, payload_uw);
+   brw_set_src1(p, term, brw_imm_d(0));
+
+   /* Address the thread spawner with a single payload register and no
+    * response.
+    */
+   brw_inst_set_sfid(devinfo, term, BRW_SFID_THREAD_SPAWNER);
+   brw_inst_set_mlen(devinfo, term, 1);
+   brw_inst_set_rlen(devinfo, term, 0);
+   brw_inst_set_eot(devinfo, term, inst->eot);
+   brw_inst_set_header_present(devinfo, term, false);
+
+   brw_inst_set_ts_opcode(devinfo, term, 0); /* Dereference resource */
+   brw_inst_set_ts_request_type(devinfo, term, 0); /* Root thread */
+
+   /* The thread does have a URB resource associated with it, but that
+    * resource is managed by the fixed-function unit, which frees it
+    * automatically.  Hence the "do not dereference URB" bit is set.
+    */
+   brw_inst_set_ts_resource_select(devinfo, term, 1); /* Do not dereference URB */
+
+   brw_inst_set_mask_control(devinfo, term, BRW_MASK_DISABLE);
+}
+
+/* Emit brw_barrier() on `src` immediately followed by brw_WAIT().
+ * `inst` is not read by this function.
+ */
+void
+fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
+{
+ brw_barrier(p, src);
+ brw_WAIT(p);
+}
+
+/**
+ * Emit a linear interpolation, as a single PLN when possible and as a
+ * LINE/MAC pair otherwise.
+ *
+ * src[0] holds the deltas and src[1] the interpolation coefficients.
+ */
+void
+fs_generator::generate_linterp(fs_inst *inst,
+                               struct brw_reg dst, struct brw_reg *src)
+{
+   /* PLN reads:
+    *                      /   in SIMD16   \
+    *    -----------------------------------
+    *   | src1+0 | src1+1 | src1+2 | src1+3 |
+    *   |-----------------------------------|
+    *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
+    *    -----------------------------------
+    *
+    * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
+    *
+    *    -----------------------------------
+    *   | src1+0 | src1+1 | src1+2 | src1+3 |
+    *   |-----------------------------------|
+    *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
+    *   |-----------------------------------|
+    *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
+    *    -----------------------------------
+    *
+    * See also: emit_interpolation_setup_gen4().
+    */
+   const struct brw_reg coeffs = src[1];
+   const struct brw_reg x_deltas = src[0];
+
+   /* PLN needs hardware support and, before gen7, an even-numbered delta
+    * register.
+    */
+   const bool use_pln =
+      devinfo->has_pln && (devinfo->gen >= 7 || (x_deltas.nr & 1) == 0);
+
+   if (!use_pln) {
+      /* The Y deltas follow the X deltas, exec_size / 8 registers later. */
+      const struct brw_reg y_deltas = offset(src[0], inst->exec_size / 8);
+
+      brw_LINE(p, brw_null_reg(), coeffs, x_deltas);
+      brw_MAC(p, dst, suboffset(coeffs, 1), y_deltas);
+      return;
+   }
+
+   brw_PLN(p, dst, coeffs, x_deltas);
+}
+
+/**
+ * Emit a sampler RESINFO message for the surface given by the immediate
+ * \p surf_index (gen7+), then mark that surface as used.
+ *
+ * The response is 4 registers in SIMD8 and 8 registers in SIMD16.
+ */
+void
+fs_generator::generate_get_buffer_size(fs_inst *inst,
+                                       struct brw_reg dst,
+                                       struct brw_reg src,
+                                       struct brw_reg surf_index)
+{
+   assert(devinfo->gen >= 7);
+   assert(surf_index.file == BRW_IMMEDIATE_VALUE);
+
+   uint32_t sampler_simd_mode;
+   int response_regs;
+
+   if (inst->exec_size == 8) {
+      sampler_simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+      response_regs = 4;
+   } else if (inst->exec_size == 16) {
+      sampler_simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+      response_regs = 8;
+      dst = vec16(dst);
+   } else {
+      unreachable("Invalid width for texture instruction");
+   }
+
+   const uint32_t surface = surf_index.ud;
+   const bool has_header = inst->header_size > 0;
+
+   brw_SAMPLE(p,
+              retype(dst, BRW_REGISTER_TYPE_UW),
+              inst->base_mrf,
+              src,
+              surface,
+              0,
+              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
+              response_regs,
+              inst->mlen,
+              has_header,
+              sampler_simd_mode,
+              BRW_SAMPLER_RETURN_FORMAT_SINT32);
+
+   brw_mark_surface_used(prog_data, surface);
+}
+
+/* Emit the sampler message for a texturing instruction.
+ *
+ * The return format is picked from dst.type (and forced to UINT32 for TXS),
+ * the SIMD mode from inst->exec_size, and the message type from the opcode
+ * and hardware generation. If inst->header_size is non-zero the message
+ * header is set up first. When both surface_index and sampler_index are
+ * immediates the message is emitted with brw_SAMPLE(); otherwise the
+ * surface/sampler bits are computed into a0.0 and an indirect send is used.
+ * If inst->eot is set, the last emitted instruction is turned into a SENDC
+ * with EOT.
+ */
+void
+fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
+ struct brw_reg surface_index,
+ struct brw_reg sampler_index)
+{
+ assert(inst->size_written % REG_SIZE == 0);
+ int msg_type = -1;
+ uint32_t simd_mode;
+ uint32_t return_format;
+ bool is_combined_send = inst->eot;
+
+ switch (dst.type) {
+ case BRW_REGISTER_TYPE_D:
+ return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
+ break;
+ case BRW_REGISTER_TYPE_UD:
+ return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
+ break;
+ default:
+ return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
+ break;
+ }
+
+ /* Stomp the resinfo output type to UINT32. On gens 4-5, the output type
+ * is set as part of the message descriptor. On gen4, the PRM seems to
+ * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
+ * later gens UINT32 is required. Once you hit Sandy Bridge, the bit is
+ * gone from the message descriptor entirely and you just get UINT32 all
+ * the time regardless. Since we can really only do non-UINT32 on gen4,
+ * just stomp it to UINT32 all the time.
+ */
+ if (inst->opcode == SHADER_OPCODE_TXS)
+ return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
+
+ switch (inst->exec_size) {
+ case 8:
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+ break;
+ case 16:
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+ break;
+ default:
+ unreachable("Invalid width for texture instruction");
+ }
+
+ /* Gen5+ has one message type per operation; older hardware encodes the
+ * SIMD width (and sometimes shadow compare) in the message type/length.
+ */
+ if (devinfo->gen >= 5) {
+ switch (inst->opcode) {
+ case SHADER_OPCODE_TEX:
+ if (inst->shadow_compare) {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
+ } else {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
+ }
+ break;
+ case FS_OPCODE_TXB:
+ if (inst->shadow_compare) {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
+ } else {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
+ }
+ break;
+ case SHADER_OPCODE_TXL:
+ if (inst->shadow_compare) {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
+ } else {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
+ }
+ break;
+ case SHADER_OPCODE_TXL_LZ:
+ assert(devinfo->gen >= 9);
+ if (inst->shadow_compare) {
+ msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ;
+ } else {
+ msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ;
+ }
+ break;
+ case SHADER_OPCODE_TXS:
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
+ break;
+ case SHADER_OPCODE_TXD:
+ if (inst->shadow_compare) {
+ /* Gen7.5+. Otherwise, lowered in NIR */
+ assert(devinfo->gen >= 8 || devinfo->is_haswell);
+ msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
+ } else {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
+ }
+ break;
+ case SHADER_OPCODE_TXF:
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
+ break;
+ case SHADER_OPCODE_TXF_LZ:
+ assert(devinfo->gen >= 9);
+ msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
+ break;
+ case SHADER_OPCODE_TXF_CMS_W:
+ assert(devinfo->gen >= 9);
+ msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
+ break;
+ case SHADER_OPCODE_TXF_CMS:
+ if (devinfo->gen >= 7)
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
+ else
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
+ break;
+ case SHADER_OPCODE_TXF_UMS:
+ assert(devinfo->gen >= 7);
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
+ break;
+ case SHADER_OPCODE_TXF_MCS:
+ assert(devinfo->gen >= 7);
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
+ break;
+ case SHADER_OPCODE_LOD:
+ msg_type = GEN5_SAMPLER_MESSAGE_LOD;
+ break;
+ case SHADER_OPCODE_TG4:
+ if (inst->shadow_compare) {
+ assert(devinfo->gen >= 7);
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
+ } else {
+ assert(devinfo->gen >= 6);
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
+ }
+ break;
+ case SHADER_OPCODE_TG4_OFFSET:
+ assert(devinfo->gen >= 7);
+ if (inst->shadow_compare) {
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
+ } else {
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
+ }
+ break;
+ case SHADER_OPCODE_SAMPLEINFO:
+ msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
+ break;
+ default:
+ unreachable("not reached");
+ }
+ } else {
+ switch (inst->opcode) {
+ case SHADER_OPCODE_TEX:
+ /* Note that G45 and older determines shadow compare and dispatch width
+ * from message length for most messages.
+ */
+ if (inst->exec_size == 8) {
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+ if (inst->shadow_compare) {
+ assert(inst->mlen == 6);
+ } else {
+ assert(inst->mlen <= 4);
+ }
+ } else {
+ if (inst->shadow_compare) {
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
+ assert(inst->mlen == 9);
+ } else {
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
+ assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
+ }
+ }
+ break;
+ case FS_OPCODE_TXB:
+ if (inst->shadow_compare) {
+ assert(inst->exec_size == 8);
+ assert(inst->mlen == 6);
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
+ } else {
+ assert(inst->mlen == 9);
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+ }
+ break;
+ case SHADER_OPCODE_TXL:
+ if (inst->shadow_compare) {
+ assert(inst->exec_size == 8);
+ assert(inst->mlen == 6);
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
+ } else {
+ assert(inst->mlen == 9);
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+ }
+ break;
+ case SHADER_OPCODE_TXD:
+ /* There is no sample_d_c message; comparisons are done manually */
+ assert(inst->exec_size == 8);
+ assert(inst->mlen == 7 || inst->mlen == 10);
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
+ break;
+ case SHADER_OPCODE_TXF:
+ assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+ break;
+ case SHADER_OPCODE_TXS:
+ assert(inst->mlen == 3);
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+ break;
+ default:
+ unreachable("not reached");
+ }
+ }
+ assert(msg_type != -1);
+
+ /* simd_mode may have been forced to SIMD16 above, so test it rather than
+ * inst->exec_size.
+ */
+ if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
+ dst = vec16(dst);
+ }
+
+ assert(devinfo->gen < 7 || inst->header_size == 0 ||
+ src.file == BRW_GENERAL_REGISTER_FILE);
+
+ assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
+
+ /* Load the message header if present. If there's a texture offset,
+ * we need to set it up explicitly and load the offset bitfield.
+ * Otherwise, we can use an implied move from g0 to the first message reg.
+ */
+ if (inst->header_size != 0) {
+ if (devinfo->gen < 6 && !inst->offset) {
+ /* Set up an implied move from g0 to the MRF. */
+ src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
+ } else {
+ struct brw_reg header_reg;
+
+ if (devinfo->gen >= 7) {
+ header_reg = src;
+ } else {
+ assert(inst->base_mrf != -1);
+ header_reg = brw_message_reg(inst->base_mrf);
+ }
+
+ brw_push_insn_state(p);
+ brw_set_default_exec_size(p, BRW_EXECUTE_8);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ /* Explicitly set up the message header by copying g0 to the MRF. */
+ brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
+
+ if (inst->offset) {
+ /* Set the offset bits in DWord 2. */
+ brw_MOV(p, get_element_ud(header_reg, 2),
+ brw_imm_ud(inst->offset));
+ } else if (stage != MESA_SHADER_VERTEX &&
+ stage != MESA_SHADER_FRAGMENT) {
+ /* The vertex and fragment stages have g0.2 set to 0, so
+ * header0.2 is 0 when g0 is copied. Other stages may not, so we
+ * must set it to 0 to avoid setting undesirable bits in the
+ * message.
+ */
+ brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0));
+ }
+
+ brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
+ brw_pop_insn_state(p);
+ }
+ }
+
+ /* The gather4 opcodes use their own section of the binding table. */
+ uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
+ inst->opcode == SHADER_OPCODE_TG4_OFFSET)
+ ? prog_data->binding_table.gather_texture_start
+ : prog_data->binding_table.texture_start;
+
+ if (surface_index.file == BRW_IMMEDIATE_VALUE &&
+ sampler_index.file == BRW_IMMEDIATE_VALUE) {
+ uint32_t surface = surface_index.ud;
+ uint32_t sampler = sampler_index.ud;
+
+ brw_SAMPLE(p,
+ retype(dst, BRW_REGISTER_TYPE_UW),
+ inst->base_mrf,
+ src,
+ surface + base_binding_table_index,
+ sampler % 16,
+ msg_type,
+ inst->size_written / REG_SIZE,
+ inst->mlen,
+ inst->header_size != 0,
+ simd_mode,
+ return_format);
+
+ brw_mark_surface_used(prog_data, surface + base_binding_table_index);
+ } else {
+ /* Non-const sampler index */
+
+ struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+ struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
+ struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
+
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ /* Build (sampler << 8) | surface in a0.0. When both indices live in the
+ * same register a single multiply by 0x101 produces the same value.
+ */
+ if (brw_regs_equal(&surface_reg, &sampler_reg)) {
+ brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+ } else {
+ if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
+ brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
+ } else {
+ brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
+ brw_OR(p, addr, addr, surface_reg);
+ }
+ }
+ if (base_binding_table_index)
+ brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
+ /* Keep only the low 12 bits of the computed value. */
+ brw_AND(p, addr, addr, brw_imm_ud(0xfff));
+
+ brw_pop_insn_state(p);
+
+ /* dst = send(offset, a0.0 | <descriptor>) */
+ brw_inst *insn = brw_send_indirect_message(
+ p, BRW_SFID_SAMPLER, dst, src, addr);
+ brw_set_sampler_message(p, insn,
+ 0 /* surface */,
+ 0 /* sampler */,
+ msg_type,
+ inst->size_written / REG_SIZE,
+ inst->mlen /* mlen */,
+ inst->header_size != 0 /* header */,
+ simd_mode,
+ return_format);
+
+ /* visitor knows more than we do about the surface limit required,
+ * so has already done marking.
+ */
+ }
+
+ /* An EOT texture instruction becomes a SENDC with the EOT bit set. */
+ if (is_combined_send) {
+ brw_inst_set_eot(p->devinfo, brw_last_inst, true);
+ brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
+ }
+}
+
+
+/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
+ * looking like:
+ *
+ * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
+ *
+ * Ideally, we want to produce:
+ *
+ * DDX DDY
+ * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
+ * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
+ * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
+ * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
+ * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
+ * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
+ * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
+ * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
+ *
+ * and add another set of two more subspans if in 16-pixel dispatch mode.
+ *
+ * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
+ * for each pair, and vertstride = 2 jumps us 2 elements after processing a
+ * pair. But the ideal approximation may impose a huge performance cost on
+ * sample_d. On at least Haswell, sample_d instruction does some
+ * optimizations if the same LOD is used for all pixels in the subspan.
+ *
+ * For DDY, we need to use ALIGN16 mode since it's capable of doing the
+ * appropriate swizzling.
+ */
+void
+fs_generator::generate_ddx(enum opcode opcode,
+                           struct brw_reg dst, struct brw_reg src)
+{
+   /* FINE keeps one derivative per pixel pair (<2;2,0> regions); otherwise
+    * the derivative of the first pair is replicated across the whole
+    * subspan (<4;4,0> regions).
+    */
+   const bool fine = opcode == FS_OPCODE_DDX_FINE;
+   const unsigned region_vstride =
+      fine ? BRW_VERTICAL_STRIDE_2 : BRW_VERTICAL_STRIDE_4;
+   const unsigned region_width = fine ? BRW_WIDTH_2 : BRW_WIDTH_4;
+
+   /* Same register as src, starting at element 1 ... */
+   struct brw_reg right_px = brw_reg(src.file, src.nr, 1,
+                                     src.negate, src.abs,
+                                     BRW_REGISTER_TYPE_F,
+                                     region_vstride,
+                                     region_width,
+                                     BRW_HORIZONTAL_STRIDE_0,
+                                     BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+   /* ... and starting at element 0. */
+   struct brw_reg left_px = brw_reg(src.file, src.nr, 0,
+                                    src.negate, src.abs,
+                                    BRW_REGISTER_TYPE_F,
+                                    region_vstride,
+                                    region_width,
+                                    BRW_HORIZONTAL_STRIDE_0,
+                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+
+   /* dst = right - left */
+   brw_ADD(p, dst, right_px, negate(left_px));
+}
+
+/* Emit the Y derivative as a single ADD computing "second - first", where
+ * both operands are regions of the same source register.
+ *
+ * FS_OPCODE_DDY_FINE runs the ADD in ALIGN16 mode and selects the operands
+ * with the XYXY and ZWZW swizzles.  Any other opcode stays in the current
+ * access mode and uses <4;4,0> regions starting at elements 0 and 2, which
+ * replicates one derivative across each group of four channels.
+ */
+void
+fs_generator::generate_ddy(enum opcode opcode,
+                           struct brw_reg dst, struct brw_reg src)
+{
+   const bool fine = opcode == FS_OPCODE_DDY_FINE;
+   struct brw_reg first_row, second_row;
+
+   if (fine) {
+      /* produce accurate derivatives */
+      first_row = brw_reg(src.file, src.nr, 0,
+                          src.negate, src.abs,
+                          BRW_REGISTER_TYPE_F,
+                          BRW_VERTICAL_STRIDE_4,
+                          BRW_WIDTH_4,
+                          BRW_HORIZONTAL_STRIDE_1,
+                          BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
+      second_row = brw_reg(src.file, src.nr, 0,
+                           src.negate, src.abs,
+                           BRW_REGISTER_TYPE_F,
+                           BRW_VERTICAL_STRIDE_4,
+                           BRW_WIDTH_4,
+                           BRW_HORIZONTAL_STRIDE_1,
+                           BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
+   } else {
+      /* replicate the derivative at the top-left pixel to other pixels */
+      first_row = brw_reg(src.file, src.nr, 0,
+                          src.negate, src.abs,
+                          BRW_REGISTER_TYPE_F,
+                          BRW_VERTICAL_STRIDE_4,
+                          BRW_WIDTH_4,
+                          BRW_HORIZONTAL_STRIDE_0,
+                          BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+      second_row = brw_reg(src.file, src.nr, 2,
+                           src.negate, src.abs,
+                           BRW_REGISTER_TYPE_F,
+                           BRW_VERTICAL_STRIDE_4,
+                           BRW_WIDTH_4,
+                           BRW_HORIZONTAL_STRIDE_0,
+                           BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+   }
+
+   /* Only the swizzled variant needs ALIGN16; scope the mode change to the
+    * one ADD.
+    */
+   if (fine) {
+      brw_push_insn_state(p);
+      brw_set_default_access_mode(p, BRW_ALIGN_16);
+   }
+
+   brw_ADD(p, dst, negate(first_row), second_row);
+
+   if (fine)
+      brw_pop_insn_state(p);
+}
+
+/**
+ * Emit the HALT used for a discard jump (gen6+) and record its instruction
+ * index in discard_halt_patches for later patching.
+ */
+void
+fs_generator::generate_discard_jump(fs_inst *inst)
+{
+   assert(devinfo->gen >= 6);
+
+   /* The index is captured before the HALT is emitted, so it identifies the
+    * HALT itself.  UIP is patched at FB write time to point at the end of
+    * the program; brw_uip_jip() later sets JIP to the end of the current
+    * block (or the program).
+    */
+   ip_record *const halt_ip = new(mem_ctx) ip_record(p->nr_insn);
+   this->discard_halt_patches.push_tail(halt_ip);
+
+   gen6_HALT(p);
+}
+
+/**
+ * Spill \p src to scratch space with OWord block writes, splitting the
+ * instruction into chunks of at most 16 channels unless it is marked
+ * force_writemask_all.
+ */
+void
+fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
+{
+   /* The 32-wide messages only respect the first 16-wide half of the channel
+    * enable signals which are replicated identically for the second group of
+    * 16 channels, so we cannot use them unless the write is marked
+    * force_writemask_all.
+    */
+   unsigned chunk_width = inst->exec_size;
+   if (!inst->force_writemask_all && chunk_width > 16)
+      chunk_width = 16;
+
+   /* Registers covered by one chunk of 4-byte channels. */
+   const unsigned regs_per_chunk = 4 * chunk_width / REG_SIZE;
+   assert(inst->mlen != 0);
+
+   brw_push_insn_state(p);
+   brw_set_default_exec_size(p, cvt(chunk_width) - 1);
+   brw_set_default_compression(p, chunk_width > 8);
+
+   const unsigned num_chunks = inst->exec_size / chunk_width;
+   for (unsigned chunk = 0; chunk < num_chunks; chunk++) {
+      brw_set_default_group(p, inst->group + chunk_width * chunk);
+
+      /* Stage this chunk's data in the MRF following the header ... */
+      brw_MOV(p, brw_uvec_mrf(chunk_width, inst->base_mrf + 1, 0),
+              retype(offset(src, regs_per_chunk * chunk),
+                     BRW_REGISTER_TYPE_UD));
+
+      /* ... and write it out at the matching scratch offset. */
+      brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
+                                    regs_per_chunk,
+                                    inst->offset +
+                                    regs_per_chunk * REG_SIZE * chunk);
+   }
+
+   brw_pop_insn_state(p);
+}
+
+/**
+ * Fill \p dst from scratch space with an OWord block read that uses the MRF
+ * at inst->base_mrf; exec_size / 8 registers are read from inst->offset.
+ */
+void
+fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
+{
+   assert(inst->exec_size <= 16 || inst->force_writemask_all);
+   assert(inst->mlen != 0);
+
+   const struct brw_reg msg_mrf = brw_message_reg(inst->base_mrf);
+   const int num_regs = inst->exec_size / 8;
+
+   brw_oword_block_read_scratch(p, dst, msg_mrf, num_regs, inst->offset);
+}
+
+/**
+ * Fill \p dst from scratch space through gen7_block_read_scratch();
+ * exec_size / 8 registers are read from inst->offset.
+ */
+void
+fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
+{
+   assert(inst->exec_size <= 16 || inst->force_writemask_all);
+
+   const int num_regs = inst->exec_size / 8;
+
+   gen7_block_read_scratch(p, dst, num_regs, inst->offset);
+}
+
+/**
+ * Emit an OWord block read of a pull constant buffer.
+ *
+ * Both \p index (the surface) and \p offset (the read offset) must be UD
+ * immediates; the message is built in the MRF at inst->base_mrf.
+ */
+void
+fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
+                                                  struct brw_reg dst,
+                                                  struct brw_reg index,
+                                                  struct brw_reg offset)
+{
+   assert(type_sz(dst.type) == 4);
+   assert(inst->mlen != 0);
+
+   assert(index.file == BRW_IMMEDIATE_VALUE &&
+          index.type == BRW_REGISTER_TYPE_UD);
+   const uint32_t surface = index.ud;
+
+   assert(offset.file == BRW_IMMEDIATE_VALUE &&
+          offset.type == BRW_REGISTER_TYPE_UD);
+   const uint32_t byte_offset = offset.ud;
+
+   const struct brw_reg msg_mrf = brw_message_reg(inst->base_mrf);
+
+   brw_oword_block_read(p, dst, msg_mrf, byte_offset, surface);
+}
+
+/**
+ * Emit a constant-cache OWord block read of a pull constant buffer.
+ *
+ * \p payload is the one-register message header (a GRF).  An immediate
+ * \p index is encoded directly in the descriptor; otherwise its low 8 bits
+ * are loaded into a0.0 and an indirect send is used.
+ */
+void
+fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
+                                                       struct brw_reg dst,
+                                                       struct brw_reg index,
+                                                       struct brw_reg payload)
+{
+   assert(index.type == BRW_REGISTER_TYPE_UD);
+   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
+   assert(type_sz(dst.type) == 4);
+
+   const struct brw_reg dst_ud = retype(dst, BRW_REGISTER_TYPE_UD);
+   const struct brw_reg payload_ud = retype(payload, BRW_REGISTER_TYPE_UD);
+
+   if (index.file == BRW_IMMEDIATE_VALUE) {
+      const uint32_t surface = index.ud;
+
+      /* Only the allocation of the SEND happens with the mask disabled. */
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_inst *read = brw_next_insn(p, BRW_OPCODE_SEND);
+      brw_pop_insn_state(p);
+
+      brw_set_dest(p, read, dst_ud);
+      brw_set_src0(p, read, payload_ud);
+      brw_set_dp_read_message(p, read, surface,
+                              BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
+                              GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
+                              GEN6_SFID_DATAPORT_CONSTANT_CACHE,
+                              1, /* mlen */
+                              true, /* header */
+                              DIV_ROUND_UP(inst->size_written, REG_SIZE));
+      return;
+   }
+
+   /* Dynamic surface index: route it through the address register. */
+   struct brw_reg a0 = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   /* a0.0 = surf_index & 0xff */
+   brw_inst *mask_index = brw_next_insn(p, BRW_OPCODE_AND);
+   brw_inst_set_exec_size(p->devinfo, mask_index, BRW_EXECUTE_1);
+   brw_set_dest(p, mask_index, a0);
+   brw_set_src0(p, mask_index, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
+   brw_set_src1(p, mask_index, brw_imm_ud(0x0ff));
+
+   /* dst = send(payload, a0.0 | <descriptor>) */
+   brw_inst *read = brw_send_indirect_message(
+      p, GEN6_SFID_DATAPORT_CONSTANT_CACHE, dst_ud, payload_ud, a0);
+   brw_set_dp_read_message(p, read, 0 /* surface */,
+                           BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
+                           GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
+                           GEN6_SFID_DATAPORT_CONSTANT_CACHE,
+                           1, /* mlen */
+                           true, /* header */
+                           DIV_ROUND_UP(inst->size_written, REG_SIZE));
+
+   brw_pop_insn_state(p);
+}
+
+/**
+ * Emit a varying-offset pull constant load as a sampler LD message
+ * (gen < 7).
+ *
+ * \p index must be a UD immediate naming the surface.  The message carries a
+ * header and is sent from the MRF at inst->base_mrf.
+ */
+void
+fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
+                                                       struct brw_reg dst,
+                                                       struct brw_reg index)
+{
+   assert(devinfo->gen < 7); /* Should use the gen7 variant. */
+   assert(inst->header_size != 0);
+   assert(inst->mlen);
+
+   assert(index.file == BRW_IMMEDIATE_VALUE &&
+          index.type == BRW_REGISTER_TYPE_UD);
+   const uint32_t surface = index.ud;
+
+   /* Start from what the execution size asks for ... */
+   uint32_t sampler_simd_mode, response_regs, ld_msg_type;
+   if (inst->exec_size == 16) {
+      sampler_simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+      response_regs = 8;
+   } else {
+      assert(inst->exec_size == 8);
+      sampler_simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+      response_regs = 4;
+   }
+
+   /* ... then let pre-gen5 hardware override it. */
+   if (devinfo->gen >= 5) {
+      ld_msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
+   } else {
+      /* We always use the SIMD16 message so that we only have to load U, and
+       * not V or R.
+       */
+      ld_msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
+      assert(inst->mlen == 3);
+      assert(inst->size_written == 8 * REG_SIZE);
+      response_regs = 8;
+      sampler_simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+   }
+
+   struct brw_reg g0_header = brw_vec8_grf(0, 0);
+   gen6_resolve_implied_move(p, &g0_header, inst->base_mrf);
+
+   brw_inst *ld = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_inst_set_compression(devinfo, ld, false);
+   brw_set_dest(p, ld, retype(dst, BRW_REGISTER_TYPE_UW));
+   brw_set_src0(p, ld, g0_header);
+   if (devinfo->gen < 6)
+      brw_inst_set_base_mrf(p->devinfo, ld, inst->base_mrf);
+
+   /* Our surface is set up as floats, regardless of what actual data is
+    * stored in it.
+    */
+   brw_set_sampler_message(p, ld,
+                           surface,
+                           0, /* sampler (unused) */
+                           ld_msg_type,
+                           response_regs,
+                           inst->mlen,
+                           inst->header_size != 0,
+                           sampler_simd_mode,
+                           BRW_SAMPLER_RETURN_FORMAT_FLOAT32);
+}
+
+/**
+ * Emit a varying-offset pull constant load as a header-less sampler LD
+ * message (gen7+).
+ *
+ * \p offset is the message payload.  An immediate \p index is encoded in the
+ * descriptor; otherwise its low 8 bits are loaded into a0.0 and an indirect
+ * send is used.
+ */
+void
+fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
+                                                       struct brw_reg dst,
+                                                       struct brw_reg index,
+                                                       struct brw_reg offset)
+{
+   assert(devinfo->gen >= 7);
+   /* Varying-offset pull constant loads are treated as a normal expression on
+    * gen7, so the fact that it's a send message is hidden at the IR level.
+    */
+   assert(inst->header_size == 0);
+   assert(!inst->mlen);
+   assert(index.type == BRW_REGISTER_TYPE_UD);
+
+   /* Message shape depends only on the execution size. */
+   const bool simd16 = inst->exec_size == 16;
+   assert(simd16 || inst->exec_size == 8);
+
+   const uint32_t payload_regs = simd16 ? 2 : 1;
+   const uint32_t response_regs = simd16 ? 8 : 4;
+   const uint32_t sampler_simd_mode = simd16 ? BRW_SAMPLER_SIMD_MODE_SIMD16
+                                             : BRW_SAMPLER_SIMD_MODE_SIMD8;
+
+   const struct brw_reg dst_uw = retype(dst, BRW_REGISTER_TYPE_UW);
+
+   if (index.file == BRW_IMMEDIATE_VALUE) {
+      const uint32_t surface = index.ud;
+
+      brw_inst *ld = brw_next_insn(p, BRW_OPCODE_SEND);
+      brw_set_dest(p, ld, dst_uw);
+      brw_set_src0(p, ld, offset);
+      brw_set_sampler_message(p, ld,
+                              surface,
+                              0, /* LD message ignores sampler unit */
+                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                              response_regs,
+                              payload_regs,
+                              false, /* no header */
+                              sampler_simd_mode,
+                              0);
+      return;
+   }
+
+   /* Dynamic surface index: route it through the address register. */
+   struct brw_reg a0 = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   /* a0.0 = surf_index & 0xff */
+   brw_inst *mask_index = brw_next_insn(p, BRW_OPCODE_AND);
+   brw_inst_set_exec_size(p->devinfo, mask_index, BRW_EXECUTE_1);
+   brw_set_dest(p, mask_index, a0);
+   brw_set_src0(p, mask_index, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
+   brw_set_src1(p, mask_index, brw_imm_ud(0x0ff));
+
+   brw_pop_insn_state(p);
+
+   /* dst = send(offset, a0.0 | <descriptor>) */
+   brw_inst *ld = brw_send_indirect_message(
+      p, BRW_SFID_SAMPLER, dst_uw, offset, a0);
+   brw_set_sampler_message(p, ld,
+                           0 /* surface */,
+                           0 /* sampler */,
+                           GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                           response_regs /* rlen */,
+                           payload_regs /* mlen */,
+                           false /* header */,
+                           sampler_simd_mode,
+                           0);
+}
+
+/**
+ * Cause the current pixel/sample mask to be transferred into the flag
+ * register f0.<inst->flag_subreg>.
+ *
+ * On Gen6 and above the mask is read as a UW from R1.7 (bits 15:0); on
+ * earlier generations it is read as a UW from R0.0.  The MOV is emitted with
+ * mask control disabled, and the previous default instruction state is
+ * restored afterwards.
+ */
+void
+fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
+{
+ struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
+ struct brw_reg dispatch_mask;
+
+ if (devinfo->gen >= 6)
+ dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
+ else
+ dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV(p, flags, dispatch_mask);
+ brw_pop_insn_state(p);
+}
+
+/**
+ * Emit a pixel interpolator query for \p inst.
+ *
+ * The destination is retyped to UW and the response length is derived from
+ * the number of whole registers the instruction writes.
+ *
+ * \param inst     IR instruction; supplies mlen, size_written and the
+ *                 noperspective flag.
+ * \param dst      destination register.
+ * \param src      source payload register.
+ * \param msg_data message data; must be of type UD.
+ * \param msg_type message type forwarded to brw_pixel_interpolator_query().
+ */
+void
+fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
+                                                struct brw_reg dst,
+                                                struct brw_reg src,
+                                                struct brw_reg msg_data,
+                                                unsigned msg_type)
+{
+   assert(inst->size_written % REG_SIZE == 0);
+   assert(msg_data.type == BRW_REGISTER_TYPE_UD);
+
+   const struct brw_reg dst_uw = retype(dst, BRW_REGISTER_TYPE_UW);
+   const unsigned response_length = inst->size_written / REG_SIZE;
+
+   brw_pixel_interpolator_query(p, dst_uw, src, inst->pi_noperspective,
+                                msg_type, msg_data, inst->mlen,
+                                response_length);
+}
+
+/**
+ * Emit dst = src0 + src1 with src1 read through a region of vstride=1,
+ * width=4, hstride=0.
+ *
+ * On Gen8+ or for an execution size of 8 a single ADD is emitted.  For an
+ * execution size of 16 on older hardware the ADD is split into two SIMD8
+ * halves, the second of which reads src1 at a sub-offset of 2.
+ *
+ * Any other execution size on pre-Gen8 hardware is unsupported: no code is
+ * emitted for it, and builds with assertions enabled now flag it instead of
+ * silently leaving dst unwritten.
+ *
+ * \param inst  IR instruction; only exec_size is consulted.
+ * \param dst   destination, of type D or UD.
+ * \param src0  first source, of type D or UD.
+ * \param src1  second source, accessed through the region described above.
+ */
+void
+fs_generator::generate_set_sample_id(fs_inst *inst,
+                                     struct brw_reg dst,
+                                     struct brw_reg src0,
+                                     struct brw_reg src1)
+{
+   assert(dst.type == BRW_REGISTER_TYPE_D ||
+          dst.type == BRW_REGISTER_TYPE_UD);
+   assert(src0.type == BRW_REGISTER_TYPE_D ||
+          src0.type == BRW_REGISTER_TYPE_UD);
+
+   struct brw_reg reg = stride(src1, 1, 4, 0);
+   if (devinfo->gen >= 8 || inst->exec_size == 8) {
+      brw_ADD(p, dst, src0, reg);
+   } else if (inst->exec_size == 16) {
+      brw_push_insn_state(p);
+      brw_set_default_exec_size(p, BRW_EXECUTE_8);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
+      brw_pop_insn_state(p);
+   } else {
+      /* Release builds keep the previous behaviour of emitting nothing. */
+      assert(!"Unsupported execution size for FS_OPCODE_SET_SAMPLE_ID");
+   }
+}
+
+/**
+ * Pack two floats into each 32-bit channel of \p dst as a pair of
+ * half-floats: \p y ends up in the upper 16 bits and \p x in the lower 16.
+ *
+ * Requires Gen7+, a UD destination and F sources.  Three instructions are
+ * emitted and their order matters: convert y into the low word, shift it
+ * into the high word, then convert x into the low word.
+ */
+void
+fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg x,
+ struct brw_reg y)
+{
+ assert(devinfo->gen >= 7);
+ assert(dst.type == BRW_REGISTER_TYPE_UD);
+ assert(x.type == BRW_REGISTER_TYPE_F);
+ assert(y.type == BRW_REGISTER_TYPE_F);
+
+ /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
+ *
+ * Because this instruction does not have a 16-bit floating-point type,
+ * the destination data type must be Word (W).
+ *
+ * The destination must be DWord-aligned and specify a horizontal stride
+ * (HorzStride) of 2. The 16-bit result is stored in the lower word of
+ * each destination channel and the upper word is not modified.
+ */
+ struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
+
+ /* Give each 32-bit channel of dst the form below, where "." means
+ * unchanged.
+ * 0x....hhhh
+ */
+ brw_F32TO16(p, dst_w, y);
+
+ /* Now the form:
+ * 0xhhhh0000
+ */
+ brw_SHL(p, dst, dst, brw_imm_ud(16u));
+
+ /* And, finally the form of packHalf2x16's output:
+ * 0xhhhhllll
+ */
+ brw_F32TO16(p, dst_w, x);
+}
+
+/**
+ * Convert one half-float of each 32-bit channel of \p src to a float.
+ *
+ * FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X converts the lower word of each channel
+ * and FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y the upper word.  Requires Gen7+, an
+ * F destination and a UD source.
+ */
+void
+fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
+                                              struct brw_reg dst,
+                                              struct brw_reg src)
+{
+   assert(devinfo->gen >= 7);
+   assert(dst.type == BRW_REGISTER_TYPE_F);
+   assert(src.type == BRW_REGISTER_TYPE_UD);
+
+   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
+    *
+    *   Because this instruction does not have a 16-bit floating-point type,
+    *   the source data type must be Word (W). The destination type must be
+    *   F (Float).
+    */
+   struct brw_reg half = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
+
+   assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
+          inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
+
+   /* Every channel of src looks like 0xhhhhllll.  The X variant reads the
+    * low word as-is; the Y variant needs the high word, i.e. a 16-bit
+    * (two byte) subregister offset.
+    */
+   const bool wants_high_word =
+      inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y;
+   if (wants_high_word)
+      half.subnr += 2;
+
+   brw_F16TO32(p, dst, half);
+}
+
+/**
+ * Emit the code for a shader-time add.
+ *
+ * The immediate \p offset is moved into the first register of \p payload
+ * and \p value into the second; brw_shader_time_add() is then issued on the
+ * payload against the shader_time_start binding table entry, which is also
+ * marked as used.  Requires Gen7+.
+ *
+ * \param inst     IR instruction being generated.
+ * \param payload  GRF holding the message payload.
+ * \param offset   immediate offset value.
+ * \param value    GRF (read as a scalar) or immediate value to add.
+ */
+void
+fs_generator::generate_shader_time_add(fs_inst *inst,
+                                       struct brw_reg payload,
+                                       struct brw_reg offset,
+                                       struct brw_reg value)
+{
+   assert(devinfo->gen >= 7);
+   brw_push_insn_state(p);
+   /* Same mask-control override, spelled with the named constant, as the
+    * other emitters in this file use.
+    */
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
+   struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
+                                          offset.type);
+   struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
+                                         value.type);
+
+   assert(offset.file == BRW_IMMEDIATE_VALUE);
+   if (value.file == BRW_GENERAL_REGISTER_FILE) {
+      /* Read the value through a scalar <0;1,0> region. */
+      value.width = BRW_WIDTH_1;
+      value.hstride = BRW_HORIZONTAL_STRIDE_0;
+      value.vstride = BRW_VERTICAL_STRIDE_0;
+   } else {
+      assert(value.file == BRW_IMMEDIATE_VALUE);
+   }
+
+   /* Trying to deal with setup of the params from the IR is crazy in the FS8
+    * case, and we don't really care about squeezing every bit of performance
+    * out of this path, so we just emit the MOVs from here.
+    */
+   brw_MOV(p, payload_offset, offset);
+   brw_MOV(p, payload_value, value);
+   brw_shader_time_add(p, payload,
+                       prog_data->binding_table.shader_time_start);
+   brw_pop_insn_state(p);
+
+   brw_mark_surface_used(prog_data,
+                         prog_data->binding_table.shader_time_start);
+}
+
+/**
+ * Turn on debug output for this generator and remember the name under which
+ * the shader is reported.
+ */
+void
+fs_generator::enable_debug(const char *shader_name)
+{
+   this->shader_name = shader_name;
+   this->debug_flag = true;
+}
+
+/**
+ * Generate native code for every instruction in \p cfg.
+ *
+ * The program is first padded with NOPs to a 64-byte boundary.  For each IR
+ * instruction the default instruction state (compression, group, access
+ * mode, predication, flag register, saturate, mask control, accumulator
+ * write control and execution size) is programmed before the instruction is
+ * dispatched to the matching emitter.  Afterwards brw_set_uip_jip() is run
+ * over the new code, the instructions are validated and compacted, and
+ * statistics are reported through the compiler's shader_debug_log callback
+ * (and, when debugging is enabled, to stderr together with an assembly
+ * dump).
+ *
+ * \param cfg             control flow graph holding the instructions to emit.
+ * \param dispatch_width  SIMD width of the shader; recorded in the generator
+ *                        and reported in the statistics.
+ * \return byte offset in the program store at which this shader starts.
+ */
+int
+fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
+{
+ /* align to 64 byte boundary. */
+ while (p->next_insn_offset % 64)
+ brw_NOP(p);
+
+ this->dispatch_width = dispatch_width;
+
+ int start_offset = p->next_insn_offset;
+ int spill_count = 0, fill_count = 0;
+ int loop_count = 0;
+
+ struct annotation_info annotation;
+ memset(&annotation, 0, sizeof(annotation));
+
+ foreach_block_and_inst (block, fs_inst, inst, cfg) {
+ struct brw_reg src[3], dst;
+ unsigned int last_insn_offset = p->next_insn_offset;
+ bool multiple_instructions_emitted = false;
+
+ /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
+ * "Register Region Restrictions" section: for BDW, SKL:
+ *
+ * "A POW/FDIV operation must not be followed by an instruction
+ * that requires two destination registers."
+ *
+ * The documentation is often lacking annotations for Atom parts,
+ * and empirically this affects CHV as well.
+ */
+ if (devinfo->gen >= 8 &&
+ p->nr_insn > 1 &&
+ brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH &&
+ brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
+ inst->dst.component_size(inst->exec_size) > REG_SIZE) {
+ brw_NOP(p);
+ last_insn_offset = p->next_insn_offset;
+ }
+
+ if (unlikely(debug_flag))
+ annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
+
+ /* If the instruction writes to more than one register, it needs to be
+ * explicitly marked as compressed on Gen <= 5. On Gen >= 6 the
+ * hardware figures out by itself what the right compression mode is,
+ * but we still need to know whether the instruction is compressed to
+ * set up the source register regions appropriately.
+ *
+ * XXX - This is wrong for instructions that write a single register but
+ * read more than one which should strictly speaking be treated as
+ * compressed. For instructions that don't write any registers it
+ * relies on the destination being a null register of the correct
+ * type and regioning so the instruction is considered compressed
+ * or not accordingly.
+ */
+ const bool compressed =
+ inst->dst.component_size(inst->exec_size) > REG_SIZE;
+ brw_set_default_compression(p, compressed);
+ brw_set_default_group(p, inst->group);
+
+ for (unsigned int i = 0; i < inst->sources; i++) {
+ src[i] = brw_reg_from_fs_reg(inst, &inst->src[i], devinfo->gen,
+ compressed);
+
+ /* The accumulator result appears to get used for the
+ * conditional modifier generation. When negating a UD
+ * value, there is a 33rd bit generated for the sign in the
+ * accumulator value, so now you can't check, for example,
+ * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
+ */
+ assert(!inst->conditional_mod ||
+ inst->src[i].type != BRW_REGISTER_TYPE_UD ||
+ !inst->src[i].negate);
+ }
+ dst = brw_reg_from_fs_reg(inst, &inst->dst, devinfo->gen, compressed);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_predicate_control(p, inst->predicate);
+ brw_set_default_predicate_inverse(p, inst->predicate_inverse);
+ brw_set_default_flag_reg(p, 0, inst->flag_subreg);
+ brw_set_default_saturate(p, inst->saturate);
+ brw_set_default_mask_control(p, inst->force_writemask_all);
+ brw_set_default_acc_write_control(p, inst->writes_accumulator);
+ brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
+
+ assert(inst->force_writemask_all || inst->exec_size >= 4);
+ assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
+ assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
+ assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
+
+ switch (inst->opcode) {
+ case BRW_OPCODE_MOV:
+ brw_MOV(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_ADD:
+ brw_ADD(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_MUL:
+ brw_MUL(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_AVG:
+ brw_AVG(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_MACH:
+ brw_MACH(p, dst, src[0], src[1]);
+ break;
+
+ case BRW_OPCODE_LINE:
+ brw_LINE(p, dst, src[0], src[1]);
+ break;
+
+ case BRW_OPCODE_MAD:
+ assert(devinfo->gen >= 6);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ brw_MAD(p, dst, src[0], src[1], src[2]);
+ break;
+
+ case BRW_OPCODE_LRP:
+ assert(devinfo->gen >= 6);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ brw_LRP(p, dst, src[0], src[1], src[2]);
+ break;
+
+ case BRW_OPCODE_FRC:
+ brw_FRC(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_RNDD:
+ brw_RNDD(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_RNDE:
+ brw_RNDE(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_RNDZ:
+ brw_RNDZ(p, dst, src[0]);
+ break;
+
+ case BRW_OPCODE_AND:
+ brw_AND(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_OR:
+ brw_OR(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_XOR:
+ brw_XOR(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_NOT:
+ brw_NOT(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_ASR:
+ brw_ASR(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_SHR:
+ brw_SHR(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_SHL:
+ brw_SHL(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_F32TO16:
+ assert(devinfo->gen >= 7);
+ brw_F32TO16(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_F16TO32:
+ assert(devinfo->gen >= 7);
+ brw_F16TO32(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_CMP:
+ if (inst->exec_size >= 16 && devinfo->gen == 7 && !devinfo->is_haswell &&
+ dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
+ /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
+ * implemented in the compiler is not sufficient. Overriding the
+ * type when the destination is the null register is necessary but
+ * not sufficient by itself.
+ */
+ assert(dst.nr == BRW_ARF_NULL);
+ dst.type = BRW_REGISTER_TYPE_D;
+ }
+ brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
+ break;
+ case BRW_OPCODE_SEL:
+ brw_SEL(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_BFREV:
+ assert(devinfo->gen >= 7);
+ /* BFREV only supports UD type for src and dst. */
+ brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
+ retype(src[0], BRW_REGISTER_TYPE_UD));
+ break;
+ case BRW_OPCODE_FBH:
+ assert(devinfo->gen >= 7);
+ /* FBH only supports UD type for dst. */
+ brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+ break;
+ case BRW_OPCODE_FBL:
+ assert(devinfo->gen >= 7);
+ /* FBL only supports UD type for dst. */
+ brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+ break;
+ case BRW_OPCODE_LZD:
+ brw_LZD(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_CBIT:
+ assert(devinfo->gen >= 7);
+ /* CBIT only supports UD type for dst. */
+ brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+ break;
+ case BRW_OPCODE_ADDC:
+ assert(devinfo->gen >= 7);
+ brw_ADDC(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_SUBB:
+ assert(devinfo->gen >= 7);
+ brw_SUBB(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_MAC:
+ brw_MAC(p, dst, src[0], src[1]);
+ break;
+
+ case BRW_OPCODE_BFE:
+ assert(devinfo->gen >= 7);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ brw_BFE(p, dst, src[0], src[1], src[2]);
+ break;
+
+ case BRW_OPCODE_BFI1:
+ assert(devinfo->gen >= 7);
+ brw_BFI1(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_BFI2:
+ assert(devinfo->gen >= 7);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ brw_BFI2(p, dst, src[0], src[1], src[2]);
+ break;
+
+ case BRW_OPCODE_IF:
+ if (inst->src[0].file != BAD_FILE) {
+ /* The instruction has an embedded compare (only allowed on gen6) */
+ assert(devinfo->gen == 6);
+ gen6_IF(p, inst->conditional_mod, src[0], src[1]);
+ } else {
+ brw_IF(p, brw_inst_exec_size(devinfo, p->current));
+ }
+ break;
+
+ case BRW_OPCODE_ELSE:
+ brw_ELSE(p);
+ break;
+ case BRW_OPCODE_ENDIF:
+ brw_ENDIF(p);
+ break;
+
+ case BRW_OPCODE_DO:
+ brw_DO(p, brw_inst_exec_size(devinfo, p->current));
+ break;
+
+ case BRW_OPCODE_BREAK:
+ brw_BREAK(p);
+ break;
+ case BRW_OPCODE_CONTINUE:
+ brw_CONT(p);
+ break;
+
+ case BRW_OPCODE_WHILE:
+ brw_WHILE(p);
+ loop_count++;
+ break;
+
+ case SHADER_OPCODE_RCP:
+ case SHADER_OPCODE_RSQ:
+ case SHADER_OPCODE_SQRT:
+ case SHADER_OPCODE_EXP2:
+ case SHADER_OPCODE_LOG2:
+ case SHADER_OPCODE_SIN:
+ case SHADER_OPCODE_COS:
+ assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
+ if (devinfo->gen >= 6) {
+ assert(inst->mlen == 0);
+ assert(devinfo->gen >= 7 || inst->exec_size == 8);
+ gen6_math(p, dst, brw_math_function(inst->opcode),
+ src[0], brw_null_reg());
+ } else {
+ assert(inst->mlen >= 1);
+ assert(devinfo->gen == 5 || devinfo->is_g4x || inst->exec_size == 8);
+ gen4_math(p, dst,
+ brw_math_function(inst->opcode),
+ inst->base_mrf, src[0],
+ BRW_MATH_PRECISION_FULL);
+ }
+ break;
+ case SHADER_OPCODE_INT_QUOTIENT:
+ case SHADER_OPCODE_INT_REMAINDER:
+ case SHADER_OPCODE_POW:
+ assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
+ if (devinfo->gen >= 6) {
+ assert(inst->mlen == 0);
+ assert((devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
+ inst->exec_size == 8);
+ gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
+ } else {
+ assert(inst->mlen >= 1);
+ assert(inst->exec_size == 8);
+ gen4_math(p, dst, brw_math_function(inst->opcode),
+ inst->base_mrf, src[0],
+ BRW_MATH_PRECISION_FULL);
+ }
+ break;
+ case FS_OPCODE_CINTERP:
+ brw_MOV(p, dst, src[0]);
+ break;
+ case FS_OPCODE_LINTERP:
+ generate_linterp(inst, dst, src);
+ break;
+ case FS_OPCODE_PIXEL_X:
+ assert(src[0].type == BRW_REGISTER_TYPE_UW);
+ src[0].subnr = 0 * type_sz(src[0].type);
+ brw_MOV(p, dst, stride(src[0], 8, 4, 1));
+ break;
+ case FS_OPCODE_PIXEL_Y:
+ assert(src[0].type == BRW_REGISTER_TYPE_UW);
+ src[0].subnr = 4 * type_sz(src[0].type);
+ brw_MOV(p, dst, stride(src[0], 8, 4, 1));
+ break;
+ case FS_OPCODE_GET_BUFFER_SIZE:
+ generate_get_buffer_size(inst, dst, src[0], src[1]);
+ break;
+ case SHADER_OPCODE_TEX:
+ case FS_OPCODE_TXB:
+ case SHADER_OPCODE_TXD:
+ case SHADER_OPCODE_TXF:
+ case SHADER_OPCODE_TXF_LZ:
+ case SHADER_OPCODE_TXF_CMS:
+ case SHADER_OPCODE_TXF_CMS_W:
+ case SHADER_OPCODE_TXF_UMS:
+ case SHADER_OPCODE_TXF_MCS:
+ case SHADER_OPCODE_TXL:
+ case SHADER_OPCODE_TXL_LZ:
+ case SHADER_OPCODE_TXS:
+ case SHADER_OPCODE_LOD:
+ case SHADER_OPCODE_TG4:
+ case SHADER_OPCODE_TG4_OFFSET:
+ case SHADER_OPCODE_SAMPLEINFO:
+ generate_tex(inst, dst, src[0], src[1], src[2]);
+ break;
+ case FS_OPCODE_DDX_COARSE:
+ case FS_OPCODE_DDX_FINE:
+ generate_ddx(inst->opcode, dst, src[0]);
+ break;
+ case FS_OPCODE_DDY_COARSE:
+ case FS_OPCODE_DDY_FINE:
+ generate_ddy(inst->opcode, dst, src[0]);
+ break;
+
+ case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+ generate_scratch_write(inst, src[0]);
+ spill_count++;
+ break;
+
+ case SHADER_OPCODE_GEN4_SCRATCH_READ:
+ generate_scratch_read(inst, dst);
+ fill_count++;
+ break;
+
+ case SHADER_OPCODE_GEN7_SCRATCH_READ:
+ generate_scratch_read_gen7(inst, dst);
+ fill_count++;
+ break;
+
+ case SHADER_OPCODE_MOV_INDIRECT:
+ generate_mov_indirect(inst, dst, src[0], src[1]);
+ break;
+
+ case SHADER_OPCODE_URB_READ_SIMD8:
+ case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
+ generate_urb_read(inst, dst, src[0]);
+ break;
+
+ case SHADER_OPCODE_URB_WRITE_SIMD8:
+ case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+ case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+ case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+ generate_urb_write(inst, src[0]);
+ break;
+
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+ assert(inst->force_writemask_all);
+ generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
+ break;
+
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+ assert(inst->force_writemask_all);
+ generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+ break;
+
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
+ generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
+ break;
+
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+ generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+ break;
+
+ case FS_OPCODE_REP_FB_WRITE:
+ case FS_OPCODE_FB_WRITE:
+ generate_fb_write(inst, src[0]);
+ break;
+
+ case FS_OPCODE_FB_READ:
+ generate_fb_read(inst, dst, src[0]);
+ break;
+
+ case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
+ generate_mov_dispatch_to_flags(inst);
+ break;
+
+ case FS_OPCODE_DISCARD_JUMP:
+ generate_discard_jump(inst);
+ break;
+
+ case SHADER_OPCODE_SHADER_TIME_ADD:
+ generate_shader_time_add(inst, src[0], src[1], src[2]);
+ break;
+
+ case SHADER_OPCODE_UNTYPED_ATOMIC:
+ assert(src[2].file == BRW_IMMEDIATE_VALUE);
+ brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud,
+ inst->mlen, !inst->dst.is_null());
+ break;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+ assert(src[2].file == BRW_IMMEDIATE_VALUE);
+ brw_untyped_surface_read(p, dst, src[0], src[1],
+ inst->mlen, src[2].ud);
+ break;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+ assert(src[2].file == BRW_IMMEDIATE_VALUE);
+ brw_untyped_surface_write(p, src[0], src[1],
+ inst->mlen, src[2].ud);
+ break;
+
+ case SHADER_OPCODE_TYPED_ATOMIC:
+ assert(src[2].file == BRW_IMMEDIATE_VALUE);
+ brw_typed_atomic(p, dst, src[0], src[1],
+ src[2].ud, inst->mlen, !inst->dst.is_null());
+ break;
+
+ case SHADER_OPCODE_TYPED_SURFACE_READ:
+ assert(src[2].file == BRW_IMMEDIATE_VALUE);
+ brw_typed_surface_read(p, dst, src[0], src[1],
+ inst->mlen, src[2].ud);
+ break;
+
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+ assert(src[2].file == BRW_IMMEDIATE_VALUE);
+ brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].ud);
+ break;
+
+ case SHADER_OPCODE_MEMORY_FENCE:
+ brw_memory_fence(p, dst);
+ break;
+
+ case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
+ const struct brw_reg mask =
+ brw_stage_has_packed_dispatch(devinfo, stage,
+ prog_data) ? brw_imm_ud(~0u) :
+ stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
+ brw_dmask_reg();
+ brw_find_live_channel(p, dst, mask);
+ break;
+ }
+
+ case SHADER_OPCODE_BROADCAST:
+ assert(inst->force_writemask_all);
+ brw_broadcast(p, dst, src[0], src[1]);
+ break;
+
+ case FS_OPCODE_SET_SAMPLE_ID:
+ generate_set_sample_id(inst, dst, src[0], src[1]);
+ break;
+
+ case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+ generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
+ break;
+
+ case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
+ case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
+ generate_unpack_half_2x16_split(inst, dst, src[0]);
+ break;
+
+ case FS_OPCODE_PLACEHOLDER_HALT:
+ /* This is the place where the final HALT needs to be inserted if
+ * we've emitted any discards. If not, this will emit no code.
+ */
+ if (!patch_discard_jumps_to_fb_writes()) {
+ if (unlikely(debug_flag)) {
+ annotation.ann_count--;
+ }
+ }
+ break;
+
+ case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
+ generate_pixel_interpolator_query(inst, dst, src[0], src[1],
+ GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
+ break;
+
+ case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
+ generate_pixel_interpolator_query(inst, dst, src[0], src[1],
+ GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
+ break;
+
+ case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+ generate_pixel_interpolator_query(inst, dst, src[0], src[1],
+ GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
+ break;
+
+ case CS_OPCODE_CS_TERMINATE:
+ generate_cs_terminate(inst, src[0]);
+ break;
+
+ case SHADER_OPCODE_BARRIER:
+ generate_barrier(inst, src[0]);
+ break;
+
+ case BRW_OPCODE_DIM:
+ assert(devinfo->is_haswell);
+ assert(src[0].type == BRW_REGISTER_TYPE_DF);
+ assert(dst.type == BRW_REGISTER_TYPE_DF);
+ brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
+ break;
+
+ default:
+ unreachable("Unsupported opcode");
+
+ case SHADER_OPCODE_LOAD_PAYLOAD:
+ unreachable("Should be lowered by lower_load_payload()");
+ }
+
+ if (multiple_instructions_emitted)
+ continue;
+
+ /* These controls are patched into the instruction that was just emitted,
+ * which is only well-defined when exactly one 16-byte instruction was
+ * generated for this IR instruction.
+ */
+ if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
+ assert(p->next_insn_offset == last_insn_offset + 16 ||
+ !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
+ "emitting more than 1 instruction");
+
+ brw_inst *last = &p->store[last_insn_offset / 16];
+
+ if (inst->conditional_mod)
+ brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
+ brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
+ brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
+ }
+ }
+
+ brw_set_uip_jip(p, start_offset);
+ annotation_finalize(&annotation, p->next_insn_offset);
+
+ /* Builds with assertions enabled always validate (and assert on the result
+ * further down); other builds validate only when debugging was requested.
+ */
+#ifndef NDEBUG
+ bool validated = brw_validate_instructions(p, start_offset, &annotation);
+#else
+ if (unlikely(debug_flag))
+ brw_validate_instructions(p, start_offset, &annotation);
+#endif
+
+ int before_size = p->next_insn_offset - start_offset;
+ brw_compact_instructions(p, start_offset, annotation.ann_count,
+ annotation.ann);
+ int after_size = p->next_insn_offset - start_offset;
+
+ if (unlikely(debug_flag)) {
+ fprintf(stderr, "Native code for %s\n"
+ "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
+ " bytes (%.0f%%)\n",
+ shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count,
+ spill_count, fill_count, promoted_constants, before_size, after_size,
+ 100.0f * (before_size - after_size) / before_size);
+
+ dump_assembly(p->store, annotation.ann_count, annotation.ann,
+ p->devinfo);
+ ralloc_free(annotation.mem_ctx);
+ }
+ assert(validated);
+
+ compiler->shader_debug_log(log_data,
+ "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
+ "%d:%d spills:fills, Promoted %u constants, "
+ "compacted %d to %d bytes.",
+ _mesa_shader_stage_to_abbrev(stage),
+ dispatch_width, before_size / 16,
+ loop_count, cfg->cycle_count, spill_count,
+ fill_count, promoted_constants, before_size,
+ after_size);
+
+ return start_offset;
+}
+
+const unsigned *
+fs_generator::get_assembly(unsigned int *assembly_size)
+{
+ return brw_get_program(p, assembly_size);
+}
diff --git a/src/intel/compiler/brw_fs_live_variables.cpp b/src/intel/compiler/brw_fs_live_variables.cpp
new file mode 100644
index 00000000000..c449672a519
--- /dev/null
+++ b/src/intel/compiler/brw_fs_live_variables.cpp
@@ -0,0 +1,334 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Eric Anholt <[email protected]>
+ *
+ */
+
+#include "brw_cfg.h"
+#include "brw_fs_live_variables.h"
+
+using namespace brw;
+
+#define MAX_INSTRUCTION (1 << 30)
+
+/** @file brw_fs_live_variables.cpp
+ *
+ * Support for calculating liveness information about virtual GRFs.
+ *
+ * This produces a live interval for each whole virtual GRF. We could
+ * choose to expose per-component live intervals for VGRFs of size > 1,
+ * but we currently do not. It is easier for the consumers of this
+ * information to work with whole VGRFs.
+ *
+ * However, we internally track use/def information at the per-GRF level for
+ * greater accuracy. Large VGRFs may be accessed piecemeal over many
+ * (possibly non-adjacent) instructions. In this case, examining a single
+ * instruction is insufficient to decide whether a whole VGRF is ultimately
+ * used or defined. Tracking individual components allows us to easily
+ * assemble this information.
+ *
+ * See Muchnick's Advanced Compiler Design and Implementation, section
+ * 14.1 (p444).
+ */
+
+/**
+ * Record a read of the single-register variable addressed by \p reg at
+ * instruction \p ip in block \p bd.
+ *
+ * The variable's [start, end] range is widened to include \p ip, and the
+ * variable is flagged in use[] unless the block has already completely
+ * defined it.
+ */
+void
+fs_live_variables::setup_one_read(struct block_data *bd, fs_inst *inst,
+                                  int ip, const fs_reg &reg)
+{
+   const int idx = var_from_reg(reg);
+   assert(idx < num_vars);
+
+   if (ip < start[idx])
+      start[idx] = ip;
+   if (ip > end[idx])
+      end[idx] = ip;
+
+   /* use[] only tracks variables (VGRF channels) the block consumes without
+    * having fully defined them itself first.
+    */
+   const bool defined_in_block = BITSET_TEST(bd->def, idx);
+   if (!defined_in_block)
+      BITSET_SET(bd->use, idx);
+}
+
+/**
+ * Record a write of the single-register variable addressed by \p reg at
+ * instruction \p ip in block \p bd.
+ *
+ * The variable's [start, end] range is widened to include \p ip.  The
+ * variable is flagged in def[] only when \p inst writes a VGRF, is not a
+ * partial write, and the block has not already used the variable.
+ */
+void
+fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst,
+                                   int ip, const fs_reg &reg)
+{
+   const int idx = var_from_reg(reg);
+   assert(idx < num_vars);
+
+   if (ip < start[idx])
+      start[idx] = ip;
+   if (ip > end[idx])
+      end[idx] = ip;
+
+   /* def[] marks an initialization that completely screens off earlier
+    * updates of the variable (VGRF channel), so anything short of a full
+    * VGRF write does not qualify.
+    */
+   if (inst->dst.file != VGRF || inst->is_partial_write())
+      return;
+
+   if (!BITSET_TEST(bd->use, idx))
+      BITSET_SET(bd->def, idx);
+}
+
+/**
+ * Sets up the use[] and def[] bitsets.
+ *
+ * The basic-block-level live variable analysis needs to know which
+ * variables get used before they're completely defined, and which
+ * variables are completely defined before they're used.
+ *
+ * These are tracked at the per-component level, rather than whole VGRFs.
+ *
+ * Instructions are numbered consecutively across blocks (ip), and the same
+ * walk also fills in the per-block flag_use/flag_def words.  Within one
+ * instruction, reads are recorded before writes.
+ */
+void
+fs_live_variables::setup_def_use()
+{
+ int ip = 0;
+
+ foreach_block (block, cfg) {
+ assert(ip == block->start_ip);
+ if (block->num > 0)
+ assert(cfg->blocks[block->num - 1]->end_ip == ip - 1);
+
+ struct block_data *bd = &block_data[block->num];
+
+ foreach_inst_in_block(fs_inst, inst, block) {
+ /* Set use[] for this instruction */
+ for (unsigned int i = 0; i < inst->sources; i++) {
+ fs_reg reg = inst->src[i];
+
+ if (reg.file != VGRF)
+ continue;
+
+ /* One variable per register read; step through them one REG_SIZE
+ * at a time.
+ */
+ for (unsigned j = 0; j < regs_read(inst, i); j++) {
+ setup_one_read(bd, inst, ip, reg);
+ reg.offset += REG_SIZE;
+ }
+ }
+
+ /* Flag bits read here that the block has not already defined count as
+ * used.
+ */
+ bd->flag_use[0] |= inst->flags_read(v->devinfo) & ~bd->flag_def[0];
+
+ /* Set def[] for this instruction */
+ if (inst->dst.file == VGRF) {
+ fs_reg reg = inst->dst;
+ for (unsigned j = 0; j < regs_written(inst); j++) {
+ setup_one_write(bd, inst, ip, reg);
+ reg.offset += REG_SIZE;
+ }
+ }
+
+ /* Only unpredicated instructions at least 8 channels wide are treated
+ * as defining the flag bits they write, and only for bits the block
+ * has not already used.
+ */
+ if (!inst->predicate && inst->exec_size >= 8)
+ bd->flag_def[0] |= inst->flags_written() & ~bd->flag_use[0];
+
+ ip++;
+ }
+ }
+}
+
+/**
+ * Iterate the livein/liveout sets of every block to a fixed point.
+ *
+ * Blocks are visited in reverse order.  Each pass can only add bits to
+ * liveout and livein, and the iteration stops after the first pass that adds
+ * none, so it is guaranteed to terminate.  The flag words are propagated
+ * alongside the variable bitsets.
+ */
+void
+fs_live_variables::compute_live_variables()
+{
+   bool progress;
+
+   do {
+      progress = false;
+
+      foreach_block_reverse (block, cfg) {
+         struct block_data *cur = &block_data[block->num];
+
+         /* liveout(block) |= livein(child) for every child of the block. */
+         foreach_list_typed(bblock_link, succ_link, link, &block->children) {
+            struct block_data *succ = &block_data[succ_link->block->num];
+
+            for (int w = 0; w < bitset_words; w++) {
+               const BITSET_WORD added = succ->livein[w] & ~cur->liveout[w];
+               if (added != 0) {
+                  cur->liveout[w] |= added;
+                  progress = true;
+               }
+            }
+
+            const BITSET_WORD flag_added =
+               succ->flag_livein[0] & ~cur->flag_liveout[0];
+            if (flag_added != 0) {
+               cur->flag_liveout[0] |= flag_added;
+               progress = true;
+            }
+         }
+
+         /* livein(block) |= use(block) | (liveout(block) & ~def(block)). */
+         for (int w = 0; w < bitset_words; w++) {
+            const BITSET_WORD wanted =
+               cur->use[w] | (cur->liveout[w] & ~cur->def[w]);
+            if ((wanted & ~cur->livein[w]) != 0) {
+               cur->livein[w] |= wanted;
+               progress = true;
+            }
+         }
+
+         const BITSET_WORD flag_wanted =
+            cur->flag_use[0] | (cur->flag_liveout[0] & ~cur->flag_def[0]);
+         if ((flag_wanted & ~cur->flag_livein[0]) != 0) {
+            cur->flag_livein[0] |= flag_wanted;
+            progress = true;
+         }
+      }
+   } while (progress);
+}
+
+/**
+ * Widen each variable's [start, end] range using the block-level liveness:
+ * a variable live into a block covers the block's first instruction, and a
+ * variable live out of a block covers its last instruction.
+ */
+void
+fs_live_variables::compute_start_end()
+{
+   foreach_block (block, cfg) {
+      struct block_data *bd = &block_data[block->num];
+
+      for (int var = 0; var < num_vars; var++) {
+         if (BITSET_TEST(bd->livein, var)) {
+            if (block->start_ip < start[var])
+               start[var] = block->start_ip;
+            if (block->start_ip > end[var])
+               end[var] = block->start_ip;
+         }
+
+         if (BITSET_TEST(bd->liveout, var)) {
+            if (block->end_ip < start[var])
+               start[var] = block->end_ip;
+            if (block->end_ip > end[var])
+               end[var] = block->end_ip;
+         }
+      }
+   }
+}
+
+/**
+ * Build the liveness information for \p cfg.
+ *
+ * Every register of every VGRF allocated in \p v becomes one variable.  All
+ * storage is allocated out of a private ralloc context, after which the
+ * use/def sets, the block-level livein/liveout sets and the per-variable
+ * start/end ranges are computed, in that order.
+ */
+fs_live_variables::fs_live_variables(fs_visitor *v, const cfg_t *cfg)
+   : v(v), cfg(cfg)
+{
+   mem_ctx = ralloc_context(NULL);
+
+   /* Give each VGRF a contiguous run of variable indices, one per register
+    * of its allocated size.
+    */
+   num_vgrfs = v->alloc.count;
+   num_vars = 0;
+   var_from_vgrf = rzalloc_array(mem_ctx, int, num_vgrfs);
+   for (int vgrf = 0; vgrf < num_vgrfs; vgrf++) {
+      var_from_vgrf[vgrf] = num_vars;
+      num_vars += v->alloc.sizes[vgrf];
+   }
+
+   /* And the reverse mapping, from variable index back to its VGRF. */
+   vgrf_from_var = rzalloc_array(mem_ctx, int, num_vars);
+   for (int vgrf = 0; vgrf < num_vgrfs; vgrf++) {
+      const int first_var = var_from_vgrf[vgrf];
+      for (unsigned reg = 0; reg < v->alloc.sizes[vgrf]; reg++)
+         vgrf_from_var[first_var + reg] = vgrf;
+   }
+
+   /* Ranges start out empty: start past any instruction, end before any. */
+   start = ralloc_array(mem_ctx, int, num_vars);
+   end = rzalloc_array(mem_ctx, int, num_vars);
+   for (int var = 0; var < num_vars; var++) {
+      start[var] = MAX_INSTRUCTION;
+      end[var] = -1;
+   }
+
+   block_data = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks);
+
+   bitset_words = BITSET_WORDS(num_vars);
+   for (int b = 0; b < cfg->num_blocks; b++) {
+      struct block_data *bd = &block_data[b];
+
+      bd->def = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+      bd->use = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+      bd->livein = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+      bd->liveout = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+
+      bd->flag_def[0] = 0;
+      bd->flag_use[0] = 0;
+      bd->flag_livein[0] = 0;
+      bd->flag_liveout[0] = 0;
+   }
+
+   setup_def_use();
+   compute_live_variables();
+   compute_start_end();
+}
+
+/** Release the private ralloc context created by the constructor. */
+fs_live_variables::~fs_live_variables()
+{
+ ralloc_free(mem_ctx);
+}
+
+/**
+ * Discard the cached liveness information.
+ *
+ * The pointer is reset to NULL so that the next call to
+ * calculate_live_intervals() recomputes it instead of returning early.
+ */
+void
+fs_visitor::invalidate_live_intervals()
+{
+ ralloc_free(live_intervals);
+ live_intervals = NULL;
+}
+
+/**
+ * Compute the live interval of every virtual GRF.
+ *
+ * Nothing is done if live intervals are already available.  Otherwise the
+ * per-register analysis is run and its ranges are merged so that each VGRF
+ * spans from the earliest start to the latest end of any of its registers.
+ */
+void
+fs_visitor::calculate_live_intervals()
+{
+   if (this->live_intervals)
+      return;
+
+   const int vgrf_count = this->alloc.count;
+
+   ralloc_free(this->virtual_grf_start);
+   ralloc_free(this->virtual_grf_end);
+   this->virtual_grf_start = ralloc_array(mem_ctx, int, vgrf_count);
+   this->virtual_grf_end = ralloc_array(mem_ctx, int, vgrf_count);
+
+   /* Start every VGRF with an empty range. */
+   for (int vgrf = 0; vgrf < vgrf_count; vgrf++) {
+      virtual_grf_start[vgrf] = MAX_INSTRUCTION;
+      virtual_grf_end[vgrf] = -1;
+   }
+
+   this->live_intervals = new(mem_ctx) fs_live_variables(this, cfg);
+
+   /* Fold each per-register range into the range of the VGRF owning it. */
+   for (int var = 0; var < live_intervals->num_vars; var++) {
+      const int vgrf = live_intervals->vgrf_from_var[var];
+      const int first_ip = live_intervals->start[var];
+      const int last_ip = live_intervals->end[var];
+
+      if (first_ip < virtual_grf_start[vgrf])
+         virtual_grf_start[vgrf] = first_ip;
+      if (last_ip > virtual_grf_end[vgrf])
+         virtual_grf_end[vgrf] = last_ip;
+   }
+}
+
+/**
+ * Return whether the live ranges of variables \p a and \p b overlap.
+ *
+ * Ranges that merely touch, with one ending at the instruction where the
+ * other starts, are not considered to interfere.
+ */
+bool
+fs_live_variables::vars_interfere(int a, int b)
+{
+   const bool b_ends_after_a_starts = end[b] > start[a];
+   const bool a_ends_after_b_starts = end[a] > start[b];
+
+   return b_ends_after_a_starts && a_ends_after_b_starts;
+}
+
+/**
+ * Return whether the live intervals of virtual GRFs \p a and \p b overlap.
+ *
+ * Intervals that merely touch, with one ending at the instruction where the
+ * other starts, are not considered to interfere.
+ */
+bool
+fs_visitor::virtual_grf_interferes(int a, int b)
+{
+   const bool a_ends_after_b_starts = virtual_grf_end[a] > virtual_grf_start[b];
+   const bool b_ends_after_a_starts = virtual_grf_end[b] > virtual_grf_start[a];
+
+   return a_ends_after_b_starts && b_ends_after_a_starts;
+}
diff --git a/src/intel/compiler/brw_fs_live_variables.h b/src/intel/compiler/brw_fs_live_variables.h
new file mode 100644
index 00000000000..91d1e42cbc1
--- /dev/null
+++ b/src/intel/compiler/brw_fs_live_variables.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Eric Anholt <[email protected]>
+ *
+ */
+
+#include "brw_fs.h"
+#include "util/bitset.h"
+
+struct cfg_t;
+
+namespace brw {
+/**
+ * Liveness bitsets kept for a single basic block.
+ *
+ * The pointer members are arrays of BITSET_WORDs; the flag_* members are
+ * fixed single-word bitsets stored inline.
+ */
+struct block_data {
+ /**
+ * Which variables are defined before being used in the block.
+ *
+ * Note that for our purposes, "defined" means unconditionally, completely
+ * defined.
+ */
+ BITSET_WORD *def;
+
+ /**
+ * Which variables are used before being defined in the block.
+ */
+ BITSET_WORD *use;
+
+ /** Which defs reach the entry point of the block. */
+ BITSET_WORD *livein;
+
+ /** Which defs reach the exit point of the block. */
+ BITSET_WORD *liveout;
+
+ BITSET_WORD flag_def[1]; /**< presumably the flag-register analogue of def -- TODO confirm */
+ BITSET_WORD flag_use[1]; /**< presumably the flag-register analogue of use -- TODO confirm */
+ BITSET_WORD flag_livein[1]; /**< presumably the flag-register analogue of livein -- TODO confirm */
+ BITSET_WORD flag_liveout[1]; /**< presumably the flag-register analogue of liveout -- TODO confirm */
+};
+/**
+ * Live-variable analysis for an fs_visitor program.
+ *
+ * A "var" here is one component of a virtual GRF, so a VGRF maps to a
+ * contiguous run of vars (see var_from_vgrf / vgrf_from_var).  The
+ * constructor runs the whole analysis; afterwards start[] and end[] hold the
+ * live range of each var.
+ */
+class fs_live_variables {
+public:
+ DECLARE_RALLOC_CXX_OPERATORS(fs_live_variables)
+
+ fs_live_variables(fs_visitor *v, const cfg_t *cfg);
+ ~fs_live_variables();
+ /**
+ * True when the live ranges of vars a and b overlap.  A range ending at
+ * the ip where the other one starts is not counted as an overlap.
+ */
+ bool vars_interfere(int a, int b);
+ int var_from_reg(const fs_reg &reg) const /* var index of the REG_SIZE-sized unit that reg points at */
+ {
+ return var_from_vgrf[reg.nr] + reg.offset / REG_SIZE;
+ }
+
+ /** Map from virtual GRF number to index in block_data arrays. */
+ int *var_from_vgrf;
+
+ /**
+ * Map from any index in block_data to the virtual GRF containing it.
+ *
+ * For alloc.sizes of [1, 2, 3], vgrf_from_var would contain
+ * [0, 1, 1, 2, 2, 2].
+ */
+ int *vgrf_from_var;
+
+ int num_vars; /**< number of entries in start[], end[] and vgrf_from_var[] */
+ int num_vgrfs; /**< presumably the VGRF count at analysis time -- TODO confirm */
+ int bitset_words; /**< number of BITSET_WORDs allocated for each per-block bitset */
+
+ /** @{
+ * Final computed live ranges for each var (each component of each virtual
+ * GRF).
+ */
+ int *start;
+ int *end;
+ /** @} */
+
+ /** Per-basic-block information on live variables */
+ struct block_data *block_data;
+
+protected:
+ void setup_def_use();
+ void setup_one_read(struct block_data *bd, fs_inst *inst, int ip,
+ const fs_reg &reg);
+ void setup_one_write(struct block_data *bd, fs_inst *inst, int ip,
+ const fs_reg &reg);
+ void compute_live_variables();
+ void compute_start_end();
+
+ fs_visitor *v; /* presumably the visitor handed to the constructor -- TODO confirm */
+ const cfg_t *cfg; /* presumably the CFG handed to the constructor -- TODO confirm */
+ void *mem_ctx; /* ralloc context for this object's allocations; freed by the destructor */
+
+};
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_fs_lower_d2x.cpp b/src/intel/compiler/brw_fs_lower_d2x.cpp
new file mode 100644
index 00000000000..a2db1154615
--- /dev/null
+++ b/src/intel/compiler/brw_fs_lower_d2x.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2015 Connor Abbott
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_fs_builder.h"
+
+using namespace brw;
+
+/**
+ * Split every MOV that converts a 64-bit source (DF, Q or UQ) to a 32-bit
+ * destination (F, D or UD) into two MOVs that go through a 64-bit-typed
+ * temporary.
+ *
+ * Returns true if at least one instruction was rewritten, in which case the
+ * cached live intervals have been invalidated.
+ */
+bool
+fs_visitor::lower_d2x()
+{
+   bool lowered_any = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      if (inst->opcode != BRW_OPCODE_MOV)
+         continue;
+
+      const bool dst_is_32bit = inst->dst.type == BRW_REGISTER_TYPE_F ||
+                                inst->dst.type == BRW_REGISTER_TYPE_D ||
+                                inst->dst.type == BRW_REGISTER_TYPE_UD;
+      if (!dst_is_32bit)
+         continue;
+
+      const bool src_is_64bit = inst->src[0].type == BRW_REGISTER_TYPE_DF ||
+                                inst->src[0].type == BRW_REGISTER_TYPE_UQ ||
+                                inst->src[0].type == BRW_REGISTER_TYPE_Q;
+      if (!src_is_64bit)
+         continue;
+
+      assert(inst->dst.file == VGRF);
+      assert(inst->saturate == false);
+      const fs_reg final_dst = inst->dst;
+
+      const fs_builder ibld(this, block, inst);
+
+      /* The Broadwell PRM (3D Media GPGPU, "Double Precision Float to
+       * Single Precision Float") states:
+       *
+       *    The upper Dword of every Qword will be written with undefined
+       *    value when converting DF to F.
+       *
+       * Hence the conversion lands in a temporary of the source's 64-bit
+       * type, written through a view that addresses only the low DWord of
+       * each Qword, and a second MOV packs those DWords into the real
+       * destination.
+       */
+      fs_reg wide_tmp = ibld.vgrf(inst->src[0].type, 1);
+      fs_reg low_dwords = subscript(wide_tmp, inst->dst.type, 0);
+      ibld.MOV(low_dwords, inst->src[0]);
+      ibld.MOV(final_dst, low_dwords);
+
+      inst->remove(block);
+      lowered_any = true;
+   }
+
+   if (lowered_any)
+      invalidate_live_intervals();
+
+   return lowered_any;
+}
diff --git a/src/intel/compiler/brw_fs_lower_pack.cpp b/src/intel/compiler/brw_fs_lower_pack.cpp
new file mode 100644
index 00000000000..7afaae095bd
--- /dev/null
+++ b/src/intel/compiler/brw_fs_lower_pack.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright © 2015 Connor Abbott
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_fs_builder.h"
+
+using namespace brw;
+
+/**
+ * Replace each FS_OPCODE_PACK with one MOV per source, where source i is
+ * written to the i-th subscript (in the source's own type) of the PACK
+ * destination.
+ *
+ * Returns true if anything was lowered; the cached live intervals are
+ * invalidated in that case.
+ */
+bool
+fs_visitor::lower_pack()
+{
+   bool lowered_any = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      if (inst->opcode != FS_OPCODE_PACK)
+         continue;
+
+      assert(inst->dst.file == VGRF);
+      assert(inst->saturate == false);
+      const fs_reg packed = inst->dst;
+
+      const fs_builder ibld(this, block, inst);
+      for (unsigned idx = 0; idx < inst->sources; idx++) {
+         const fs_reg &piece = inst->src[idx];
+         ibld.MOV(subscript(packed, piece.type, idx), piece);
+      }
+
+      inst->remove(block);
+      lowered_any = true;
+   }
+
+   if (lowered_any)
+      invalidate_live_intervals();
+
+   return lowered_any;
+}
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
new file mode 100644
index 00000000000..d403dec5357
--- /dev/null
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -0,0 +1,4679 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/glsl/ir.h"
+#include "brw_fs.h"
+#include "brw_fs_surface_builder.h"
+#include "brw_nir.h"
+
+using namespace brw;
+using namespace brw::surface_access;
+
+/**
+ * Top-level entry point for translating the NIR shader: set up outputs,
+ * uniforms and system values, then emit the body of the (single) "main"
+ * function.
+ */
+void
+fs_visitor::emit_nir_code()
+{
+   /* Create the arrays backing inputs and outputs first; load/store
+    * intrinsics are turned into reads and writes of these arrays.
+    */
+   this->nir_setup_outputs();
+   this->nir_setup_uniforms();
+   this->nir_emit_system_values();
+
+   /* The only function expected in the shader is main; emit it. */
+   nir_foreach_function(func, nir) {
+      assert(strcmp(func->name, "main") == 0);
+      assert(func->impl);
+      this->nir_emit_impl(func->impl);
+   }
+}
+
+/**
+ * Allocate a VGRF for every NIR output variable and record, for each vec4
+ * slot the variable covers, where in that VGRF the slot lives.  Slots in
+ * outputs[] that were already assigned are left untouched.
+ *
+ * Nothing is done for tessellation-control and fragment shaders.
+ */
+void
+fs_visitor::nir_setup_outputs()
+{
+   const bool skip_stage = stage == MESA_SHADER_TESS_CTRL ||
+                           stage == MESA_SHADER_FRAGMENT;
+   if (skip_stage)
+      return;
+
+   nir_foreach_variable(var, &nir->outputs) {
+      unsigned vec4s;
+      if (var->data.compact)
+         vec4s = DIV_ROUND_UP(glsl_get_length(var->type), 4);
+      else
+         vec4s = type_size_vec4(var->type);
+
+      fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * vec4s);
+
+      for (unsigned slot_idx = 0; slot_idx < vec4s; slot_idx++) {
+         fs_reg &slot = outputs[var->data.driver_location + slot_idx];
+         if (slot.file == BAD_FILE)
+            slot = offset(reg, bld, 4 * slot_idx);
+      }
+   }
+}
+
+/**
+ * Record the uniform count (nir->num_uniforms / 4), but only when compiling
+ * at the minimum dispatch width; for any other width this does nothing.
+ */
+void
+fs_visitor::nir_setup_uniforms()
+{
+   if (dispatch_width == min_dispatch_width)
+      uniforms = nir->num_uniforms / 4;
+}
+
+/**
+ * Walk one NIR block looking for system-value load intrinsics.  For each one
+ * whose entry in v->nir_system_values[] is still unset (BAD_FILE), emit the
+ * code that produces the value and store the resulting register there.
+ *
+ * Always returns true.
+ */
+static bool
+emit_system_values_block(nir_block *block, fs_visitor *v)
+{
+   nir_foreach_instr(instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *const intrin = nir_instr_as_intrinsic(instr);
+
+      switch (intrin->intrinsic) {
+      case nir_intrinsic_load_vertex_id:
+         unreachable("should be lowered by lower_vertex_id().");
+
+      case nir_intrinsic_load_vertex_id_zero_base: {
+         assert(v->stage == MESA_SHADER_VERTEX);
+         fs_reg &slot = v->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
+         if (slot.file == BAD_FILE)
+            slot = *v->emit_vs_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
+         break;
+      }
+
+      case nir_intrinsic_load_base_vertex: {
+         assert(v->stage == MESA_SHADER_VERTEX);
+         fs_reg &slot = v->nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
+         if (slot.file == BAD_FILE)
+            slot = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_VERTEX);
+         break;
+      }
+
+      case nir_intrinsic_load_instance_id: {
+         assert(v->stage == MESA_SHADER_VERTEX);
+         fs_reg &slot = v->nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
+         if (slot.file == BAD_FILE)
+            slot = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
+         break;
+      }
+
+      case nir_intrinsic_load_base_instance: {
+         assert(v->stage == MESA_SHADER_VERTEX);
+         fs_reg &slot = v->nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
+         if (slot.file == BAD_FILE)
+            slot = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_INSTANCE);
+         break;
+      }
+
+      case nir_intrinsic_load_draw_id: {
+         assert(v->stage == MESA_SHADER_VERTEX);
+         fs_reg &slot = v->nir_system_values[SYSTEM_VALUE_DRAW_ID];
+         if (slot.file == BAD_FILE)
+            slot = *v->emit_vs_system_value(SYSTEM_VALUE_DRAW_ID);
+         break;
+      }
+
+      case nir_intrinsic_load_invocation_id: {
+         /* Nothing is set up here for tessellation control shaders. */
+         if (v->stage == MESA_SHADER_TESS_CTRL)
+            break;
+         assert(v->stage == MESA_SHADER_GEOMETRY);
+         fs_reg &slot = v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
+         if (slot.file != BAD_FILE)
+            break;
+
+         /* The invocation ID is obtained by shifting g1 right by 27. */
+         const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
+         fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+         fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         abld.SHR(iid, g1, brw_imm_ud(27u));
+         slot = iid;
+         break;
+      }
+
+      case nir_intrinsic_load_sample_pos: {
+         assert(v->stage == MESA_SHADER_FRAGMENT);
+         fs_reg &slot = v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
+         if (slot.file == BAD_FILE)
+            slot = *v->emit_samplepos_setup();
+         break;
+      }
+
+      case nir_intrinsic_load_sample_id: {
+         assert(v->stage == MESA_SHADER_FRAGMENT);
+         fs_reg &slot = v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
+         if (slot.file == BAD_FILE)
+            slot = *v->emit_sampleid_setup();
+         break;
+      }
+
+      case nir_intrinsic_load_sample_mask_in: {
+         assert(v->stage == MESA_SHADER_FRAGMENT);
+         assert(v->devinfo->gen >= 7);
+         fs_reg &slot = v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
+         if (slot.file == BAD_FILE)
+            slot = *v->emit_samplemaskin_setup();
+         break;
+      }
+
+      case nir_intrinsic_load_work_group_id: {
+         assert(v->stage == MESA_SHADER_COMPUTE);
+         fs_reg &slot = v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
+         if (slot.file == BAD_FILE)
+            slot = *v->emit_cs_work_group_id_setup();
+         break;
+      }
+
+      case nir_intrinsic_load_helper_invocation: {
+         assert(v->stage == MESA_SHADER_FRAGMENT);
+         fs_reg &slot = v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
+         if (slot.file != BAD_FILE)
+            break;
+
+         const fs_builder abld =
+            v->bld.annotate("gl_HelperInvocation", NULL);
+
+         /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the pixel
+          * mask lives in g1.7 of the thread payload.
+          *
+          * Shifting the byte that holds the pixel mask by the vector
+          * immediate 0x76543210UV brings each channel's pixel-enable bit
+          * down to bit 0 of that channel.
+          *
+          * With a <1,8,0> region only one byte is read in SIMD8 (the pixel
+          * masks for subspans 0 and 1); SIMD16 reads one more byte (the
+          * masks for subspans 2 and 3).
+          */
+         fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
+         abld.SHR(shifted,
+                  stride(byte_offset(retype(brw_vec1_grf(1, 0),
+                                            BRW_REGISTER_TYPE_UB), 28),
+                         1, 8, 0),
+                  brw_imm_v(0x76543210));
+
+         /* A set bit means "channel enabled", which is the inverse of
+          * gl_HelperInvocation, so the mask has to be inverted.
+          *
+          * On Gen8+ the negate source modifier of logical instructions is a
+          * one's-complement negation, so it stands in for a NOT; older
+          * generations need the explicit NOT instruction.
+          */
+         fs_reg inverted;
+         if (v->devinfo->gen >= 8) {
+            inverted = negate(shifted);
+         } else {
+            inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
+            abld.NOT(inverted, shifted);
+         }
+
+         /* Turn the 0/1 value into a 0/~0 boolean: AND with 1, then
+          * negate.
+          */
+         fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         abld.AND(anded, inverted, brw_imm_uw(1));
+
+         fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
+         abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
+         slot = dst;
+         break;
+      }
+
+      default:
+         break;
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Allocate the nir_system_values[] table, reset every entry to a
+ * default-constructed fs_reg, and then let emit_system_values_block() fill
+ * in the system values used by each block of the "main" function.
+ */
+void
+fs_visitor::nir_emit_system_values()
+{
+   nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
+   for (unsigned sv = 0; sv < SYSTEM_VALUE_MAX; sv++)
+      nir_system_values[sv] = fs_reg();
+
+   nir_foreach_function(func, nir) {
+      assert(strcmp(func->name, "main") == 0);
+      assert(func->impl);
+
+      nir_foreach_block(blk, func->impl) {
+         emit_system_values_block(blk, this);
+      }
+   }
+}
+
+/**
+ * Emit code for one NIR function implementation: allocate a VGRF for every
+ * NIR register, size the SSA value table, then emit the body.
+ */
+void
+fs_visitor::nir_emit_impl(nir_function_impl *impl)
+{
+   nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
+   for (unsigned idx = 0; idx < impl->reg_alloc; idx++)
+      nir_locals[idx] = fs_reg();
+
+   foreach_list_typed(nir_register, reg, node, &impl->registers) {
+      /* A non-array register counts as an array of one element. */
+      unsigned array_elems = reg->num_array_elems;
+      if (array_elems == 0)
+         array_elems = 1;
+
+      const unsigned size = array_elems * reg->num_components;
+
+      /* 32-bit registers are typed F; every other bit size is typed DF. */
+      brw_reg_type reg_type = BRW_REGISTER_TYPE_DF;
+      if (reg->bit_size == 32)
+         reg_type = BRW_REGISTER_TYPE_F;
+
+      nir_locals[reg->index] = bld.vgrf(reg_type, size);
+   }
+
+   nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
+                             impl->ssa_alloc);
+
+   nir_emit_cf_list(&impl->body);
+}
+
+/**
+ * Emit every control-flow node in \p list, in order, dispatching on the node
+ * kind (if, loop or plain block).
+ */
+void
+fs_visitor::nir_emit_cf_list(exec_list *list)
+{
+   exec_list_validate(list);
+
+   foreach_list_typed(nir_cf_node, node, node, list) {
+      if (node->type == nir_cf_node_if) {
+         nir_emit_if(nir_cf_node_as_if(node));
+      } else if (node->type == nir_cf_node_loop) {
+         nir_emit_loop(nir_cf_node_as_loop(node));
+      } else if (node->type == nir_cf_node_block) {
+         nir_emit_block(nir_cf_node_as_block(node));
+      } else {
+         unreachable("Invalid CFG node block");
+      }
+   }
+}
+
+/**
+ * Emit an if/else/endif construct for a NIR if statement.
+ */
+void
+fs_visitor::nir_emit_if(nir_if *if_stmt)
+{
+   /* Load the condition into the flag register: a MOV to the null register
+    * with an NZ conditional modifier.
+    */
+   const fs_reg cond = retype(get_nir_src(if_stmt->condition),
+                              BRW_REGISTER_TYPE_D);
+   fs_inst *flag_mov = bld.MOV(bld.null_reg_d(), cond);
+   flag_mov->conditional_mod = BRW_CONDITIONAL_NZ;
+
+   bld.IF(BRW_PREDICATE_NORMAL);
+   nir_emit_cf_list(&if_stmt->then_list);
+
+   /* The ELSE is emitted unconditionally; when the else list is empty, dead
+    * control-flow elimination removes it later.
+    */
+   bld.emit(BRW_OPCODE_ELSE);
+   nir_emit_cf_list(&if_stmt->else_list);
+
+   bld.emit(BRW_OPCODE_ENDIF);
+}
+
+/**
+ * Emit a NIR loop as its body bracketed by a DO and a WHILE instruction.
+ */
+void
+fs_visitor::nir_emit_loop(nir_loop *loop)
+{
+   bld.emit(BRW_OPCODE_DO);
+   this->nir_emit_cf_list(&loop->body);
+   bld.emit(BRW_OPCODE_WHILE);
+}
+
+/**
+ * Emit each instruction of a NIR basic block, in order.
+ */
+void
+fs_visitor::nir_emit_block(nir_block *block)
+{
+   nir_foreach_instr(cur, block) {
+      this->nir_emit_instr(cur);
+   }
+}
+
+/**
+ * Emit a single NIR instruction, dispatching on its type.  Intrinsics are
+ * further dispatched on the shader stage being compiled.
+ */
+void
+fs_visitor::nir_emit_instr(nir_instr *instr)
+{
+   const fs_builder abld = bld.annotate(NULL, instr);
+
+   switch (instr->type) {
+   case nir_instr_type_alu:
+      nir_emit_alu(abld, nir_instr_as_alu(instr));
+      break;
+
+   case nir_instr_type_intrinsic: {
+      nir_intrinsic_instr *const intrin = nir_instr_as_intrinsic(instr);
+
+      switch (stage) {
+      case MESA_SHADER_VERTEX:
+         nir_emit_vs_intrinsic(abld, intrin);
+         break;
+      case MESA_SHADER_TESS_CTRL:
+         nir_emit_tcs_intrinsic(abld, intrin);
+         break;
+      case MESA_SHADER_TESS_EVAL:
+         nir_emit_tes_intrinsic(abld, intrin);
+         break;
+      case MESA_SHADER_GEOMETRY:
+         nir_emit_gs_intrinsic(abld, intrin);
+         break;
+      case MESA_SHADER_FRAGMENT:
+         nir_emit_fs_intrinsic(abld, intrin);
+         break;
+      case MESA_SHADER_COMPUTE:
+         nir_emit_cs_intrinsic(abld, intrin);
+         break;
+      default:
+         unreachable("unsupported shader stage");
+      }
+      break;
+   }
+
+   case nir_instr_type_tex:
+      nir_emit_texture(abld, nir_instr_as_tex(instr));
+      break;
+
+   case nir_instr_type_load_const:
+      nir_emit_load_const(abld, nir_instr_as_load_const(instr));
+      break;
+
+   case nir_instr_type_ssa_undef:
+      /* Nothing to emit at the definition: undefs get a fresh VGRF at each
+       * use instead (that is handled in get_nir_src()), which helps
+       * register coalescing get rid of MOVs from undef.
+       */
+      break;
+
+   case nir_instr_type_jump:
+      nir_emit_jump(abld, nir_instr_as_jump(instr));
+      break;
+
+   default:
+      unreachable("unknown instruction type");
+   }
+}
+
+/**
+ * If the first source of \p instr is produced by a nir_op_extract_*
+ * instruction, emit a single MOV into \p result that reads the extracted
+ * element directly from the extract's own source, typed to match the
+ * extract.
+ *
+ * Returns true when the MOV was emitted, false when the pattern does not
+ * apply (in which case nothing is emitted).
+ */
+bool
+fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
+                                      const fs_reg &result)
+{
+   const nir_src &operand = instr->src[0].src;
+   if (!operand.is_ssa || !operand.ssa->parent_instr)
+      return false;
+
+   nir_instr *const producer = operand.ssa->parent_instr;
+   if (producer->type != nir_instr_type_alu)
+      return false;
+
+   nir_alu_instr *const src0 = nir_instr_as_alu(producer);
+
+   /* Size in bytes and signedness of the element being extracted. */
+   unsigned elem_bytes;
+   bool elem_signed;
+   switch (src0->op) {
+   case nir_op_extract_u8:
+      elem_bytes = 1;
+      elem_signed = false;
+      break;
+   case nir_op_extract_i8:
+      elem_bytes = 1;
+      elem_signed = true;
+      break;
+   case nir_op_extract_u16:
+      elem_bytes = 2;
+      elem_signed = false;
+      break;
+   case nir_op_extract_i16:
+      elem_bytes = 2;
+      elem_signed = true;
+      break;
+   default:
+      return false;
+   }
+
+   nir_const_value *element = nir_src_as_const_value(src0->src[1].src);
+   assert(element != NULL);
+
+   const brw_reg_type type = brw_int_type(elem_bytes, elem_signed);
+
+   const nir_alu_type extract_src_type =
+      (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
+                     nir_src_bit_size(src0->src[0].src));
+
+   fs_reg op0 = get_nir_src(src0->src[0].src);
+   op0.type = brw_type_for_nir_type(devinfo, extract_src_type);
+   op0 = offset(op0, bld, src0->src[0].swizzle[0]);
+
+   fs_inst *mov = bld.MOV(result, subscript(op0, type, element->u32[0]));
+   set_saturate(instr->dest.saturate, mov);
+   return true;
+}
+
+/**
+ * Recognize (gl_FrontFacing ? 1.0 : -1.0) and (gl_FrontFacing ? -1.0 : 1.0)
+ * and emit them as an OR/AND pair that builds the float directly from the
+ * front-facing bit of the thread payload.
+ *
+ * \param instr   the select instruction; src[0] is the condition, src[1] and
+ *                src[2] the two candidate values
+ * \param result  destination register for the selected value
+ *
+ * Returns true if the optimized sequence was emitted.  Returns false,
+ * without emitting anything, when the pattern does not match, so that the
+ * caller can use its generic code path.
+ */
+bool
+fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
+                                         const fs_reg &result)
+{
+   if (!instr->src[0].src.is_ssa ||
+       instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *src0 =
+      nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
+
+   if (src0->intrinsic != nir_intrinsic_load_front_face)
+      return false;
+
+   nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
+   if (!value1 || fabsf(value1->f32[0]) != 1.0f)
+      return false;
+
+   nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src);
+   if (!value2 || fabsf(value2->f32[0]) != 1.0f)
+      return false;
+
+   /* The sequence below always produces values of opposite sign for the two
+    * facings; only the sign of value1 picks which facing gets +1.0.  If both
+    * constants are the same (+1.0/+1.0 or -1.0/-1.0) it would give the wrong
+    * answer for one facing, so leave that case to the generic path.
+    */
+   if (value1->f32[0] != -value2->f32[0])
+      return false;
+
+   fs_reg tmp = vgrf(glsl_type::int_type);
+
+   if (devinfo->gen >= 6) {
+      /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
+      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
+
+      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
+       *
+       *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
+       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
+       *
+       * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
+       *
+       * This negation looks like it's safe in practice, because bits 0:4 will
+       * surely be TRIANGLES
+       */
+
+      if (value1->f32[0] == -1.0f) {
+         g0.negate = true;
+      }
+
+      bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
+             g0, brw_imm_uw(0x3f80));
+   } else {
+      /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
+      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
+
+      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
+       *
+       *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
+       *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
+       *
+       * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
+       *
+       * This negation looks like it's safe in practice, because bits 0:4 will
+       * surely be TRIANGLES
+       */
+
+      if (value1->f32[0] == -1.0f) {
+         g1_6.negate = true;
+      }
+
+      bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
+   }
+
+   /* Keep only the sign bit and the bit pattern of 1.0. */
+   bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
+
+   return true;
+}
+
+/**
+ * Emit findMSB() for \p src into \p result using the LZD (leading zero
+ * detect) instruction.
+ *
+ * \param bld        builder used to emit the instructions
+ * \param result     destination register
+ * \param src        value to examine
+ * \param is_signed  whether signed findMSB() semantics are wanted
+ */
+static void
+emit_find_msb_using_lzd(const fs_builder &bld,
+                        const fs_reg &result,
+                        const fs_reg &src,
+                        bool is_signed)
+{
+   fs_reg lzd_src = src;
+
+   if (is_signed) {
+      /* Running LZD on the absolute value of the source is right most of
+       * the time, but it goes wrong for three kinds of input:
+       *
+       * * 0x80000000.  abs(0x80000000) is still 0x80000000, so LZD returns
+       *   0, whereas findMSB(int(0x80000000)) == 30.
+       *
+       * * 0xffffffff.  abs(0xffffffff) is 1, so LZD returns 31, but
+       *   Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
+       *
+       *       For a value of zero or negative one, -1 will be returned.
+       *
+       * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
+       *   findMSB(-(1<<x)) should return x-1.
+       *
+       * All negative inputs, including 0x80000000 and 0xffffffff, come out
+       * right if the (already negative) value is logically inverted rather
+       * than negated.  A conditional logical-not takes two instructions:
+       * ASR by 31 yields an all-ones mask for negative inputs and zero
+       * otherwise, and XORing the source with that mask inverts only the
+       * negative ones.
+       */
+      lzd_src = bld.vgrf(BRW_REGISTER_TYPE_D);
+
+      bld.ASR(lzd_src, src, brw_imm_d(31));
+      bld.XOR(lzd_src, lzd_src, src);
+   }
+
+   bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
+           retype(lzd_src, BRW_REGISTER_TYPE_UD));
+
+   /* LZD counts from the MSB end whereas GLSL's findMSB() counts from the
+    * LSB end, so compute 31 - count (emitted as an ADD of 31 to the negated
+    * count).  With no bits set LZD returns 32, and 31 - 32 = -1 is exactly
+    * what findMSB() must return in that case.
+    */
+   fs_inst *flip = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D),
+                           brw_imm_d(31));
+   flip->src[0].negate = true;
+}
+
+void
+fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
+{
+ struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
+ fs_inst *inst;
+
+ fs_reg result = get_nir_dest(instr->dest.dest);
+ result.type = brw_type_for_nir_type(devinfo,
+ (nir_alu_type)(nir_op_infos[instr->op].output_type |
+ nir_dest_bit_size(instr->dest.dest)));
+
+ fs_reg op[4];
+ for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+ op[i] = get_nir_src(instr->src[i].src);
+ op[i].type = brw_type_for_nir_type(devinfo,
+ (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
+ nir_src_bit_size(instr->src[i].src)));
+ op[i].abs = instr->src[i].abs;
+ op[i].negate = instr->src[i].negate;
+ }
+
+ /* We get a bunch of mov's out of the from_ssa pass and they may still
+ * be vectorized. We'll handle them as a special-case. We'll also
+ * handle vecN here because it's basically the same thing.
+ */
+ switch (instr->op) {
+ case nir_op_imov:
+ case nir_op_fmov:
+ case nir_op_vec2:
+ case nir_op_vec3:
+ case nir_op_vec4: {
+ fs_reg temp = result;
+ bool need_extra_copy = false;
+ for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+ if (!instr->src[i].src.is_ssa &&
+ instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
+ need_extra_copy = true;
+ temp = bld.vgrf(result.type, 4);
+ break;
+ }
+ }
+
+ for (unsigned i = 0; i < 4; i++) {
+ if (!(instr->dest.write_mask & (1 << i)))
+ continue;
+
+ if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
+ inst = bld.MOV(offset(temp, bld, i),
+ offset(op[0], bld, instr->src[0].swizzle[i]));
+ } else {
+ inst = bld.MOV(offset(temp, bld, i),
+ offset(op[i], bld, instr->src[i].swizzle[0]));
+ }
+ inst->saturate = instr->dest.saturate;
+ }
+
+ /* In this case the source and destination registers were the same,
+ * so we need to insert an extra set of moves in order to deal with
+ * any swizzling.
+ */
+ if (need_extra_copy) {
+ for (unsigned i = 0; i < 4; i++) {
+ if (!(instr->dest.write_mask & (1 << i)))
+ continue;
+
+ bld.MOV(offset(result, bld, i), offset(temp, bld, i));
+ }
+ }
+ return;
+ }
+ default:
+ break;
+ }
+
+ /* At this point, we have dealt with any instruction that operates on
+ * more than a single channel. Therefore, we can just adjust the source
+ * and destination registers for that channel and emit the instruction.
+ */
+ unsigned channel = 0;
+ if (nir_op_infos[instr->op].output_size == 0) {
+ /* Since NIR is doing the scalarizing for us, we should only ever see
+ * vectorized operations with a single channel.
+ */
+ assert(_mesa_bitcount(instr->dest.write_mask) == 1);
+ channel = ffs(instr->dest.write_mask) - 1;
+
+ result = offset(result, bld, channel);
+ }
+
+ for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+ assert(nir_op_infos[instr->op].input_sizes[i] < 2);
+ op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
+ }
+
+ switch (instr->op) {
+ case nir_op_i2f:
+ case nir_op_u2f:
+ case nir_op_i642d:
+ case nir_op_u642d:
+ if (optimize_extract_to_float(instr, result))
+ return;
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_f2d:
+ case nir_op_i2d:
+ case nir_op_u2d:
+ /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
+ *
+ * "When source or destination is 64b (...), regioning in Align1
+ * must follow these rules:
+ *
+ * 1. Source and destination horizontal stride must be aligned to
+ * the same qword.
+ * (...)"
+ *
+ * This means that 32-bit to 64-bit conversions need to have the 32-bit
+ * data elements aligned to 64-bit. This restriction does not apply to
+ * BDW and later.
+ */
+ if (nir_dest_bit_size(instr->dest.dest) == 64 &&
+ nir_src_bit_size(instr->src[0].src) == 32 &&
+ (devinfo->is_cherryview || devinfo->is_broxton)) {
+ fs_reg tmp = bld.vgrf(result.type, 1);
+ tmp = subscript(tmp, op[0].type, 0);
+ inst = bld.MOV(tmp, op[0]);
+ inst = bld.MOV(result, tmp);
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
+ /* fallthrough */
+ case nir_op_f2i64:
+ case nir_op_f2u64:
+ case nir_op_i2i64:
+ case nir_op_i2u64:
+ case nir_op_u2i64:
+ case nir_op_u2u64:
+ case nir_op_b2i64:
+ case nir_op_d2f:
+ case nir_op_d2i:
+ case nir_op_d2u:
+ case nir_op_i642f:
+ case nir_op_u642f:
+ case nir_op_u2i32:
+ case nir_op_i2i32:
+ case nir_op_u2u32:
+ case nir_op_i2u32:
+ if (instr->op == nir_op_b2i64) {
+ bld.MOV(result, negate(op[0]));
+ } else {
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ }
+ break;
+
+ case nir_op_f2i:
+ case nir_op_f2u:
+ bld.MOV(result, op[0]);
+ break;
+
+ case nir_op_fsign: {
+ if (op[0].abs) {
+ /* Straightforward since the source can be assumed to be
+ * non-negative.
+ */
+ set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0]));
+ set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(result, brw_imm_f(1.0f)));
+
+ } else if (type_sz(op[0].type) < 8) {
+ /* AND(val, 0x80000000) gives the sign bit.
+ *
+ * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
+ * zero.
+ */
+ bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
+
+ fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
+ op[0].type = BRW_REGISTER_TYPE_UD;
+ result.type = BRW_REGISTER_TYPE_UD;
+ bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
+
+ inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ if (instr->dest.saturate) {
+ inst = bld.MOV(result, result);
+ inst->saturate = true;
+ }
+ } else {
+ /* For doubles we do the same but we need to consider:
+ *
+ * - 2-src instructions can't operate with 64-bit immediates
+ * - The sign is encoded in the high 32-bit of each DF
+ * - CMP with DF requires special handling in SIMD16
+ * - We need to produce a DF result.
+ */
+
+ /* 2-src instructions can't have 64-bit immediates, so put 0.0 in
+ * a register and compare with that.
+ */
+ fs_reg tmp = vgrf(glsl_type::double_type);
+ bld.MOV(tmp, setup_imm_df(bld, 0.0));
+
+ /* A direct DF CMP using the flag register (null dst) won't work in
+ * SIMD16 because the CMP will be split in two by lower_simd_width,
+ * resulting in two CMP instructions with the same dst (NULL),
+ * leading to dead code elimination of the first one. In SIMD8,
+ * however, there is no need to split the CMP and we can save some
+ * work.
+ */
+ fs_reg dst_tmp = vgrf(glsl_type::double_type);
+ bld.CMP(dst_tmp, op[0], tmp, BRW_CONDITIONAL_NZ);
+
+ /* In SIMD16 we want to avoid using a NULL dst register with DF CMP,
+ * so we store the result of the comparison in a vgrf instead and
+ * then we generate a UD comparison from that that won't have to
+ * be split by lower_simd_width. This is what NIR does to handle
+ * double comparisons in the general case.
+ */
+ if (bld.dispatch_width() == 16 ) {
+ fs_reg dst_tmp_ud = retype(dst_tmp, BRW_REGISTER_TYPE_UD);
+ bld.MOV(dst_tmp_ud, subscript(dst_tmp, BRW_REGISTER_TYPE_UD, 0));
+ bld.CMP(bld.null_reg_ud(),
+ dst_tmp_ud, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
+ }
+
+ /* Get the high 32-bit of each double component where the sign is */
+ fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
+ bld.MOV(result_int, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
+
+ /* Get the sign bit */
+ bld.AND(result_int, result_int, brw_imm_ud(0x80000000u));
+
+ /* Add 1.0 to the sign, predicated to skip the case of op[0] == 0.0 */
+ inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+
+ /* Convert from 32-bit float to 64-bit double */
+ result.type = BRW_REGISTER_TYPE_DF;
+ inst = bld.MOV(result, retype(result_int, BRW_REGISTER_TYPE_F));
+
+ if (instr->dest.saturate) {
+ inst = bld.MOV(result, result);
+ inst->saturate = true;
+ }
+ }
+ break;
+ }
+
+ case nir_op_isign:
+ /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
+ * -> non-negative val generates 0x00000000.
+ * Predicated OR sets 1 if val is positive.
+ */
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G);
+ bld.ASR(result, op[0], brw_imm_d(31));
+ inst = bld.OR(result, result, brw_imm_d(1));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ break;
+
+ case nir_op_frcp:
+ inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fexp2:
+ inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_flog2:
+ inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fsin:
+ inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fcos:
+ inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fddx:
+ if (fs_key->high_quality_derivatives) {
+ inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
+ } else {
+ inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
+ }
+ inst->saturate = instr->dest.saturate;
+ break;
+ case nir_op_fddx_fine:
+ inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+ case nir_op_fddx_coarse:
+ inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+ case nir_op_fddy:
+ if (fs_key->high_quality_derivatives) {
+ inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
+ } else {
+ inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
+ }
+ inst->saturate = instr->dest.saturate;
+ break;
+ case nir_op_fddy_fine:
+ inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+ case nir_op_fddy_coarse:
+ inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_iadd:
+ case nir_op_fadd:
+ inst = bld.ADD(result, op[0], op[1]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fmul:
+ inst = bld.MUL(result, op[0], op[1]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_imul:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ bld.MUL(result, op[0], op[1]);
+ break;
+
+ case nir_op_imul_high:
+ case nir_op_umul_high:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
+ break;
+
+ case nir_op_idiv:
+ case nir_op_udiv:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
+ break;
+
+ case nir_op_uadd_carry:
+ unreachable("Should have been lowered by carry_to_arith().");
+
+ case nir_op_usub_borrow:
+ unreachable("Should have been lowered by borrow_to_arith().");
+
+ case nir_op_umod:
+ case nir_op_irem:
+ /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
+ * appears that our hardware just does the right thing for signed
+ * remainder.
+ */
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
+ break;
+
+ case nir_op_imod: {
+ /* Get a regular C-style remainder. If a % b == 0, set the predicate. */
+ bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
+
+ /* Math instructions don't support conditional mod */
+ inst = bld.MOV(bld.null_reg_d(), result);
+ inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+ /* Now, we need to determine if signs of the sources are different.
+ * When we XOR the sources, the top bit is 0 if they are the same and 1
+ * if they are different. We can then use a conditional modifier to
+ * turn that into a predicate. This leads us to an XOR.l instruction.
+ *
+ * Technically, according to the PRM, you're not allowed to use .l on a
+ * XOR instruction. However, emperical experiments and Curro's reading
+ * of the simulator source both indicate that it's safe.
+ */
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
+ inst = bld.XOR(tmp, op[0], op[1]);
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->conditional_mod = BRW_CONDITIONAL_L;
+
+ /* If the result of the initial remainder operation is non-zero and the
+ * two sources have different signs, add in a copy of op[1] to get the
+ * final integer modulus value.
+ */
+ inst = bld.ADD(result, result, op[1]);
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ break;
+ }
+
+ case nir_op_flt:
+ case nir_op_fge:
+ case nir_op_feq:
+ case nir_op_fne: {
+ fs_reg dest = result;
+ if (nir_src_bit_size(instr->src[0].src) > 32) {
+ dest = bld.vgrf(BRW_REGISTER_TYPE_DF, 1);
+ }
+ brw_conditional_mod cond;
+ switch (instr->op) {
+ case nir_op_flt:
+ cond = BRW_CONDITIONAL_L;
+ break;
+ case nir_op_fge:
+ cond = BRW_CONDITIONAL_GE;
+ break;
+ case nir_op_feq:
+ cond = BRW_CONDITIONAL_Z;
+ break;
+ case nir_op_fne:
+ cond = BRW_CONDITIONAL_NZ;
+ break;
+ default:
+ unreachable("bad opcode");
+ }
+ bld.CMP(dest, op[0], op[1], cond);
+ if (nir_src_bit_size(instr->src[0].src) > 32) {
+ bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
+ }
+ break;
+ }
+
+ case nir_op_ilt:
+ case nir_op_ult:
+ case nir_op_ige:
+ case nir_op_uge:
+ case nir_op_ieq:
+ case nir_op_ine: {
+ fs_reg dest = result;
+ if (nir_src_bit_size(instr->src[0].src) > 32) {
+ dest = bld.vgrf(BRW_REGISTER_TYPE_UQ, 1);
+ }
+
+ brw_conditional_mod cond;
+ switch (instr->op) {
+ case nir_op_ilt:
+ case nir_op_ult:
+ cond = BRW_CONDITIONAL_L;
+ break;
+ case nir_op_ige:
+ case nir_op_uge:
+ cond = BRW_CONDITIONAL_GE;
+ break;
+ case nir_op_ieq:
+ cond = BRW_CONDITIONAL_Z;
+ break;
+ case nir_op_ine:
+ cond = BRW_CONDITIONAL_NZ;
+ break;
+ default:
+ unreachable("bad opcode");
+ }
+ bld.CMP(dest, op[0], op[1], cond);
+ if (nir_src_bit_size(instr->src[0].src) > 32) {
+ bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
+ }
+ break;
+ }
+
+ case nir_op_inot:
+ if (devinfo->gen >= 8) {
+ op[0] = resolve_source_modifiers(op[0]);
+ }
+ bld.NOT(result, op[0]);
+ break;
+ case nir_op_ixor:
+ if (devinfo->gen >= 8) {
+ op[0] = resolve_source_modifiers(op[0]);
+ op[1] = resolve_source_modifiers(op[1]);
+ }
+ bld.XOR(result, op[0], op[1]);
+ break;
+ case nir_op_ior:
+ if (devinfo->gen >= 8) {
+ op[0] = resolve_source_modifiers(op[0]);
+ op[1] = resolve_source_modifiers(op[1]);
+ }
+ bld.OR(result, op[0], op[1]);
+ break;
+ case nir_op_iand:
+ if (devinfo->gen >= 8) {
+ op[0] = resolve_source_modifiers(op[0]);
+ op[1] = resolve_source_modifiers(op[1]);
+ }
+ bld.AND(result, op[0], op[1]);
+ break;
+
+ case nir_op_fdot2:
+ case nir_op_fdot3:
+ case nir_op_fdot4:
+ case nir_op_ball_fequal2:
+ case nir_op_ball_iequal2:
+ case nir_op_ball_fequal3:
+ case nir_op_ball_iequal3:
+ case nir_op_ball_fequal4:
+ case nir_op_ball_iequal4:
+ case nir_op_bany_fnequal2:
+ case nir_op_bany_inequal2:
+ case nir_op_bany_fnequal3:
+ case nir_op_bany_inequal3:
+ case nir_op_bany_fnequal4:
+ case nir_op_bany_inequal4:
+ unreachable("Lowered by nir_lower_alu_reductions");
+
+ case nir_op_fnoise1_1:
+ case nir_op_fnoise1_2:
+ case nir_op_fnoise1_3:
+ case nir_op_fnoise1_4:
+ case nir_op_fnoise2_1:
+ case nir_op_fnoise2_2:
+ case nir_op_fnoise2_3:
+ case nir_op_fnoise2_4:
+ case nir_op_fnoise3_1:
+ case nir_op_fnoise3_2:
+ case nir_op_fnoise3_3:
+ case nir_op_fnoise3_4:
+ case nir_op_fnoise4_1:
+ case nir_op_fnoise4_2:
+ case nir_op_fnoise4_3:
+ case nir_op_fnoise4_4:
+ unreachable("not reached: should be handled by lower_noise");
+
+ case nir_op_ldexp:
+ unreachable("not reached: should be handled by ldexp_to_arith()");
+
+ case nir_op_fsqrt:
+ inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_frsq:
+ inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_b2i:
+ case nir_op_b2f:
+ bld.MOV(result, negate(op[0]));
+ break;
+
+ case nir_op_f2b:
+ bld.CMP(result, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
+ break;
+
+ case nir_op_i642b:
+ case nir_op_d2b: {
+ /* two-argument instructions can't take 64-bit immediates */
+ fs_reg zero;
+ fs_reg tmp;
+
+ if (instr->op == nir_op_d2b) {
+ zero = vgrf(glsl_type::double_type);
+ tmp = vgrf(glsl_type::double_type);
+ } else {
+ zero = vgrf(glsl_type::int64_t_type);
+ tmp = vgrf(glsl_type::int64_t_type);
+ }
+
+ bld.MOV(zero, setup_imm_df(bld, 0.0));
+ /* A SIMD16 execution needs to be split in two instructions, so use
+ * a vgrf instead of the flag register as dst so instruction splitting
+ * works
+ */
+ bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
+ bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
+ break;
+ }
+ case nir_op_i2b:
+ bld.CMP(result, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
+ break;
+
+ case nir_op_ftrunc:
+ inst = bld.RNDZ(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fceil: {
+ op[0].negate = !op[0].negate;
+ fs_reg temp = vgrf(glsl_type::float_type);
+ bld.RNDD(temp, op[0]);
+ temp.negate = true;
+ inst = bld.MOV(result, temp);
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
+ case nir_op_ffloor:
+ inst = bld.RNDD(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+ case nir_op_ffract:
+ inst = bld.FRC(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+ case nir_op_fround_even:
+ inst = bld.RNDE(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fquantize2f16: {
+ fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
+ fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
+ fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
+
+ /* The destination stride must be at least as big as the source stride. */
+ tmp16.type = BRW_REGISTER_TYPE_W;
+ tmp16.stride = 2;
+
+ /* Check for denormal */
+ fs_reg abs_src0 = op[0];
+ abs_src0.abs = true;
+ bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
+ BRW_CONDITIONAL_L);
+ /* Get the appropriately signed zero */
+ bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
+ retype(op[0], BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(0x80000000));
+ /* Do the actual F32 -> F16 -> F32 conversion */
+ bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
+ bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
+ /* Select that or zero based on normal status */
+ inst = bld.SEL(result, zero, tmp32);
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
+
+ case nir_op_imin:
+ case nir_op_umin:
+ case nir_op_fmin:
+ inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_imax:
+ case nir_op_umax:
+ case nir_op_fmax:
+ inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_pack_snorm_2x16:
+ case nir_op_pack_snorm_4x8:
+ case nir_op_pack_unorm_2x16:
+ case nir_op_pack_unorm_4x8:
+ case nir_op_unpack_snorm_2x16:
+ case nir_op_unpack_snorm_4x8:
+ case nir_op_unpack_unorm_2x16:
+ case nir_op_unpack_unorm_4x8:
+ case nir_op_unpack_half_2x16:
+ case nir_op_pack_half_2x16:
+ unreachable("not reached: should be handled by lower_packing_builtins");
+
+ case nir_op_unpack_half_2x16_split_x:
+ inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+ case nir_op_unpack_half_2x16_split_y:
+ inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_pack_64_2x32_split:
+ bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
+ break;
+
+ case nir_op_unpack_64_2x32_split_x:
+ case nir_op_unpack_64_2x32_split_y: {
+ if (instr->op == nir_op_unpack_64_2x32_split_x)
+ bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
+ else
+ bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
+ break;
+ }
+
+ case nir_op_fpow:
+ inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_bitfield_reverse:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ bld.BFREV(result, op[0]);
+ break;
+
+ case nir_op_bit_count:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ bld.CBIT(result, op[0]);
+ break;
+
+ case nir_op_ufind_msb: {
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit_find_msb_using_lzd(bld, result, op[0], false);
+ break;
+ }
+
+ case nir_op_ifind_msb: {
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+
+ if (devinfo->gen < 7) {
+ emit_find_msb_using_lzd(bld, result, op[0], true);
+ } else {
+ bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
+
+ /* FBH counts from the MSB side, while GLSL's findMSB() wants the
+ * count from the LSB side. If FBH didn't return an error
+ * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
+ * count into an LSB count.
+ */
+ bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
+
+ inst = bld.ADD(result, result, brw_imm_d(31));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->src[0].negate = true;
+ }
+ break;
+ }
+
+ case nir_op_find_lsb:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+
+ if (devinfo->gen < 7) {
+ fs_reg temp = vgrf(glsl_type::int_type);
+
+ /* (x & -x) generates a value that consists of only the LSB of x.
+ * For all powers of 2, findMSB(y) == findLSB(y).
+ */
+ fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
+ fs_reg negated_src = src;
+
+ /* One must be negated, and the other must be non-negated. It
+ * doesn't matter which is which.
+ */
+ negated_src.negate = true;
+ src.negate = false;
+
+ bld.AND(temp, src, negated_src);
+ emit_find_msb_using_lzd(bld, result, temp, false);
+ } else {
+ bld.FBL(result, op[0]);
+ }
+ break;
+
+ case nir_op_ubitfield_extract:
+ case nir_op_ibitfield_extract:
+ unreachable("should have been lowered");
+ case nir_op_ubfe:
+ case nir_op_ibfe:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ bld.BFE(result, op[2], op[1], op[0]);
+ break;
+ case nir_op_bfm:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ bld.BFI1(result, op[0], op[1]);
+ break;
+ case nir_op_bfi:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ bld.BFI2(result, op[0], op[1], op[2]);
+ break;
+
+ case nir_op_bitfield_insert:
+ unreachable("not reached: should have been lowered");
+
+ case nir_op_ishl:
+ bld.SHL(result, op[0], op[1]);
+ break;
+ case nir_op_ishr:
+ bld.ASR(result, op[0], op[1]);
+ break;
+ case nir_op_ushr:
+ bld.SHR(result, op[0], op[1]);
+ break;
+
+ case nir_op_pack_half_2x16_split:
+ bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
+ break;
+
+ case nir_op_ffma:
+ inst = bld.MAD(result, op[2], op[1], op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_flrp:
+ inst = bld.LRP(result, op[0], op[1], op[2]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_bcsel:
+ if (optimize_frontfacing_ternary(instr, result))
+ return;
+
+ bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
+ inst = bld.SEL(result, op[1], op[2]);
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ break;
+
+ case nir_op_extract_u8:
+ case nir_op_extract_i8: {
+ const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
+ nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
+ assert(byte != NULL);
+ bld.MOV(result, subscript(op[0], type, byte->u32[0]));
+ break;
+ }
+
+ case nir_op_extract_u16:
+ case nir_op_extract_i16: {
+ const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
+ nir_const_value *word = nir_src_as_const_value(instr->src[1].src);
+ assert(word != NULL);
+ bld.MOV(result, subscript(op[0], type, word->u32[0]));
+ break;
+ }
+
+ default:
+ unreachable("unhandled instruction");
+ }
+
+ /* If we need to do a boolean resolve, replace the result with -(x & 1)
+ * to sign extend the low bit to 0/~0
+ */
+ if (devinfo->gen <= 5 &&
+ (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
+ fs_reg masked = vgrf(glsl_type::int_type);
+ bld.AND(masked, result, brw_imm_d(1));
+ masked.negate = true;
+ bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
+ }
+}
+
+/**
+ * Materialize a NIR load_const instruction.
+ *
+ * A fresh VGRF with one slot per component is allocated, each constant
+ * component is MOVed into it, and the register is recorded as the value of
+ * the instruction's SSA definition.  Only 32-bit and 64-bit constants are
+ * handled.
+ */
+void
+fs_visitor::nir_emit_load_const(const fs_builder &bld,
+                                nir_load_const_instr *instr)
+{
+   const unsigned bit_size = instr->def.bit_size;
+   const unsigned num_comps = instr->def.num_components;
+
+   /* Anything that is not 32 bits wide gets a DF-typed register. */
+   brw_reg_type type = BRW_REGISTER_TYPE_DF;
+   if (bit_size == 32)
+      type = BRW_REGISTER_TYPE_D;
+
+   fs_reg dst = bld.vgrf(type, num_comps);
+
+   if (bit_size == 32) {
+      for (unsigned c = 0; c < num_comps; c++) {
+         const fs_reg comp = offset(dst, bld, c);
+         bld.MOV(comp, brw_imm_d(instr->value.i32[c]));
+      }
+   } else if (bit_size == 64) {
+      /* 64-bit values go through setup_imm_df() rather than a direct
+       * immediate.
+       */
+      for (unsigned c = 0; c < num_comps; c++) {
+         const fs_reg comp = offset(dst, bld, c);
+         bld.MOV(comp, setup_imm_df(bld, instr->value.f64[c]));
+      }
+   } else {
+      unreachable("Invalid bit size");
+   }
+
+   nir_ssa_values[instr->def.index] = dst;
+}
+
+/**
+ * Look up the register backing a NIR source.
+ *
+ * SSA sources come from nir_ssa_values, except for ssa_undef sources which
+ * get a newly allocated VGRF.  Non-SSA sources are resolved through
+ * nir_locals at the source's base offset.  The returned register is always
+ * retyped to D.
+ */
+fs_reg
+fs_visitor::get_nir_src(const nir_src &src)
+{
+   fs_reg reg;
+
+   if (!src.is_ssa) {
+      /* We don't handle indirects on locals */
+      assert(src.reg.indirect == NULL);
+
+      const unsigned comp_offset =
+         src.reg.base_offset * src.reg.reg->num_components;
+      reg = offset(nir_locals[src.reg.reg->index], bld, comp_offset);
+   } else if (src.ssa->parent_instr->type != nir_instr_type_ssa_undef) {
+      reg = nir_ssa_values[src.ssa->index];
+   } else {
+      /* Undefined value: hand back a fresh register of matching size. */
+      brw_reg_type undef_type = BRW_REGISTER_TYPE_DF;
+      if (src.ssa->bit_size == 32)
+         undef_type = BRW_REGISTER_TYPE_D;
+
+      reg = bld.vgrf(undef_type, src.ssa->num_components);
+   }
+
+   /* to avoid floating-point denorm flushing problems, set the type by
+    * default to D - instructions that need floating point semantics will set
+    * this to F if they need to
+    */
+   reg = retype(reg, BRW_REGISTER_TYPE_D);
+   return reg;
+}
+
+/**
+ * Return an IMM for constants; otherwise call get_nir_src() as normal.
+ *
+ * The immediate is a signed 32-bit value built from the first component of
+ * the constant (i32[0]), so a constant source must be 32 bits wide.
+ * Non-constant sources are passed through to get_nir_src() unchanged.
+ */
+fs_reg
+fs_visitor::get_nir_src_imm(const nir_src &src)
+{
+   nir_const_value *val = nir_src_as_const_value(src);
+
+   /* Reading i32[0] of a constant of any other bit size would silently
+    * truncate or misinterpret it, so catch that misuse in debug builds.
+    */
+   assert(val == NULL || nir_src_bit_size(src) == 32);
+
+   return val ? fs_reg(brw_imm_d(val->i32[0])) : get_nir_src(src);
+}
+
+/**
+ * Return the register an instruction should write for a NIR destination.
+ *
+ * For an SSA destination a new VGRF is allocated (F for 32-bit values, DF
+ * otherwise) and stored in nir_ssa_values.  For a register destination the
+ * matching nir_locals entry is returned at the destination's base offset.
+ */
+fs_reg
+fs_visitor::get_nir_dest(const nir_dest &dest)
+{
+   if (!dest.is_ssa) {
+      /* We don't handle indirects on locals */
+      assert(dest.reg.indirect == NULL);
+
+      const unsigned comp_offset =
+         dest.reg.base_offset * dest.reg.reg->num_components;
+      return offset(nir_locals[dest.reg.reg->index], bld, comp_offset);
+   }
+
+   brw_reg_type type = BRW_REGISTER_TYPE_DF;
+   if (dest.ssa.bit_size == 32)
+      type = BRW_REGISTER_TYPE_F;
+
+   const fs_reg reg = bld.vgrf(type, dest.ssa.num_components);
+   nir_ssa_values[dest.ssa.index] = reg;
+   return reg;
+}
+
+/**
+ * Resolve a NIR image variable dereference to the register holding its
+ * image parameters.
+ *
+ * The deref chain is walked one array level at a time.  Constant indices
+ * are folded into the UNIFORM register offset; indirect indices are clamped,
+ * scaled and summed into a single offset register.  If no level was
+ * indirect the UNIFORM register is returned directly, otherwise the
+ * parameters are copied into a temporary VGRF with MOV_INDIRECT and that
+ * temporary is returned.
+ */
+fs_reg
+fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
+{
+ /* NOTE(review): driver_location is divided by 4 here, presumably to turn a
+ * byte location into a dword-sized uniform slot -- confirm against the
+ * code that assigns driver_location.
+ */
+ fs_reg image(UNIFORM, deref->var->data.driver_location / 4,
+ BRW_REGISTER_TYPE_UD);
+ /* Stays BAD_FILE until the first indirect array index is seen. */
+ fs_reg indirect;
+ /* Accumulated upper bound, in scalar elements, of the indirect offset. */
+ unsigned indirect_max = 0;
+
+ for (const nir_deref *tail = &deref->deref; tail->child;
+ tail = tail->child) {
+ /* Every link in an image deref chain is expected to be an array deref. */
+ const nir_deref_array *deref_array = nir_deref_as_array(tail->child);
+ assert(tail->child->deref_type == nir_deref_type_array);
+ const unsigned size = glsl_get_length(tail->type);
+ const unsigned element_size = type_size_scalar(deref_array->deref.type);
+ /* Clamp the constant part of the index to the last valid element. */
+ const unsigned base = MIN2(deref_array->base_offset, size - 1);
+ image = offset(image, bld, base * element_size);
+
+ if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
+ fs_reg tmp = vgrf(glsl_type::uint_type);
+
+ /* Accessing an invalid surface index with the dataport can result
+ * in a hang. According to the spec "if the index used to
+ * select an individual element is negative or greater than or
+ * equal to the size of the array, the results of the operation
+ * are undefined but may not lead to termination" -- which is one
+ * of the possible outcomes of the hang. Clamp the index to
+ * prevent access outside of the array bounds.
+ */
+ bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect),
+ BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(size - base - 1), BRW_CONDITIONAL_L);
+
+ indirect_max += element_size * (tail->type->length - 1);
+
+ /* Scale the clamped index by the element size; the extra factor of 4
+ * presumably converts scalar elements to bytes -- confirm against
+ * the MOV_INDIRECT operand convention.
+ */
+ bld.MUL(tmp, tmp, brw_imm_ud(element_size * 4));
+ if (indirect.file == BAD_FILE) {
+ indirect = tmp;
+ } else {
+ bld.ADD(indirect, indirect, tmp);
+ }
+ }
+ }
+
+ if (indirect.file == BAD_FILE) {
+ /* Fully constant dereference: the uniform can be used in place. */
+ return image;
+ } else {
+ /* Emit a pile of MOVs to load the uniform into a temporary. The
+ * dead-code elimination pass will get rid of what we don't use.
+ */
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, BRW_IMAGE_PARAM_SIZE);
+ for (unsigned j = 0; j < BRW_IMAGE_PARAM_SIZE; j++) {
+ /* The last source is derived from indirect_max using the same factor
+ * of 4 that was applied to the indirect offset above.
+ */
+ bld.emit(SHADER_OPCODE_MOV_INDIRECT,
+ offset(tmp, bld, j), offset(image, bld, j),
+ indirect, brw_imm_ud((indirect_max + 1) * 4));
+ }
+ return tmp;
+ }
+}
+
+/**
+ * Emit one copy of \p inst for every component enabled in \p wr_mask.
+ *
+ * Bits 0..3 of the mask are examined.  For each set bit the instruction is
+ * cloned, its destination and every VGRF source are advanced to that
+ * component, and the clone is emitted through \p bld.
+ */
+void
+fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
+                         unsigned wr_mask)
+{
+   for (unsigned comp = 0; comp < 4; comp++) {
+      const bool enabled = (wr_mask >> comp) & 1;
+
+      if (enabled) {
+         fs_inst *copy = new(mem_ctx) fs_inst(inst);
+         copy->dst = offset(copy->dst, bld, comp);
+
+         /* Only VGRF sources are per-component; leave the rest untouched. */
+         for (unsigned s = 0; s < copy->sources; s++) {
+            if (copy->src[s].file != VGRF)
+               continue;
+
+            copy->src[s] = offset(copy->src[s], bld, comp);
+         }
+
+         bld.emit(copy);
+      }
+   }
+}
+
+/**
+ * Map the sampled type of a GLSL image type to the register datatype used
+ * for its channels by the image intrinsics.
+ *
+ * Only uint, int and float sampled types are supported.
+ */
+static brw_reg_type
+get_image_base_type(const glsl_type *type)
+{
+   const glsl_base_type sampled = (glsl_base_type)type->sampled_type;
+
+   if (sampled == GLSL_TYPE_UINT)
+      return BRW_REGISTER_TYPE_UD;
+
+   if (sampled == GLSL_TYPE_INT)
+      return BRW_REGISTER_TYPE_D;
+
+   if (sampled == GLSL_TYPE_FLOAT)
+      return BRW_REGISTER_TYPE_F;
+
+   unreachable("Not reached.");
+}
+
+/**
+ * Translate an image atomic intrinsic into the matching BRW_AOP_* atomic
+ * operation.
+ *
+ * The image type is consulted only for min and max, where it selects the
+ * signed or the unsigned variant of the operation.
+ */
+static unsigned
+get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type)
+{
+   switch (op) {
+   case nir_intrinsic_image_atomic_add:
+      return BRW_AOP_ADD;
+
+   case nir_intrinsic_image_atomic_min:
+      if (get_image_base_type(type) == BRW_REGISTER_TYPE_D)
+         return BRW_AOP_IMIN;
+      return BRW_AOP_UMIN;
+
+   case nir_intrinsic_image_atomic_max:
+      if (get_image_base_type(type) == BRW_REGISTER_TYPE_D)
+         return BRW_AOP_IMAX;
+      return BRW_AOP_UMAX;
+
+   case nir_intrinsic_image_atomic_and:
+      return BRW_AOP_AND;
+
+   case nir_intrinsic_image_atomic_or:
+      return BRW_AOP_OR;
+
+   case nir_intrinsic_image_atomic_xor:
+      return BRW_AOP_XOR;
+
+   case nir_intrinsic_image_atomic_exchange:
+      return BRW_AOP_MOV;
+
+   case nir_intrinsic_image_atomic_comp_swap:
+      return BRW_AOP_CMPWR;
+
+   default:
+      unreachable("Not reachable.");
+   }
+}
+
+/**
+ * Emit a pixel interpolator message and return the new instruction.
+ *
+ * When \p src is BAD_FILE a one-register dummy payload is allocated;
+ * otherwise \p src is the payload and the message length scales with the
+ * dispatch width.  The shader's WM program data is flagged as pulling
+ * barycentrics.
+ */
+static fs_inst *
+emit_pixel_interpolater_send(const fs_builder &bld,
+                             enum opcode opcode,
+                             const fs_reg &dst,
+                             const fs_reg &src,
+                             const fs_reg &desc,
+                             glsl_interp_mode interpolation)
+{
+   /* Start from the "real payload" case and override it below if the caller
+    * did not supply one.
+    */
+   fs_reg payload = src;
+   int mlen = 2 * bld.dispatch_width() / 8;
+
+   if (src.file == BAD_FILE) {
+      /* Dummy payload */
+      payload = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+      mlen = 1;
+   }
+
+   fs_inst *send = bld.emit(opcode, dst, payload, desc);
+   send->mlen = mlen;
+   /* 2 floats per slot returned */
+   send->size_written = 2 * dst.component_size(send->exec_size);
+   send->pi_noperspective = (interpolation == INTERP_MODE_NOPERSPECTIVE);
+
+   struct brw_wm_prog_data *wm_data =
+      brw_wm_prog_data(bld.shader->stage_prog_data);
+   wm_data->pulls_bary = true;
+
+   return send;
+}
+
+/**
+ * Computes 1 << x, given a D/UD register containing some value x.
+ *
+ * The constant one is first moved into a temporary of the same type as
+ * \p x and then shifted left by \p x into the returned register.
+ */
+static fs_reg
+intexp2(const fs_builder &bld, const fs_reg &x)
+{
+   const brw_reg_type type = x.type;
+   assert(type == BRW_REGISTER_TYPE_UD || type == BRW_REGISTER_TYPE_D);
+
+   fs_reg shifted = bld.vgrf(type, 1);
+   fs_reg unit = bld.vgrf(type, 1);
+
+   bld.MOV(unit, retype(brw_imm_d(1), unit.type));
+   bld.SHL(shifted, unit, x);
+
+   return shifted;
+}
+
+/**
+ * Handle EndPrimitive() in a geometry shader by setting the cut bit of the
+ * most recently emitted vertex in the control_data_bits accumulator.
+ *
+ * Nothing is emitted when the shader has no control data header or when the
+ * control data does not consist of cut bits.
+ */
+void
+fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   const struct brw_gs_prog_data *gs_data = brw_gs_prog_data(prog_data);
+
+   /* Without a control data header there is nothing to record.  Beyond
+    * that, we can only do EndPrimitive() functionality when the control data
+    * consists of cut bits.  Fortunately, the only time it isn't is when the
+    * output type is points, in which case EndPrimitive() is a no-op.
+    */
+   if (gs_compile->control_data_header_size_bits == 0 ||
+       gs_data->control_data_format != GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT)
+      return;
+
+   /* Cut bits use one bit per vertex. */
+   assert(gs_compile->control_data_bits_per_vertex == 1);
+
+   const fs_reg count =
+      retype(get_nir_src(vertex_count_nir_src), BRW_REGISTER_TYPE_UD);
+
+   /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
+    * vertex n, 0 otherwise.  So all we need to do here is mark bit
+    * (vertex_count - 1) % 32 in the cut_bits register to indicate that
+    * EndPrimitive() was called after emitting vertex (vertex_count - 1);
+    * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
+    *
+    * Note that if EndPrimitive() is called before emitting any vertices, this
+    * will cause us to set bit 31 of the control_data_bits register to 1.
+    * That's fine because:
+    *
+    * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
+    *   output, so the hardware will ignore cut bit 31.
+    *
+    * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
+    *   last vertex, so setting cut bit 31 has no effect (since the primitive
+    *   is automatically ended when the GS terminates).
+    *
+    * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
+    *   control_data_bits register to 0 when the first vertex is emitted.
+    */
+
+   const fs_builder abld = bld.annotate("end primitive");
+
+   /* control_data_bits |= 1 << ((vertex_count - 1) % 32)
+    *
+    * The decrement is expressed as adding 0xffffffff.
+    */
+   fs_reg last_vertex = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   abld.ADD(last_vertex, count, brw_imm_ud(0xffffffffu));
+
+   /* Note: we're relying on the fact that the GEN SHL instruction only pays
+    * attention to the lower 5 bits of its second source argument, so on this
+    * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
+    * ((vertex_count - 1) % 32).
+    */
+   const fs_reg cut_bit = intexp2(abld, last_vertex);
+
+   abld.OR(this->control_data_bits, this->control_data_bits, cut_bit);
+}
+
+/**
+ * Write the accumulated geometry shader control data bits to the URB.
+ *
+ * Depending on the size of the control data header, the write is emitted as
+ * a plain, a channel-masked, or a channel-masked per-slot-offset SIMD8 URB
+ * write.  \p vertex_count is the number of vertices emitted so far and
+ * selects which DWord of the header is written.
+ */
+void
+fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
+{
+ assert(stage == MESA_SHADER_GEOMETRY);
+ assert(gs_compile->control_data_bits_per_vertex != 0);
+
+ struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
+
+ const fs_builder abld = bld.annotate("emit control data bits");
+ const fs_builder fwa_bld = bld.exec_all();
+
+ /* We use a single UD register to accumulate control data bits (32 bits
+ * for each of the SIMD8 channels). So we need to write a DWord (32 bits)
+ * at a time.
+ *
+ * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
+ * We have to select a 128-bit group via the Global and Per-Slot Offsets,
+ * then use the Channel Mask phase to enable/disable which DWord within that
+ * group to write. (Remember, different SIMD8 channels may have emitted
+ * different numbers of vertices, so we may need per-slot offsets.)
+ *
+ * Channel masking presents an annoying problem: we may have to replicate
+ * the data up to 4 times:
+ *
+ * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
+ *
+ * To avoid penalizing shaders that emit a small number of vertices, we
+ * can avoid these sometimes: if the size of the control data header is
+ * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will
+ * land in the same 128-bit group, so we can skip per-slot offsets.
+ *
+ * Similarly, if the control data header is <= 32 bits, there is only one
+ * DWord, so we can skip channel masks.
+ */
+ enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
+
+ /* Both stay BAD_FILE unless the header size requires them; the payload
+ * assembly below keys off that.
+ */
+ fs_reg channel_mask, per_slot_offset;
+
+ if (gs_compile->control_data_header_size_bits > 32) {
+ opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
+ channel_mask = vgrf(glsl_type::uint_type);
+ }
+
+ if (gs_compile->control_data_header_size_bits > 128) {
+ opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
+ per_slot_offset = vgrf(glsl_type::uint_type);
+ }
+
+ /* Figure out which DWord we're trying to write to using the formula:
+ *
+ * dword_index = (vertex_count - 1) * bits_per_vertex / 32
+ *
+ * Since bits_per_vertex is a power of two, and is known at compile
+ * time, this can be optimized to:
+ *
+ * dword_index = (vertex_count - 1) >> (5 - log2(bits_per_vertex))
+ *
+ * NOTE(review): the code below subtracts the util_last_bit() result from 6
+ * rather than 5. That matches the formula only if util_last_bit() returns
+ * the 1-based position of the highest set bit (log2 + 1 for a power of
+ * two), which makes the local name log2_bits_per_vertex off by one --
+ * confirm against the util_last_bit() definition.
+ */
+ if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
+ fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ /* prev_count = vertex_count - 1, expressed as adding 0xffffffff. */
+ abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
+ unsigned log2_bits_per_vertex =
+ util_last_bit(gs_compile->control_data_bits_per_vertex);
+ abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
+
+ if (per_slot_offset.file != BAD_FILE) {
+ /* Set the per-slot offset to dword_index / 4, so that we'll write to
+ * the appropriate OWord within the control data header.
+ */
+ abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
+ }
+
+ /* Set the channel masks to 1 << (dword_index % 4), so that we'll
+ * write to the appropriate DWORD within the OWORD.
+ */
+ fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
+ channel_mask = intexp2(fwa_bld, channel);
+ /* Then the channel masks need to be in bits 23:16. */
+ fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
+ }
+
+ /* Store the control data bits in the message payload and send it. */
+ /* Base message: one header register plus one register of data. */
+ int mlen = 2;
+ if (channel_mask.file != BAD_FILE)
+ mlen += 4; /* channel masks, plus 3 extra copies of the data */
+ if (per_slot_offset.file != BAD_FILE)
+ mlen++;
+
+ fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
+ fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
+ int i = 0;
+ /* First payload register is taken from fixed GRF 1 (the "Handles" entry in
+ * the message layout above, presumably the URB handles -- confirm).
+ */
+ sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+ if (per_slot_offset.file != BAD_FILE)
+ sources[i++] = per_slot_offset;
+ if (channel_mask.file != BAD_FILE)
+ sources[i++] = channel_mask;
+ /* Fill every remaining slot with a copy of the control data bits. */
+ while (i < mlen) {
+ sources[i++] = this->control_data_bits;
+ }
+
+ abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
+ fs_inst *inst = abld.emit(opcode, reg_undef, payload);
+ inst->mlen = mlen;
+ /* We need to increment Global Offset by 256-bits to make room for
+ * Broadwell's extra "Vertex Count" payload at the beginning of the
+ * URB entry. Since this is an OWord message, Global Offset is counted
+ * in 128-bit units, so we must set it to 2.
+ */
+ if (gs_prog_data->static_vertex_count == -1)
+ inst->offset = 2;
+}
+
+/**
+ * Record the stream a vertex is emitted to in the control data bits.
+ *
+ * Emits code equivalent to
+ *
+ *    control_data_bits |= stream_id << ((2 * vertex_count) % 32)
+ *
+ * where \p vertex_count is the count *before* the vertex being emitted is
+ * added to it.  Nothing is emitted for stream 0.
+ *
+ * \param vertex_count  number of vertices emitted so far (pre-increment)
+ * \param stream_id     vertex stream, must be below MAX_VERTEX_STREAMS
+ */
+void
+fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
+                                            unsigned stream_id)
+{
+   /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
+
+   /* Note: we are calling this *before* increasing vertex_count, so
+    * this->vertex_count == vertex_count - 1 in the formula above.
+    */
+
+   /* Stream mode uses 2 bits per vertex */
+   assert(gs_compile->control_data_bits_per_vertex == 2);
+
+   /* Must be a valid stream.  stream_id is unsigned, so only the upper
+    * bound needs checking.
+    */
+   assert(stream_id < MAX_VERTEX_STREAMS);
+
+   /* Control data bits are initialized to 0 so we don't have to set any
+    * bits when sending vertices to stream 0.
+    */
+   if (stream_id == 0)
+      return;
+
+   const fs_builder abld = bld.annotate("set stream control data bits", NULL);
+
+   /* reg::sid = stream_id */
+   fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   abld.MOV(sid, brw_imm_ud(stream_id));
+
+   /* reg:shift_count = 2 * (vertex_count - 1), i.e. twice the pre-increment
+    * count that was passed in.
+    */
+   fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
+
+   /* Note: we're relying on the fact that the GEN SHL instruction only pays
+    * attention to the lower 5 bits of its second source argument, so on this
+    * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
+    * stream_id << ((2 * (vertex_count - 1)) % 32).
+    */
+   fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   abld.SHL(mask, sid, shift_count);
+   abld.OR(this->control_data_bits, this->control_data_bits, mask);
+}
+
+/**
+ * Emit the code for a geometry shader vertex emission.
+ *
+ * Flushes a finished batch of control data bits when the control data
+ * header is larger than 32 bits, writes the vertex to the URB, and, in
+ * stream mode, records the stream ID in the control data bits.
+ *
+ * \param vertex_count_nir_src  NIR source holding the vertex count
+ * \param stream_id             vertex stream the vertex is sent to
+ */
+void
+fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
+                           unsigned stream_id)
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
+
+   fs_reg vertex_count =
+      retype(get_nir_src(vertex_count_nir_src), BRW_REGISTER_TYPE_UD);
+
+   /* Haswell and later hardware ignores the "Render Stream Select" bits
+    * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
+    * and instead sends all primitives down the pipeline for rasterization.
+    * If the SOL stage is enabled, "Render Stream Select" is honored and
+    * primitives bound to non-zero streams are discarded after stream output.
+    *
+    * The only purpose of primitives sent to non-zero streams is to be
+    * recorded by transform feedback, so when transform feedback is
+    * disabled all geometry bound to those streams can simply be dropped.
+    */
+   const bool has_xfb_varyings = nir->info->has_transform_feedback_varyings;
+   if (stream_id != 0 && !has_xfb_varyings)
+      return;
+
+   /* With 32 control data bits or fewer we can wait until the end of the
+    * shader and output them all at once.  Otherwise they have to be output
+    * as we go.  Now is the time to do it: we're about to output the
+    * vertex_count'th vertex, so the control data bits associated with the
+    * (vertex_count - 1)th vertex are guaranteed to be correct.
+    */
+   if (gs_compile->control_data_header_size_bits > 32) {
+      const fs_builder abld =
+         bld.annotate("emit vertex: emit control data bits");
+
+      /* A batch of 32 bits has finished accumulating when
+       *
+       *     (vertex_count * bits_per_vertex) % 32 == 0
+       *
+       * i.e. when the low 5 bits of vertex_count * bits_per_vertex are 0.
+       * bits_per_vertex is always 1 or 2, hence 2^n for some integer n, so
+       * this is the same as requiring the low 5-n bits of vertex_count to
+       * be 0:
+       *
+       *     vertex_count & (2^(5-n) - 1) == 0
+       *
+       * and since 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex:
+       *
+       *     vertex_count & (32 / bits_per_vertex - 1) == 0
+       *
+       * TODO: If vertex_count is an immediate, we could do some of this math
+       *       at compile time...
+       */
+      const unsigned batch_mask =
+         32u / gs_compile->control_data_bits_per_vertex - 1u;
+      fs_inst *batch_test =
+         abld.AND(bld.null_reg_d(), vertex_count, brw_imm_ud(batch_mask));
+      batch_test->conditional_mod = BRW_CONDITIONAL_Z;
+
+      abld.IF(BRW_PREDICATE_NORMAL);
+      /* If vertex_count is 0, then no control data bits have been
+       * accumulated yet, so we can skip emitting them.
+       */
+      abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
+               BRW_CONDITIONAL_NEQ);
+      abld.IF(BRW_PREDICATE_NORMAL);
+      emit_gs_control_data_bits(vertex_count);
+      abld.emit(BRW_OPCODE_ENDIF);
+
+      /* Reset control_data_bits to 0 so we can start accumulating a new
+       * batch.
+       *
+       * Note: in the case where vertex_count == 0, this neutralizes the
+       * effect of any call to EndPrimitive() that the shader may have
+       * made before outputting its first vertex.
+       */
+      fs_inst *reset = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
+      reset->force_writemask_all = true;
+      abld.emit(BRW_OPCODE_ENDIF);
+   }
+
+   emit_urb_writes(vertex_count);
+
+   /* In stream mode we have to set control data bits for all vertices
+    * unless we have disabled control data bits completely (which we do
+    * for GL_POINTS outputs that don't use streams).
+    */
+   const bool stream_mode =
+      gs_compile->control_data_header_size_bits > 0 &&
+      gs_prog_data->control_data_format ==
+         GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
+   if (stream_mode)
+      set_gs_stream_control_data_bits(vertex_count, stream_id);
+}
+
+/**
+ * Emit code that loads a per-vertex geometry shader input into \p dst.
+ *
+ * Inputs addressed with constant vertex and offset indices that fall within
+ * the pushed URB data are copied straight from ATTR registers (single
+ * invocation only).  Everything else is fetched with URB read messages
+ * through the vertex's URB handle.
+ *
+ * \param dst              destination register
+ * \param vertex_src       NIR source selecting the input vertex
+ * \param base_offset      constant part of the input offset
+ * \param offset_src       NIR source with the variable part of the offset
+ * \param num_components   number of components to load
+ * \param first_component  first component to load
+ */
+void
+fs_visitor::emit_gs_input_load(const fs_reg &dst,
+                               const nir_src &vertex_src,
+                               unsigned base_offset,
+                               const nir_src &offset_src,
+                               unsigned num_components,
+                               unsigned first_component)
+{
+   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
+
+   nir_const_value *const_vertex = nir_src_as_const_value(vertex_src);
+   nir_const_value *const_offset = nir_src_as_const_value(offset_src);
+   const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
+
+   /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y],
+    * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].  Only
+    * gl_PointSize is available as a GS input, however, so it must be that.
+    */
+   const bool is_point_size = (base_offset == 0);
+
+   /* Pushed inputs are only used when there is a single invocation.
+    * TODO: figure out the push input layout for the other case.
+    */
+   if (gs_prog_data->invocations == 1 &&
+       const_offset != NULL && const_vertex != NULL &&
+       4 * (base_offset + const_offset->u32[0]) < push_reg_count) {
+      /* This input was pushed into registers. */
+      const int attr_base = (base_offset + const_offset->u32[0]) * 4 +
+                            const_vertex->u32[0] * push_reg_count;
+      if (is_point_size) {
+         /* gl_PointSize comes in .w */
+         bld.MOV(dst, fs_reg(ATTR, attr_base + 3, dst.type));
+         return;
+      }
+
+      for (unsigned c = 0; c < num_components; c++) {
+         bld.MOV(offset(dst, bld, c),
+                 fs_reg(ATTR, attr_base + c + first_component, dst.type));
+      }
+      return;
+   }
+
+   /* Resort to the pull model.  Ensure the VUE handles are provided. */
+   gs_prog_data->base.include_vue_handles = true;
+
+   const unsigned first_icp_handle =
+      gs_prog_data->include_primitive_id ? 3 : 2;
+   fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+   if (gs_prog_data->invocations == 1) {
+      if (const_vertex) {
+         /* The vertex index is constant; just select the proper URB handle. */
+         icp_handle =
+            retype(brw_vec8_grf(first_icp_handle + const_vertex->i32[0], 0),
+                   BRW_REGISTER_TYPE_UD);
+      } else {
+         /* The vertex index is non-constant, so the proper URB handle has
+          * to be fetched with indirect addressing.
+          *
+          * Start from the sequence <7, 6, 5, 4, 3, 2, 1, 0>, meaning that
+          * channel <n> should read the handle from DWord <n>, and turn it
+          * into bytes by multiplying by 4.
+          *
+          * Then convert the vertex index to bytes by multiplying by 32
+          * (shifting by 5) and add the two together.  The sum is the final
+          * indirect byte offset.
+          */
+         fs_reg lane_ids = bld.vgrf(BRW_REGISTER_TYPE_W, 1);
+         fs_reg lane_byte_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         fs_reg vertex_byte_offset = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         fs_reg handle_byte_offset = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+         /* lane_ids = <7, 6, 5, 4, 3, 2, 1, 0> */
+         bld.MOV(lane_ids, fs_reg(brw_imm_v(0x76543210)));
+         /* lane_byte_offsets = 4 * lane_ids = <28, 24, 20, 16, 12, 8, 4, 0> */
+         bld.SHL(lane_byte_offsets, lane_ids, brw_imm_ud(2u));
+         /* vertex_byte_offset = 32 * vertex index */
+         bld.SHL(vertex_byte_offset,
+                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(5u));
+         bld.ADD(handle_byte_offset, vertex_byte_offset, lane_byte_offsets);
+
+         /* Use first_icp_handle as the base offset.  There is one register
+          * of URB handles per vertex, so inform the register allocator that
+          * we might read up to nir->info->gs.vertices_in registers.
+          */
+         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+                  retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
+                  fs_reg(handle_byte_offset),
+                  brw_imm_ud(nir->info->gs.vertices_in * REG_SIZE));
+      }
+   } else {
+      assert(gs_prog_data->invocations > 1);
+
+      if (const_vertex) {
+         assert(devinfo->gen >= 9 || const_vertex->i32[0] <= 5);
+         bld.MOV(icp_handle,
+                 retype(brw_vec1_grf(first_icp_handle +
+                                     const_vertex->i32[0] / 8,
+                                     const_vertex->i32[0] % 8),
+                        BRW_REGISTER_TYPE_UD));
+      } else {
+         /* The vertex index is non-constant, so the proper URB handle has
+          * to be fetched with indirect addressing.
+          */
+         fs_reg handle_byte_offset = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+         /* handle_byte_offset = 4 * vertex index */
+         bld.SHL(handle_byte_offset,
+                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(2u));
+
+         /* Use first_icp_handle as the base offset.  There is one DWord
+          * of URB handles per vertex, so inform the register allocator that
+          * we might read up to ceil(nir->info->gs.vertices_in / 8) registers.
+          */
+         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+                  retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
+                  fs_reg(handle_byte_offset),
+                  brw_imm_ud(DIV_ROUND_UP(nir->info->gs.vertices_in, 8) *
+                             REG_SIZE));
+      }
+   }
+
+   /* Last URB read emitted; patched up below for gl_PointSize. */
+   fs_inst *read;
+
+   fs_reg load_dst = dst;
+   fs_reg slot_offset = get_nir_src(offset_src);
+   unsigned num_iterations = 1;
+   const unsigned total_components = num_components;
+
+   if (type_sz(dst.type) == 8) {
+      /* 64-bit data is read two components per message into a temporary
+       * and shuffled into place afterwards.
+       */
+      if (num_components > 2) {
+         num_iterations = 2;
+         num_components = 2;
+      }
+      load_dst = fs_reg(VGRF, alloc.allocate(4), dst.type);
+      first_component = first_component / 2;
+   }
+
+   for (unsigned iter = 0; iter < num_iterations; iter++) {
+      if (const_offset) {
+         /* Constant indexing - use global offset. */
+         if (first_component != 0) {
+            /* Read from component 0 and copy out the wanted components. */
+            const unsigned read_components = num_components + first_component;
+            fs_reg scratch = bld.vgrf(dst.type, read_components);
+            read = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, scratch, icp_handle);
+            read->size_written = read_components *
+                                 scratch.component_size(read->exec_size);
+            for (unsigned c = 0; c < num_components; c++) {
+               bld.MOV(offset(load_dst, bld, c),
+                       offset(scratch, bld, c + first_component));
+            }
+         } else {
+            read = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, load_dst,
+                            icp_handle);
+            read->size_written = num_components *
+                                 load_dst.component_size(read->exec_size);
+         }
+         read->offset = base_offset + const_offset->u32[0];
+         read->mlen = 1;
+      } else {
+         /* Indirect indexing - use per-slot offsets as well. */
+         const fs_reg payload_srcs[] = { icp_handle, slot_offset };
+         const unsigned read_components = num_components + first_component;
+         fs_reg scratch = bld.vgrf(dst.type, read_components);
+         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         bld.LOAD_PAYLOAD(payload, payload_srcs, ARRAY_SIZE(payload_srcs), 0);
+         if (first_component != 0) {
+            read = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, scratch,
+                            payload);
+            read->size_written = read_components *
+                                 scratch.component_size(read->exec_size);
+            for (unsigned c = 0; c < num_components; c++) {
+               bld.MOV(offset(load_dst, bld, c),
+                       offset(scratch, bld, c + first_component));
+            }
+         } else {
+            read = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, load_dst,
+                            payload);
+            read->size_written = num_components *
+                                 load_dst.component_size(read->exec_size);
+         }
+         read->offset = base_offset;
+         read->mlen = 2;
+      }
+
+      if (type_sz(dst.type) == 8) {
+         shuffle_32bit_load_result_to_64bit_data(
+            bld, load_dst, retype(load_dst, BRW_REGISTER_TYPE_F),
+            num_components);
+
+         for (unsigned c = 0; c < num_components; c++)
+            bld.MOV(offset(dst, bld, iter * 2 + c), offset(load_dst, bld, c));
+      }
+
+      if (num_iterations > 1) {
+         /* Set up the second message: remaining components, next slot. */
+         num_components = total_components - 2;
+         if (const_offset) {
+            base_offset++;
+         } else {
+            fs_reg next_slot_offset = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+            bld.ADD(next_slot_offset, slot_offset, brw_imm_ud(1u));
+            slot_offset = next_slot_offset;
+         }
+      }
+   }
+
+   if (is_point_size) {
+      /* Read the whole VUE header (because of alignment) and read .w. */
+      fs_reg header = bld.vgrf(dst.type, 4);
+      read->dst = header;
+      read->size_written = 4 * REG_SIZE;
+      bld.MOV(dst, offset(header, bld, 3));
+   }
+}
+
+/**
+ * Return the register holding the indirect I/O offset of \p instr, or a
+ * default-constructed fs_reg when the offset source is a constant.
+ */
+fs_reg
+fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
+{
+   nir_src *io_offset = nir_get_io_offset_src(instr);
+   nir_const_value *imm = nir_src_as_const_value(*io_offset);
+
+   if (!imm)
+      return get_nir_src(*io_offset);
+
+   /* The only constant offset we should find is 0.  brw_nir.c's
+    * add_const_offset_to_base() will fold other constant offsets
+    * into instr->const_index[0].
+    */
+   assert(imm->u32[0] == 0);
+   return fs_reg();
+}
+
+/**
+ * Emit untyped surface reads that load a vector of 32-bit or 64-bit
+ * components into \p dest.
+ *
+ * \param bld             builder used to emit the instructions
+ * \param dest            destination register; its type selects the 32-bit
+ *                        or 64-bit path
+ * \param surf_index      surface to read from
+ * \param offset_reg      offset of the first component
+ * \param num_components  number of components of dest's type to load
+ */
+static void
+do_untyped_vector_read(const fs_builder &bld,
+                       const fs_reg dest,
+                       const fs_reg surf_index,
+                       const fs_reg offset_reg,
+                       unsigned num_components)
+{
+   if (type_sz(dest.type) == 4) {
+      fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
+                                             1 /* dims */,
+                                             num_components,
+                                             BRW_PREDICATE_NONE);
+      read_result.type = dest.type;
+      for (unsigned i = 0; i < num_components; i++)
+         bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
+   } else if (type_sz(dest.type) == 8) {
+      /* Reading a dvec, so we need to:
+       *
+       * 1. Read twice as many 32-bit components as the number of 64-bit
+       *    components requested.
+       * 2. Shuffle the result of the load to form valid 64-bit elements
+       * 3. Emit a second load (for components z/w) if needed.
+       */
+      fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      bld.MOV(read_offset, offset_reg);
+
+      const unsigned iters = num_components <= 2 ? 1 : 2;
+
+      /* Load the dvec, the first iteration loads components x/y, the second
+       * iteration, if needed, loads components z/w
+       */
+      for (unsigned it = 0; it < iters; it++) {
+         /* Compute number of components to read in this iteration */
+         const unsigned iter_components = MIN2(2u, num_components);
+         num_components -= iter_components;
+
+         /* Read.  Since this message reads 32-bit components, we need to
+          * read twice as many components.
+          */
+         const fs_reg read_result =
+            emit_untyped_read(bld, surf_index, read_offset,
+                              1 /* dims */,
+                              iter_components * 2,
+                              BRW_PREDICATE_NONE);
+
+         /* Shuffle the 32-bit load result into valid 64-bit data */
+         const fs_reg packed_result = bld.vgrf(dest.type, iter_components);
+         shuffle_32bit_load_result_to_64bit_data(
+            bld, packed_result, read_result, iter_components);
+
+         /* Move each component to its destination */
+         for (unsigned c = 0; c < iter_components; c++) {
+            bld.MOV(offset(dest, bld, it * 2 + c),
+                    offset(packed_result, bld, c));
+         }
+
+         /* Advance by the 16 bytes (two 64-bit components) just read. */
+         bld.ADD(read_offset, read_offset, brw_imm_ud(16));
+      }
+   } else {
+      unreachable("Unsupported type");
+   }
+}
+
+/**
+ * Emit code for a NIR intrinsic in a vertex shader.
+ *
+ * Handles the vertex-stage system values and load_input here; every other
+ * intrinsic is forwarded to nir_emit_intrinsic().
+ *
+ * \param bld    builder used to emit the instructions
+ * \param instr  intrinsic instruction to translate
+ */
+void
+fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
+                                  nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_VERTEX);
+
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_vertex_id:
+      unreachable("should be lowered by lower_vertex_id()");
+
+   case nir_intrinsic_load_vertex_id_zero_base:
+   case nir_intrinsic_load_base_vertex:
+   case nir_intrinsic_load_instance_id:
+   case nir_intrinsic_load_base_instance:
+   case nir_intrinsic_load_draw_id: {
+      /* Copy the system value from the register set up for it. */
+      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
+      fs_reg val = nir_system_values[sv];
+      assert(val.file != BAD_FILE);
+      dest.type = val.type;
+      bld.MOV(dest, val);
+      break;
+   }
+
+   case nir_intrinsic_load_input: {
+      fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);
+      const unsigned first_component = nir_intrinsic_component(instr);
+      const unsigned num_components = instr->num_components;
+
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+      assert(const_offset && "Indirect input loads not allowed");
+      src = offset(src, bld, const_offset->u32[0]);
+
+      for (unsigned j = 0; j < num_components; j++) {
+         bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component));
+      }
+
+      /* src and dest both keep dest's register type throughout, so no
+       * retyping is needed before deciding whether the loaded data has to
+       * be shuffled into 64-bit elements.
+       */
+      if (type_sz(src.type) == 8) {
+         shuffle_32bit_load_result_to_64bit_data(bld,
+                                                 dest,
+                                                 retype(dest, BRW_REGISTER_TYPE_F),
+                                                 instr->num_components);
+      }
+      break;
+   }
+
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+/**
+ * Emit code for a NIR intrinsic in a tessellation control shader.
+ *
+ * Handles the TCS system values, barriers, per-vertex input loads and
+ * output loads/stores here; every other intrinsic is forwarded to
+ * nir_emit_intrinsic().
+ *
+ * \param bld    builder used to emit the instructions
+ * \param instr  intrinsic instruction to translate
+ */
+void
+fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
+                                   nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_TESS_CTRL);
+   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
+   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+
+   fs_reg dst;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dst = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_primitive_id:
+      bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
+      break;
+   case nir_intrinsic_load_invocation_id:
+      bld.MOV(retype(dst, invocation_id.type), invocation_id);
+      break;
+   case nir_intrinsic_load_patch_vertices_in:
+      bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
+              brw_imm_d(tcs_key->input_vertices));
+      break;
+
+   case nir_intrinsic_barrier: {
+      /* Nothing to synchronize with when there is a single instance. */
+      if (tcs_prog_data->instances == 1)
+         break;
+
+      fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      fs_reg m0_2 = component(m0, 2);
+
+      const fs_builder chanbld = bld.exec_all().group(1, 0);
+
+      /* Zero the message header */
+      bld.exec_all().MOV(m0, brw_imm_ud(0u));
+
+      /* Copy "Barrier ID" from r0.2, bits 16:13 */
+      chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
+                  brw_imm_ud(INTEL_MASK(16, 13)));
+
+      /* Shift it up to bits 27:24. */
+      chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
+
+      /* Set the Barrier Count and the enable bit */
+      chanbld.OR(m0_2, m0_2,
+                 brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
+
+      bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
+      break;
+   }
+
+   case nir_intrinsic_load_input:
+      unreachable("nir_lower_io should never give us these.");
+      break;
+
+   case nir_intrinsic_load_per_vertex_input: {
+      fs_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      const nir_src &vertex_src = instr->src[0];
+      nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
+
+      fs_inst *inst;
+
+      fs_reg icp_handle;
+
+      if (vertex_const) {
+         /* Emit a MOV to resolve <0,1,0> regioning. */
+         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         bld.MOV(icp_handle,
+                 retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3),
+                                     vertex_const->i32[0] & 7),
+                        BRW_REGISTER_TYPE_UD));
+      } else if (tcs_prog_data->instances == 1 &&
+                 vertex_src.is_ssa &&
+                 vertex_src.ssa->parent_instr->type ==
+                    nir_instr_type_intrinsic &&
+                 nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic ==
+                    nir_intrinsic_load_invocation_id) {
+         /* For the common case of only 1 instance, an array index of
+          * gl_InvocationID means reading g1.  Skip all the indirect work.
+          */
+         icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
+      } else {
+         /* The vertex index is non-constant.  We need to use indirect
+          * addressing to fetch the proper URB handle.
+          */
+         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+         /* Each ICP handle is a single DWord (4 bytes) */
+         fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         bld.SHL(vertex_offset_bytes,
+                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(2u));
+
+         /* Start at g1.  We might read up to 4 registers. */
+         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+                  retype(brw_vec8_grf(1, 0), icp_handle.type),
+                  vertex_offset_bytes,
+                  brw_imm_ud(4 * REG_SIZE));
+      }
+
+      /* We can only read two double components with each URB read, so
+       * we send two read messages in that case, each one loading up to
+       * two double components.
+       */
+      unsigned num_iterations = 1;
+      unsigned num_components = instr->num_components;
+      unsigned first_component = nir_intrinsic_component(instr);
+      fs_reg orig_dst = dst;
+      if (type_sz(dst.type) == 8) {
+         first_component = first_component / 2;
+         if (instr->num_components > 2) {
+            num_iterations = 2;
+            num_components = 2;
+         }
+
+         fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
+         dst = tmp;
+      }
+
+      for (unsigned iter = 0; iter < num_iterations; iter++) {
+         if (indirect_offset.file == BAD_FILE) {
+            /* Constant indexing - use global offset. */
+            if (first_component != 0) {
+               unsigned read_components = num_components + first_component;
+               fs_reg tmp = bld.vgrf(dst.type, read_components);
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
+               for (unsigned i = 0; i < num_components; i++) {
+                  bld.MOV(offset(dst, bld, i),
+                          offset(tmp, bld, i + first_component));
+               }
+            } else {
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
+            }
+            inst->offset = imm_offset;
+            inst->mlen = 1;
+         } else {
+            /* Indirect indexing - use per-slot offsets as well. */
+            const fs_reg srcs[] = { icp_handle, indirect_offset };
+            fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+            bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+            if (first_component != 0) {
+               unsigned read_components = num_components + first_component;
+               fs_reg tmp = bld.vgrf(dst.type, read_components);
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
+                               payload);
+               for (unsigned i = 0; i < num_components; i++) {
+                  bld.MOV(offset(dst, bld, i),
+                          offset(tmp, bld, i + first_component));
+               }
+            } else {
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
+                               payload);
+            }
+            inst->offset = imm_offset;
+            inst->mlen = 2;
+         }
+         inst->size_written = (num_components + first_component) *
+                              inst->dst.component_size(inst->exec_size);
+
+         /* If we are reading 64-bit data using 32-bit read messages we need
+          * build proper 64-bit data elements by shuffling the low and high
+          * 32-bit components around like we do for other things like UBOs
+          * or SSBOs.
+          */
+         if (type_sz(dst.type) == 8) {
+            shuffle_32bit_load_result_to_64bit_data(
+               bld, dst, retype(dst, BRW_REGISTER_TYPE_F), num_components);
+
+            for (unsigned c = 0; c < num_components; c++) {
+               bld.MOV(offset(orig_dst, bld, iter * 2 + c),
+                       offset(dst, bld, c));
+            }
+         }
+
+         /* Copy the temporary to the destination to deal with writemasking.
+          *
+          * Also attempt to deal with gl_PointSize being in the .w component.
+          */
+         if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
+            assert(type_sz(dst.type) < 8);
+            inst->dst = bld.vgrf(dst.type, 4);
+            inst->size_written = 4 * REG_SIZE;
+            bld.MOV(dst, offset(inst->dst, bld, 3));
+         }
+
+         /* If we are loading double data and we need a second read message
+          * adjust the write offset
+          */
+         if (num_iterations > 1) {
+            num_components = instr->num_components - 2;
+            imm_offset++;
+         }
+      }
+      break;
+   }
+
+   case nir_intrinsic_load_output:
+   case nir_intrinsic_load_per_vertex_output: {
+      fs_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+      unsigned first_component = nir_intrinsic_component(instr);
+
+      fs_inst *inst;
+      if (indirect_offset.file == BAD_FILE) {
+         /* Replicate the patch handle to all enabled channels */
+         fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         bld.MOV(patch_handle,
+                 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+         {
+            if (first_component != 0) {
+               unsigned read_components =
+                  instr->num_components + first_component;
+               fs_reg tmp = bld.vgrf(dst.type, read_components);
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
+                               patch_handle);
+               inst->size_written = read_components * REG_SIZE;
+               for (unsigned i = 0; i < instr->num_components; i++) {
+                  bld.MOV(offset(dst, bld, i),
+                          offset(tmp, bld, i + first_component));
+               }
+            } else {
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
+                               patch_handle);
+               inst->size_written = instr->num_components * REG_SIZE;
+            }
+            inst->offset = imm_offset;
+            inst->mlen = 1;
+         }
+      } else {
+         /* Indirect indexing - use per-slot offsets as well. */
+         const fs_reg srcs[] = {
+            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
+            indirect_offset
+         };
+         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+         if (first_component != 0) {
+            unsigned read_components =
+               instr->num_components + first_component;
+            fs_reg tmp = bld.vgrf(dst.type, read_components);
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
+                            payload);
+            inst->size_written = read_components * REG_SIZE;
+            for (unsigned i = 0; i < instr->num_components; i++) {
+               bld.MOV(offset(dst, bld, i),
+                       offset(tmp, bld, i + first_component));
+            }
+         } else {
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
+                            payload);
+            inst->size_written = instr->num_components * REG_SIZE;
+         }
+         inst->offset = imm_offset;
+         inst->mlen = 2;
+      }
+      break;
+   }
+
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_per_vertex_output: {
+      fs_reg value = get_nir_src(instr->src[0]);
+      bool is_64bit = (instr->src[0].is_ssa ?
+         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
+      fs_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+      unsigned swiz = BRW_SWIZZLE_XYZW;
+      unsigned mask = instr->const_index[1];
+
+      /* Header registers shared by every message of this store: the patch
+       * handle and, for indirect stores, the per-slot offset.  The optional
+       * channel-mask header is counted per message inside the loop below.
+       */
+      unsigned header_regs = 0;
+      fs_reg srcs[7];
+      srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+
+      if (indirect_offset.file != BAD_FILE) {
+         srcs[header_regs++] = indirect_offset;
+      }
+
+      if (mask == 0)
+         break;
+
+      unsigned num_components = util_last_bit(mask);
+      enum opcode opcode;
+
+      /* We can only pack two 64-bit components in a single message, so send
+       * 2 messages if we have more components
+       */
+      unsigned num_iterations = 1;
+      unsigned iter_components = num_components;
+      unsigned first_component = nir_intrinsic_component(instr);
+      if (is_64bit) {
+         first_component = first_component / 2;
+         if (instr->num_components > 2) {
+            num_iterations = 2;
+            iter_components = 2;
+         }
+      }
+
+      /* 64-bit data needs to be shuffled before we can write it to the URB.
+       * We will use this temporary to shuffle the components in each
+       * iteration.
+       */
+      fs_reg tmp =
+         fs_reg(VGRF, alloc.allocate(2 * iter_components), value.type);
+
+      mask = mask << first_component;
+
+      for (unsigned iter = 0; iter < num_iterations; iter++) {
+         /* Header length of *this* message.  It has to start over from the
+          * shared header for every message: whether a channel-mask register
+          * is appended depends on the two channels handled in the current
+          * iteration, and a mask appended for the first message must not
+          * leak into the header of the second one.
+          */
+         unsigned msg_header_regs = header_regs;
+
+         if (!is_64bit && mask != WRITEMASK_XYZW) {
+            srcs[msg_header_regs++] = brw_imm_ud(mask << 16);
+            opcode = indirect_offset.file != BAD_FILE ?
+               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
+               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
+         } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) {
+            /* Expand the 64-bit mask to 32-bit channels.  We only handle
+             * two channels in each iteration, so we only care about X/Y.
+             */
+            unsigned mask32 = 0;
+            if (mask & WRITEMASK_X)
+               mask32 |= WRITEMASK_XY;
+            if (mask & WRITEMASK_Y)
+               mask32 |= WRITEMASK_ZW;
+
+            /* If the mask does not include any of the channels X or Y there
+             * is nothing to do in this iteration.  Move on to the next couple
+             * of 64-bit channels.
+             */
+            if (!mask32) {
+               mask >>= 2;
+               imm_offset++;
+               continue;
+            }
+
+            srcs[msg_header_regs++] = brw_imm_ud(mask32 << 16);
+            opcode = indirect_offset.file != BAD_FILE ?
+               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
+               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
+         } else {
+            opcode = indirect_offset.file != BAD_FILE ?
+               SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
+               SHADER_OPCODE_URB_WRITE_SIMD8;
+         }
+
+         for (unsigned i = 0; i < iter_components; i++) {
+            if (!(mask & (1 << (i + first_component))))
+               continue;
+
+            if (!is_64bit) {
+               srcs[msg_header_regs + i + first_component] =
+                  offset(value, bld, BRW_GET_SWZ(swiz, i));
+            } else {
+               /* We need to shuffle the 64-bit data to match the layout
+                * expected by our 32-bit URB write messages.  We use a
+                * temporary for that.
+                */
+               unsigned channel = BRW_GET_SWZ(swiz, iter * 2 + i);
+               shuffle_64bit_data_for_32bit_write(bld,
+                  retype(offset(tmp, bld, 2 * i), BRW_REGISTER_TYPE_F),
+                  retype(offset(value, bld, 2 * channel), BRW_REGISTER_TYPE_DF),
+                  1);
+
+               /* Now copy the data to the destination */
+               fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type);
+               unsigned idx = 2 * i;
+               bld.MOV(dest, offset(tmp, bld, idx));
+               bld.MOV(offset(dest, bld, 1), offset(tmp, bld, idx + 1));
+               srcs[msg_header_regs + idx + first_component * 2] = dest;
+               srcs[msg_header_regs + idx + 1 + first_component * 2] =
+                  offset(dest, bld, 1);
+            }
+         }
+
+         unsigned mlen =
+            msg_header_regs +
+            (is_64bit ? 2 * iter_components : iter_components) +
+            (is_64bit ? 2 * first_component : first_component);
+         fs_reg payload =
+            bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
+         bld.LOAD_PAYLOAD(payload, srcs, mlen, msg_header_regs);
+
+         fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
+         inst->offset = imm_offset;
+         inst->mlen = mlen;
+
+         /* If this is a 64-bit attribute, select the next two 64-bit channels
+          * to be handled in the next iteration.
+          */
+         if (is_64bit) {
+            mask >>= 2;
+            imm_offset++;
+         }
+      }
+      break;
+   }
+
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+/**
+ * Emit code for a NIR intrinsic in a tessellation evaluation shader.
+ *
+ * Handles primitive ID, gl_TessCoord and input loads here; every other
+ * intrinsic is forwarded to nir_emit_intrinsic().
+ *
+ * \param bld    builder used to emit the instructions
+ * \param instr  intrinsic instruction to translate
+ */
+void
+fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
+                                   nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_TESS_EVAL);
+   struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
+
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_primitive_id:
+      bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
+      break;
+
+   case nir_intrinsic_load_tess_coord:
+      /* gl_TessCoord is part of the payload in g1-3 */
+      for (unsigned axis = 0; axis < 3; axis++)
+         bld.MOV(offset(dest, bld, axis), fs_reg(brw_vec8_grf(1 + axis, 0)));
+      break;
+
+   case nir_intrinsic_load_input:
+   case nir_intrinsic_load_per_vertex_input: {
+      fs_reg slot_offset = get_indirect_offset(instr);
+      unsigned slot = instr->const_index[0];
+      unsigned first_component = nir_intrinsic_component(instr);
+
+      if (type_sz(dest.type) == 8)
+         first_component = first_component / 2;
+
+      /* Arbitrarily only push up to 32 vec4 slots worth of data,
+       * which is 16 registers (since each holds 2 vec4 slots).
+       */
+      const unsigned max_push_slots = 32;
+
+      fs_inst *read;
+      if (slot_offset.file != BAD_FILE) {
+         /* Indirect indexing - use per-slot offsets as well. */
+
+         /* Only two double components can be read with each URB read, so
+          * 64-bit loads of more than two components take two messages,
+          * each one loading up to two double components.
+          */
+         unsigned num_iterations = 1;
+         unsigned num_components = instr->num_components;
+         fs_reg final_dest = dest;
+         if (type_sz(dest.type) == 8) {
+            if (instr->num_components > 2) {
+               num_iterations = 2;
+               num_components = 2;
+            }
+            dest = fs_reg(VGRF, alloc.allocate(4), dest.type);
+         }
+
+         for (unsigned iter = 0; iter < num_iterations; iter++) {
+            const fs_reg payload_srcs[] = {
+               retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
+               slot_offset
+            };
+            fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+            bld.LOAD_PAYLOAD(payload, payload_srcs,
+                             ARRAY_SIZE(payload_srcs), 0);
+
+            if (first_component != 0) {
+               /* Read from component 0 and copy out the wanted ones. */
+               const unsigned read_components =
+                  num_components + first_component;
+               fs_reg scratch = bld.vgrf(dest.type, read_components);
+               read = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, scratch,
+                               payload);
+               for (unsigned c = 0; c < num_components; c++) {
+                  bld.MOV(offset(dest, bld, c),
+                          offset(scratch, bld, c + first_component));
+               }
+            } else {
+               read = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,
+                               payload);
+            }
+            read->mlen = 2;
+            read->offset = slot;
+            read->size_written = (num_components + first_component) *
+                                 read->dst.component_size(read->exec_size);
+
+            /* 64-bit data read with 32-bit read messages has to be turned
+             * into proper 64-bit elements by shuffling the low and high
+             * 32-bit components around, like we do for other things like
+             * UBOs or SSBOs.
+             */
+            if (type_sz(dest.type) == 8) {
+               shuffle_32bit_load_result_to_64bit_data(
+                  bld, dest, retype(dest, BRW_REGISTER_TYPE_F),
+                  num_components);
+
+               for (unsigned c = 0; c < num_components; c++) {
+                  bld.MOV(offset(final_dest, bld, iter * 2 + c),
+                          offset(dest, bld, c));
+               }
+            }
+
+            /* A second read message loads the remaining double components
+             * from the next slot.
+             */
+            if (num_iterations > 1) {
+               num_components = instr->num_components - 2;
+               slot++;
+            }
+         }
+      } else if (slot < max_push_slots) {
+         /* The input is within the pushed data: copy it from ATTR regs. */
+         fs_reg src = fs_reg(ATTR, slot / 2, dest.type);
+         for (unsigned i = 0; i < instr->num_components; i++) {
+            unsigned comp = 16 / type_sz(dest.type) * (slot % 2) +
+               i + first_component;
+            bld.MOV(offset(dest, bld, i), component(src, comp));
+         }
+         tes_prog_data->base.urb_read_length =
+            MAX2(tes_prog_data->base.urb_read_length,
+                 DIV_ROUND_UP(slot + 1, 2));
+      } else {
+         /* Replicate the patch handle to all enabled channels */
+         const fs_reg handle_srcs[] = {
+            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
+         };
+         fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         bld.LOAD_PAYLOAD(patch_handle, handle_srcs,
+                          ARRAY_SIZE(handle_srcs), 0);
+
+         if (first_component != 0) {
+            const unsigned read_components =
+               instr->num_components + first_component;
+            fs_reg scratch = bld.vgrf(dest.type, read_components);
+            read = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, scratch,
+                            patch_handle);
+            read->size_written = read_components * REG_SIZE;
+            for (unsigned c = 0; c < instr->num_components; c++) {
+               bld.MOV(offset(dest, bld, c),
+                       offset(scratch, bld, c + first_component));
+            }
+         } else {
+            read = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
+                            patch_handle);
+            read->size_written = instr->num_components * REG_SIZE;
+         }
+         read->mlen = 1;
+         read->offset = slot;
+      }
+      break;
+   }
+
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+/**
+ * Emit code for one NIR intrinsic of a geometry shader.
+ *
+ * Intrinsics that have a case in the switch below are translated here; any
+ * other intrinsic is forwarded to nir_emit_intrinsic().
+ *
+ * \param bld    builder the generated instructions are emitted through
+ * \param instr  NIR intrinsic instruction to translate
+ */
+void
+fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
+                                  nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_primitive_id:
+      /* The primitive ID is copied out of g2 as an unsigned dword; the
+       * program data must have include_primitive_id set.  The stage itself
+       * was already checked at the top of the function.
+       */
+      assert(brw_gs_prog_data(prog_data)->include_primitive_id);
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
+              retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
+      break;
+
+   case nir_intrinsic_load_input:
+      unreachable("load_input intrinsics are invalid for the GS stage");
+
+   case nir_intrinsic_load_per_vertex_input:
+      emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
+                         instr->src[1], instr->num_components,
+                         nir_intrinsic_component(instr));
+      break;
+
+   case nir_intrinsic_emit_vertex_with_counter:
+      emit_gs_vertex(instr->src[0], instr->const_index[0]);
+      break;
+
+   case nir_intrinsic_end_primitive_with_counter:
+      emit_gs_end_primitive(instr->src[0]);
+      break;
+
+   case nir_intrinsic_set_vertex_count:
+      bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
+      break;
+
+   case nir_intrinsic_load_invocation_id: {
+      /* The invocation ID system value must already have been set up; the
+       * destination takes over its register type before the copy.
+       */
+      fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
+      assert(val.file != BAD_FILE);
+      dest.type = val.type;
+      bld.MOV(dest, val);
+      break;
+   }
+
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+/**
+ * Return a register holding the index of the render target layer that is
+ * currently being rendered to.
+ */
+static fs_reg
+fetch_render_target_array_index(const fs_builder &bld)
+{
+   /* Layered rendering is not implemented before SNB, so only the first
+    * layer of the framebuffer is ever rendered into.
+    */
+   if (bld.shader->devinfo->gen < 6)
+      return brw_imm_ud(0);
+
+   /* On SNB and later the thread payload provides the index in bits 26:16
+    * of r0.0: take the second word of r0 and keep its low 11 bits.
+    */
+   const fs_reg layer = bld.vgrf(BRW_REGISTER_TYPE_UD);
+   const fs_reg r0_word1 = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1);
+   bld.AND(layer, r0_word1, brw_imm_uw(0x7ff));
+
+   return layer;
+}
+
+/**
+ * Emulate a framebuffer read without coherency guarantees by issuing a texel
+ * fetch (TXF) from the render target at the current fragment coordinates and
+ * sample index.
+ *
+ * \param bld     builder the instructions are emitted through
+ * \param dst     destination of the fetch
+ * \param target  render target index, relative to the start of the
+ *                render-target-read block of the binding table
+ * \return the emitted texel fetch instruction
+ */
+fs_inst *
+fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
+                                      unsigned target)
+{
+   const struct gen_device_info *devinfo = bld.shader->devinfo;
+
+   assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
+   const brw_wm_prog_key *wm_key =
+      reinterpret_cast<const brw_wm_prog_key *>(key);
+   assert(!wm_key->coherent_fb_fetch);
+   const struct brw_wm_prog_data *wm_prog_data =
+      brw_wm_prog_data(stage_prog_data);
+   const bool msaa = wm_key->multisample_fbo;
+
+   /* Binding table entry used for reading this render target. */
+   const unsigned rt_read_index =
+      wm_prog_data->binding_table.render_target_read_start + target;
+
+   /* The texturing messages expect a surface index relative to the start of
+    * the texture block of the binding table.
+    */
+   const unsigned surface =
+      rt_read_index - wm_prog_data->base.binding_table.texture_start;
+
+   brw_mark_surface_used(bld.shader->stage_prog_data, rt_read_index);
+
+   /* Fragment coordinates: x, y and the render target layer. */
+   const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
+   bld.MOV(offset(coords, bld, 0), pixel_x);
+   bld.MOV(offset(coords, bld, 1), pixel_y);
+   const fs_reg layer = fetch_render_target_array_index(bld);
+   bld.MOV(offset(coords, bld, 2), layer);
+
+   /* When multisampling we also need the sample index and the MCS payload.
+    * The MCS fetch message behaves deterministically for UMS surfaces, so
+    * there should be no need to recompile depending on whether the
+    * framebuffer is CMS or UMS.
+    */
+   if (msaa) {
+      if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
+         nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
+   }
+
+   const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
+
+   fs_reg mcs;
+   if (msaa)
+      mcs = emit_mcs_fetch(coords, 3, brw_imm_ud(surface));
+
+   /* Single-sampled framebuffers use the plain texel fetch, multisampled
+    * ones a CMS fetch.  SKL+ uses the wide CMS message in case the
+    * framebuffer is 16x multisampled; it should be equivalent to the normal
+    * CMS fetch for lower sample counts.
+    */
+   opcode op;
+   if (!msaa)
+      op = SHADER_OPCODE_TXF_LOGICAL;
+   else if (devinfo->gen >= 9)
+      op = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
+   else
+      op = SHADER_OPCODE_TXF_CMS_LOGICAL;
+
+   /* Sources of the logical texturing instruction, in positional order. */
+   const fs_reg srcs[] = {
+      coords,
+      fs_reg(),
+      brw_imm_ud(0),
+      fs_reg(),
+      sample,
+      mcs,
+      brw_imm_ud(surface),
+      brw_imm_ud(0),
+      fs_reg(),
+      brw_imm_ud(3),
+      brw_imm_ud(0)
+   };
+   STATIC_ASSERT(ARRAY_SIZE(srcs) == TEX_LOGICAL_NUM_SRCS);
+
+   fs_inst *const fetch = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
+
+   /* The fetch writes four components of the destination. */
+   fetch->size_written = 4 * fetch->dst.component_size(fetch->exec_size);
+
+   return fetch;
+}
+
+/**
+ * Coherent framebuffer read that uses the native render target read message,
+ * which requires SKL or later.
+ *
+ * Returns the emitted FS_OPCODE_FB_READ_LOGICAL instruction.
+ */
+static fs_inst *
+emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
+{
+   assert(bld.shader->devinfo->gen >= 9);
+
+   fs_inst *const read = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
+   const unsigned component_sz = read->dst.component_size(read->exec_size);
+
+   /* Four components of the destination are written by the read. */
+   read->size_written = 4 * component_sz;
+   read->target = target;
+
+   return read;
+}
+
+/**
+ * Return the temporary shared by the first \p n entries of \p regs,
+ * allocating it on first use.
+ *
+ * When \p n is non-zero and regs[0] is not BAD_FILE, regs[0] is returned
+ * unchanged.  Otherwise a new BRW_REGISTER_TYPE_F VGRF is requested from
+ * \p bld with \p size, stored into regs[0] .. regs[n - 1] and returned.
+ */
+static fs_reg
+alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
+{
+   const bool already_allocated = n > 0 && regs[0].file != BAD_FILE;
+   if (already_allocated)
+      return regs[0];
+
+   const fs_reg fresh = bld.vgrf(BRW_REGISTER_TYPE_F, size);
+
+   for (fs_reg *slot = regs; slot != regs + n; ++slot)
+      *slot = fresh;
+
+   return fresh;
+}
+
+/**
+ * Return the temporary that backs the fragment shader output encoded in
+ * \p location, allocating it through alloc_temporary() when necessary.
+ *
+ * \p location packs an output location (BRW_NIR_FRAG_OUTPUT_LOCATION field)
+ * together with an output index (BRW_NIR_FRAG_OUTPUT_INDEX field).
+ */
+static fs_reg
+alloc_frag_output(fs_visitor *v, unsigned location)
+{
+   assert(v->stage == MESA_SHADER_FRAGMENT);
+
+   const brw_wm_prog_key *const key =
+      reinterpret_cast<const brw_wm_prog_key *>(v->key);
+   const unsigned loc = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
+   const unsigned idx = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
+
+   /* A non-zero index, or DATA1 while dual color blending is forced, goes
+    * to the dual-source output regardless of the location.
+    */
+   const bool dual_source = idx > 0 ||
+      (key->force_dual_color_blend && loc == FRAG_RESULT_DATA1);
+   if (dual_source)
+      return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
+
+   switch (loc) {
+   case FRAG_RESULT_COLOR:
+      /* One temporary shared by every color region (at least one entry). */
+      return alloc_temporary(v->bld, 4, v->outputs,
+                             MAX2(key->nr_color_regions, 1));
+
+   case FRAG_RESULT_DEPTH:
+      return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
+
+   case FRAG_RESULT_STENCIL:
+      return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
+
+   case FRAG_RESULT_SAMPLE_MASK:
+      return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
+
+   default:
+      /* Per-draw-buffer color outputs DATA0 .. DATA0 + MAX_DRAW_BUFFERS - 1. */
+      if (loc >= FRAG_RESULT_DATA0 &&
+          loc < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
+         return alloc_temporary(v->bld, 4,
+                                &v->outputs[loc - FRAG_RESULT_DATA0], 1);
+
+      unreachable("Invalid location");
+   }
+}
+
+void
+fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
+ nir_intrinsic_instr *instr)
+{ /* Emit code for one NIR intrinsic of a fragment shader.
+   *
+   * bld is the builder the instructions are emitted through and instr is
+   * the intrinsic to translate.  When the intrinsic has a destination it is
+   * looked up with get_nir_dest().  Intrinsics without a case in the switch
+   * below are forwarded to nir_emit_intrinsic().
+   */
+ assert(stage == MESA_SHADER_FRAGMENT);
+
+ fs_reg dest;
+ if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+ dest = get_nir_dest(instr->dest);
+
+ switch (instr->intrinsic) {
+ case nir_intrinsic_load_front_face: /* copy the front-facing value as type D */
+ bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
+ *emit_frontfacing_interpolation());
+ break;
+
+ case nir_intrinsic_load_sample_pos: { /* copies two components of the
+   * SAMPLE_POS system value, which must already have been set up */
+ fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
+ assert(sample_pos.file != BAD_FILE);
+ dest.type = sample_pos.type;
+ bld.MOV(dest, sample_pos);
+ bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
+ break;
+ }
+
+ case nir_intrinsic_load_layer_id:
+ dest.type = BRW_REGISTER_TYPE_UD;
+ bld.MOV(dest, fetch_render_target_array_index(bld));
+ break;
+
+ case nir_intrinsic_load_helper_invocation:
+ case nir_intrinsic_load_sample_mask_in:
+ case nir_intrinsic_load_sample_id: { /* plain copy of the matching system
+   * value; dest takes over the system value's register type */
+ gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
+ fs_reg val = nir_system_values[sv];
+ assert(val.file != BAD_FILE);
+ dest.type = val.type;
+ bld.MOV(dest, val);
+ break;
+ }
+
+ case nir_intrinsic_store_output: { /* only constant offsets are accepted;
+   * the value is copied into the temporary from alloc_frag_output(),
+   * starting at the intrinsic's component */
+ const fs_reg src = get_nir_src(instr->src[0]);
+ const nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+ assert(const_offset && "Indirect output stores not allowed");
+ const unsigned location = nir_intrinsic_base(instr) +
+ SET_FIELD(const_offset->u32[0], BRW_NIR_FRAG_OUTPUT_LOCATION);
+ const fs_reg new_dest = retype(alloc_frag_output(this, location),
+ src.type);
+
+ for (unsigned j = 0; j < instr->num_components; j++)
+ bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
+ offset(src, bld, j));
+
+ break;
+ }
+
+ case nir_intrinsic_load_output: { /* framebuffer fetch: read four components
+   * of the render target into tmp, then copy out the requested ones */
+ const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
+ BRW_NIR_FRAG_OUTPUT_LOCATION);
+ assert(l >= FRAG_RESULT_DATA0);
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+ assert(const_offset && "Indirect output loads not allowed");
+ const unsigned target = l - FRAG_RESULT_DATA0 + const_offset->u32[0];
+ const fs_reg tmp = bld.vgrf(dest.type, 4);
+
+ if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
+ emit_coherent_fb_read(bld, tmp, target);
+ else
+ emit_non_coherent_fb_read(bld, tmp, target);
+
+ for (unsigned j = 0; j < instr->num_components; j++) {
+ bld.MOV(offset(dest, bld, j),
+ offset(tmp, bld, nir_intrinsic_component(instr) + j));
+ }
+
+ break;
+ }
+
+ case nir_intrinsic_discard:
+ case nir_intrinsic_discard_if: {
+ /* We track our discarded pixels in f0.1. By predicating on it, we can
+ * update just the flag bits that aren't yet discarded. If there's no
+ * condition, we emit a CMP of g0 != g0, so all currently executing
+ * channels will get turned off.
+ */
+ fs_inst *cmp;
+ if (instr->intrinsic == nir_intrinsic_discard_if) {
+ cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
+ brw_imm_d(0), BRW_CONDITIONAL_Z);
+ } else {
+ fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
+ BRW_REGISTER_TYPE_UW));
+ cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
+ }
+ cmp->predicate = BRW_PREDICATE_NORMAL;
+ cmp->flag_subreg = 1;
+
+ if (devinfo->gen >= 6) {
+ emit_discard_jump();
+ }
+ break;
+ }
+
+ case nir_intrinsic_load_input: {
+ /* load_input is only used for flat inputs */
+ unsigned base = nir_intrinsic_base(instr);
+ unsigned component = nir_intrinsic_component(instr);
+ unsigned num_components = instr->num_components;
+ enum brw_reg_type type = dest.type;
+
+ /* Special case fields in the VUE header */
+ if (base == VARYING_SLOT_LAYER)
+ component = 1;
+ else if (base == VARYING_SLOT_VIEWPORT)
+ component = 2;
+
+ if (nir_dest_bit_size(instr->dest) == 64) {
+ /* const_index is in 32-bit type size units that could not be aligned
+ * with DF. We need to read the double vector as if it was a float
+ * vector of twice the number of components to fetch the right data.
+ */
+ type = BRW_REGISTER_TYPE_F;
+ num_components *= 2;
+ }
+
+ for (unsigned int i = 0; i < num_components; i++) {
+ struct brw_reg interp = interp_reg(base, component + i);
+ interp = suboffset(interp, 3); /* NOTE(review): presumably selects the
+   * constant term of the attribute's setup data -- confirm */
+ bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
+ retype(fs_reg(interp), type));
+ }
+
+ if (nir_dest_bit_size(instr->dest) == 64) { /* rebuild the 64-bit values
+   * from the 32-bit halves that were just loaded */
+ shuffle_32bit_load_result_to_64bit_data(bld,
+ dest,
+ retype(dest, type),
+ instr->num_components);
+ }
+ break;
+ }
+
+ case nir_intrinsic_load_barycentric_pixel:
+ case nir_intrinsic_load_barycentric_centroid:
+ case nir_intrinsic_load_barycentric_sample:
+ /* Do nothing - load_interpolated_input handling will handle it later. */
+ break;
+
+ case nir_intrinsic_load_barycentric_at_sample: {
+ const glsl_interp_mode interpolation =
+ (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
+
+ nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
+
+ if (const_sample) { /* constant sample: message data is the sample
+   * number shifted left by 4, passed as an immediate */
+ unsigned msg_data = const_sample->i32[0] << 4;
+
+ emit_pixel_interpolater_send(bld,
+ FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+ dest,
+ fs_reg(), /* src */
+ brw_imm_ud(msg_data),
+ interpolation);
+ } else {
+ const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
+ BRW_REGISTER_TYPE_UD);
+
+ if (nir_src_is_dynamically_uniform(instr->src[0])) { /* one sample
+   * number for all channels: a single message is enough */
+ const fs_reg sample_id = bld.emit_uniformize(sample_src);
+ const fs_reg msg_data = vgrf(glsl_type::uint_type);
+ bld.exec_all().group(1, 0)
+ .SHL(msg_data, sample_id, brw_imm_ud(4u));
+ emit_pixel_interpolater_send(bld,
+ FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+ dest,
+ fs_reg(), /* src */
+ msg_data,
+ interpolation);
+ } else {
+ /* Make a loop that sends a message to the pixel interpolater
+ * for the sample number in each live channel. If there are
+ * multiple channels with the same sample number then these
+ * will be handled simultaneously with a single iteration of
+ * the loop.
+ */
+ bld.emit(BRW_OPCODE_DO);
+
+ /* Get the next live sample number into sample_id */
+ const fs_reg sample_id = bld.emit_uniformize(sample_src);
+
+ /* Set the flag register so that we can perform the send
+ * message on all channels that have the same sample number
+ */
+ bld.CMP(bld.null_reg_ud(),
+ sample_src, sample_id,
+ BRW_CONDITIONAL_EQ);
+ const fs_reg msg_data = vgrf(glsl_type::uint_type);
+ bld.exec_all().group(1, 0)
+ .SHL(msg_data, sample_id, brw_imm_ud(4u));
+ fs_inst *inst =
+ emit_pixel_interpolater_send(bld,
+ FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+ dest,
+ fs_reg(), /* src */
+ msg_data,
+ interpolation);
+ set_predicate(BRW_PREDICATE_NORMAL, inst);
+
+ /* Continue the loop if there are any live channels left */
+ set_predicate_inv(BRW_PREDICATE_NORMAL,
+ true, /* inverse */
+ bld.emit(BRW_OPCODE_WHILE));
+ }
+ }
+ break;
+ }
+
+ case nir_intrinsic_load_barycentric_at_offset: {
+ const glsl_interp_mode interpolation =
+ (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
+
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+
+ if (const_offset) { /* constant offset: scale each coordinate by 16,
+   * clamp it to at most 7 and keep the low four bits; x is packed into
+   * bits 3:0 and y into bits 7:4 of the immediate message data */
+ unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
+ unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
+
+ emit_pixel_interpolater_send(bld,
+ FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
+ dest,
+ fs_reg(), /* src */
+ brw_imm_ud(off_x | (off_y << 4)),
+ interpolation);
+ } else {
+ fs_reg src = vgrf(glsl_type::ivec2_type);
+ fs_reg offset_src = retype(get_nir_src(instr->src[0]),
+ BRW_REGISTER_TYPE_F);
+ for (int i = 0; i < 2; i++) {
+ fs_reg temp = vgrf(glsl_type::float_type);
+ bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
+ fs_reg itemp = vgrf(glsl_type::int_type);
+ /* float to int */
+ bld.MOV(itemp, temp);
+
+ /* Clamp the upper end of the range to +7/16.
+ * ARB_gpu_shader5 requires that we support a maximum offset
+ * of +0.5, which isn't representable in a S0.4 value -- if
+ * we didn't clamp it, we'd end up with -8/16, which is the
+ * opposite of what the shader author wanted.
+ *
+ * This is legal due to ARB_gpu_shader5's quantization
+ * rules:
+ *
+ * "Not all values of <offset> may be supported; x and y
+ * offsets may be rounded to fixed-point values with the
+ * number of fraction bits given by the
+ * implementation-dependent constant
+ * FRAGMENT_INTERPOLATION_OFFSET_BITS"
+ */
+ set_condmod(BRW_CONDITIONAL_L,
+ bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
+ }
+
+ const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
+ emit_pixel_interpolater_send(bld,
+ opcode,
+ dest,
+ src,
+ brw_imm_ud(0u),
+ interpolation);
+ }
+ break;
+ }
+
+ case nir_intrinsic_load_interpolated_input: {
+ if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) { /* the position
+   * input has its own emission path */
+ emit_fragcoord_interpolation(dest);
+ break;
+ }
+
+ assert(instr->src[0].ssa &&
+ instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
+ nir_intrinsic_instr *bary_intrinsic =
+ nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
+ nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
+ enum glsl_interp_mode interp_mode =
+ (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
+ fs_reg dst_xy;
+
+ if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
+ bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
+ /* Use the result of the PI message */
+ dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
+ } else {
+ /* Use the delta_xy values computed from the payload */
+ enum brw_barycentric_mode bary =
+ brw_barycentric_mode(interp_mode, bary_intrin);
+
+ dst_xy = this->delta_xy[bary];
+ }
+
+ for (unsigned int i = 0; i < instr->num_components; i++) {
+ fs_reg interp =
+ fs_reg(interp_reg(nir_intrinsic_base(instr),
+ nir_intrinsic_component(instr) + i));
+ interp.type = BRW_REGISTER_TYPE_F;
+ dest.type = BRW_REGISTER_TYPE_F;
+
+ if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) { /* before
+   * gen6 the smooth result is additionally multiplied by pixel_w */
+ fs_reg tmp = vgrf(glsl_type::float_type);
+ bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
+ bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
+ } else {
+ bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
+ }
+ }
+ break;
+ }
+
+ default:
+ nir_emit_intrinsic(bld, instr);
+ break;
+ }
+}
+
+void
+fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
+ nir_intrinsic_instr *instr)
+{ /* Emit code for one NIR intrinsic of a compute shader.
+   *
+   * bld is the builder the instructions are emitted through and instr is
+   * the intrinsic to translate.  Barriers, the local invocation and work
+   * group ID loads, the number of work groups, and shared-variable atomics,
+   * loads and stores are handled here; every other intrinsic is forwarded
+   * to nir_emit_intrinsic().
+   */
+ assert(stage == MESA_SHADER_COMPUTE);
+ struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
+
+ fs_reg dest;
+ if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+ dest = get_nir_dest(instr->dest);
+
+ switch (instr->intrinsic) {
+ case nir_intrinsic_barrier: /* also recorded in the program data */
+ emit_barrier();
+ cs_prog_data->uses_barrier = true;
+ break;
+
+ case nir_intrinsic_load_local_invocation_id:
+ case nir_intrinsic_load_work_group_id: { /* copy three components of the
+   * matching system value, which must already have been set up */
+ gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
+ fs_reg val = nir_system_values[sv];
+ assert(val.file != BAD_FILE);
+ dest.type = val.type;
+ for (unsigned i = 0; i < 3; i++)
+ bld.MOV(offset(dest, bld, i), offset(val, bld, i));
+ break;
+ }
+
+ case nir_intrinsic_load_num_work_groups: { /* read through the surface at
+   * binding_table.work_groups_start and flag that use in the program data */
+ const unsigned surface =
+ cs_prog_data->binding_table.work_groups_start;
+
+ cs_prog_data->uses_num_work_groups = true;
+
+ fs_reg surf_index = brw_imm_ud(surface);
+ brw_mark_surface_used(prog_data, surface);
+
+ /* Read the 3 GLuint components of gl_NumWorkGroups */
+ for (unsigned i = 0; i < 3; i++) {
+ fs_reg read_result =
+ emit_untyped_read(bld, surf_index,
+ brw_imm_ud(i << 2), /* offset i * 4 */
+ 1 /* dims */, 1 /* size */,
+ BRW_PREDICATE_NONE);
+ read_result.type = dest.type;
+ bld.MOV(dest, read_result);
+ dest = offset(dest, bld, 1); /* advance to the next component */
+ }
+ break;
+ }
+
+ case nir_intrinsic_shared_atomic_add: /* each shared atomic maps onto one
+   * BRW_AOP_* operation passed to nir_emit_shared_atomic() */
+ nir_emit_shared_atomic(bld, BRW_AOP_ADD, instr);
+ break;
+ case nir_intrinsic_shared_atomic_imin:
+ nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr);
+ break;
+ case nir_intrinsic_shared_atomic_umin:
+ nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr);
+ break;
+ case nir_intrinsic_shared_atomic_imax:
+ nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr);
+ break;
+ case nir_intrinsic_shared_atomic_umax:
+ nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr);
+ break;
+ case nir_intrinsic_shared_atomic_and:
+ nir_emit_shared_atomic(bld, BRW_AOP_AND, instr);
+ break;
+ case nir_intrinsic_shared_atomic_or:
+ nir_emit_shared_atomic(bld, BRW_AOP_OR, instr);
+ break;
+ case nir_intrinsic_shared_atomic_xor:
+ nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr);
+ break;
+ case nir_intrinsic_shared_atomic_exchange:
+ nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr);
+ break;
+ case nir_intrinsic_shared_atomic_comp_swap:
+ nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
+ break;
+
+ case nir_intrinsic_load_shared: { /* untyped vector read through the
+   * GEN7_BTI_SLM binding table index */
+ assert(devinfo->gen >= 7);
+
+ fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
+
+ /* Get the offset to read from */
+ fs_reg offset_reg;
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+ if (const_offset) { /* fold base and constant offset into an immediate */
+ offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
+ } else {
+ offset_reg = vgrf(glsl_type::uint_type);
+ bld.ADD(offset_reg,
+ retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(instr->const_index[0]));
+ }
+
+ /* Read the vector */
+ do_untyped_vector_read(bld, dest, surf_index, offset_reg,
+ instr->num_components);
+ break;
+ }
+
+ case nir_intrinsic_store_shared: {
+ assert(devinfo->gen >= 7);
+
+ /* Block index */
+ fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
+
+ /* Value */
+ fs_reg val_reg = get_nir_src(instr->src[0]);
+
+ /* Writemask */
+ unsigned writemask = instr->const_index[1];
+
+ /* get_nir_src() retypes to integer. Be wary of 64-bit types though
+ * since the untyped writes below operate in units of 32-bits, which
+ * means that we need to write twice as many components each time.
+ * Also, we have to shuffle 64-bit data to be in the appropriate layout
+ * expected by our 32-bit write messages.
+ */
+ unsigned type_size = 4;
+ unsigned bit_size = instr->src[0].is_ssa ?
+ instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
+ if (bit_size == 64) {
+ type_size = 8;
+ fs_reg tmp =
+ fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
+ shuffle_64bit_data_for_32bit_write(
+ bld,
+ retype(tmp, BRW_REGISTER_TYPE_F),
+ retype(val_reg, BRW_REGISTER_TYPE_DF),
+ instr->num_components);
+ val_reg = tmp;
+ }
+
+ unsigned type_slots = type_size / 4; /* 32-bit slots per component */
+
+ /* Combine groups of consecutive enabled channels in one write
+ * message. We use ffs to find the first enabled channel and then ffs on
+ * the bit-inverse, down-shifted writemask to determine the length of
+ * the block of enabled bits.
+ */
+ while (writemask) {
+ unsigned first_component = ffs(writemask) - 1;
+ unsigned length = ffs(~(writemask >> first_component)) - 1;
+
+ /* We can't write more than 2 64-bit components at once. Limit the
+ * length of the write to what we can do and let the next iteration
+ * handle the rest
+ */
+ if (type_size > 4)
+ length = MIN2(2, length);
+
+ fs_reg offset_reg; /* base + source offset + byte offset of the
+   * first component written by this message */
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+ if (const_offset) {
+ offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] +
+ type_size * first_component);
+ } else {
+ offset_reg = vgrf(glsl_type::uint_type);
+ bld.ADD(offset_reg,
+ retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(instr->const_index[0] + type_size * first_component));
+ }
+
+ emit_untyped_write(bld, surf_index, offset_reg,
+ offset(val_reg, bld, first_component * type_slots),
+ 1 /* dims */, length * type_slots,
+ BRW_PREDICATE_NONE);
+
+ /* Clear the bits in the writemask that we just wrote, then try
+ * again to see if more channels are left.
+ */
+ writemask &= (15 << (first_component + length)); /* keeps only bits at
+   * or above the end of the block just written; NOTE(review): the
+   * constant 15 assumes a writemask of at most four bits -- confirm */
+ }
+
+ break;
+ }
+
+ default:
+ nir_emit_intrinsic(bld, instr);
+ break;
+ }
+}
+
+void
+fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
+{
+ fs_reg dest;
+ if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+ dest = get_nir_dest(instr->dest);
+
+ switch (instr->intrinsic) {
+ case nir_intrinsic_atomic_counter_inc:
+ case nir_intrinsic_atomic_counter_dec:
+ case nir_intrinsic_atomic_counter_read:
+ case nir_intrinsic_atomic_counter_add:
+ case nir_intrinsic_atomic_counter_min:
+ case nir_intrinsic_atomic_counter_max:
+ case nir_intrinsic_atomic_counter_and:
+ case nir_intrinsic_atomic_counter_or:
+ case nir_intrinsic_atomic_counter_xor:
+ case nir_intrinsic_atomic_counter_exchange:
+ case nir_intrinsic_atomic_counter_comp_swap: {
+ if (stage == MESA_SHADER_FRAGMENT &&
+ instr->intrinsic != nir_intrinsic_atomic_counter_read)
+ brw_wm_prog_data(prog_data)->has_side_effects = true;
+
+ /* Get some metadata from the image intrinsic. */
+ const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+
+ /* Get the arguments of the atomic intrinsic. */
+ const fs_reg offset = get_nir_src(instr->src[0]);
+ const unsigned surface = (stage_prog_data->binding_table.abo_start +
+ instr->const_index[0]);
+ const fs_reg src0 = (info->num_srcs >= 2
+ ? get_nir_src(instr->src[1]) : fs_reg());
+ const fs_reg src1 = (info->num_srcs >= 3
+ ? get_nir_src(instr->src[2]) : fs_reg());
+ fs_reg tmp;
+
+ assert(info->num_srcs <= 3);
+
+ /* Emit a surface read or atomic op. */
+ if (instr->intrinsic == nir_intrinsic_atomic_counter_read) {
+ tmp = emit_untyped_read(bld, brw_imm_ud(surface), offset, 1, 1);
+ } else {
+ tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, src0,
+ src1, 1, 1,
+ get_atomic_counter_op(instr->intrinsic));
+ }
+
+ /* Assign the result. */
+ bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp);
+
+ /* Mark the surface as used. */
+ brw_mark_surface_used(stage_prog_data, surface);
+ break;
+ }
+
+ case nir_intrinsic_image_load:
+ case nir_intrinsic_image_store:
+ case nir_intrinsic_image_atomic_add:
+ case nir_intrinsic_image_atomic_min:
+ case nir_intrinsic_image_atomic_max:
+ case nir_intrinsic_image_atomic_and:
+ case nir_intrinsic_image_atomic_or:
+ case nir_intrinsic_image_atomic_xor:
+ case nir_intrinsic_image_atomic_exchange:
+ case nir_intrinsic_image_atomic_comp_swap: {
+ using namespace image_access;
+
+ if (stage == MESA_SHADER_FRAGMENT &&
+ instr->intrinsic != nir_intrinsic_image_load)
+ brw_wm_prog_data(prog_data)->has_side_effects = true;
+
+ /* Get the referenced image variable and type. */
+ const nir_variable *var = instr->variables[0]->var;
+ const glsl_type *type = var->type->without_array();
+ const brw_reg_type base_type = get_image_base_type(type);
+
+ /* Get some metadata from the image intrinsic. */
+ const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+ const unsigned arr_dims = type->sampler_array ? 1 : 0;
+ const unsigned surf_dims = type->coordinate_components() - arr_dims;
+ const unsigned format = var->data.image.format;
+
+ /* Get the arguments of the image intrinsic. */
+ const fs_reg image = get_nir_image_deref(instr->variables[0]);
+ const fs_reg addr = retype(get_nir_src(instr->src[0]),
+ BRW_REGISTER_TYPE_UD);
+ const fs_reg src0 = (info->num_srcs >= 3 ?
+ retype(get_nir_src(instr->src[2]), base_type) :
+ fs_reg());
+ const fs_reg src1 = (info->num_srcs >= 4 ?
+ retype(get_nir_src(instr->src[3]), base_type) :
+ fs_reg());
+ fs_reg tmp;
+
+ /* Emit an image load, store or atomic op. */
+ if (instr->intrinsic == nir_intrinsic_image_load)
+ tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
+
+ else if (instr->intrinsic == nir_intrinsic_image_store)
+ emit_image_store(bld, image, addr, src0, surf_dims, arr_dims,
+ var->data.image.write_only ? GL_NONE : format);
+
+ else
+ tmp = emit_image_atomic(bld, image, addr, src0, src1,
+ surf_dims, arr_dims, info->dest_components,
+ get_image_atomic_op(instr->intrinsic, type));
+
+ /* Assign the result. */
+ for (unsigned c = 0; c < info->dest_components; ++c)
+ bld.MOV(offset(retype(dest, base_type), bld, c),
+ offset(tmp, bld, c));
+ break;
+ }
+
+ case nir_intrinsic_memory_barrier_atomic_counter:
+ case nir_intrinsic_memory_barrier_buffer:
+ case nir_intrinsic_memory_barrier_image:
+ case nir_intrinsic_memory_barrier: {
+ const fs_builder ubld = bld.group(8, 0);
+ const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+ ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
+ ->size_written = 2 * REG_SIZE;
+ break;
+ }
+
+ case nir_intrinsic_group_memory_barrier:
+ case nir_intrinsic_memory_barrier_shared:
+ /* We treat these workgroup-level barriers as no-ops. This should be
+ * safe at present and as long as:
+ *
+ * - Memory access instructions are not subsequently reordered by the
+ * compiler back-end.
+ *
+ * - All threads from a given compute shader workgroup fit within a
+ * single subslice and therefore talk to the same HDC shared unit
+ * what supposedly guarantees ordering and coherency between threads
+ * from the same workgroup. This may change in the future when we
+ * start splitting workgroups across multiple subslices.
+ *
+ * - The context is not in fault-and-stream mode, which could cause
+ * memory transactions (including to SLM) prior to the barrier to be
+ * replayed after the barrier if a pagefault occurs. This shouldn't
+ * be a problem up to and including SKL because fault-and-stream is
+ * not usable due to hardware issues, but that's likely to change in
+ * the future.
+ */
+ break;
+
+ case nir_intrinsic_shader_clock: {
+ /* We cannot do anything if there is an event, so ignore it for now */
+ const fs_reg shader_clock = get_timestamp(bld);
+ const fs_reg srcs[] = { component(shader_clock, 0),
+ component(shader_clock, 1) };
+ bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
+ break;
+ }
+
+ case nir_intrinsic_image_size: {
+ /* Get the referenced image variable and type. */
+ const nir_variable *var = instr->variables[0]->var;
+ const glsl_type *type = var->type->without_array();
+
+ /* Get the size of the image. */
+ const fs_reg image = get_nir_image_deref(instr->variables[0]);
+ const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
+
+ /* For 1DArray image types, the array index is stored in the Z component.
+ * Fix this by swizzling the Z component to the Y component.
+ */
+ const bool is_1d_array_image =
+ type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
+ type->sampler_array;
+
+ /* For CubeArray images, we should count the number of cubes instead
+ * of the number of faces. Fix it by dividing the (Z component) by 6.
+ */
+ const bool is_cube_array_image =
+ type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
+ type->sampler_array;
+
+ /* Copy all the components. */
+ const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+ for (unsigned c = 0; c < info->dest_components; ++c) {
+ if ((int)c >= type->coordinate_components()) {
+ bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+ brw_imm_d(1));
+ } else if (c == 1 && is_1d_array_image) {
+ bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+ offset(size, bld, 2));
+ } else if (c == 2 && is_cube_array_image) {
+ bld.emit(SHADER_OPCODE_INT_QUOTIENT,
+ offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+ offset(size, bld, c), brw_imm_d(6));
+ } else {
+ bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+ offset(size, bld, c));
+ }
+ }
+
+ break;
+ }
+
+ case nir_intrinsic_image_samples:
+ /* The driver does not support multi-sampled images. */
+ bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
+ break;
+
+ case nir_intrinsic_load_uniform: {
+ /* Offsets are in bytes but they should always be multiples of 4 */
+ assert(instr->const_index[0] % 4 == 0);
+
+ fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
+
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+ if (const_offset) {
+ /* Offsets are in bytes but they should always be multiples of 4 */
+ assert(const_offset->u32[0] % 4 == 0);
+ src.offset = const_offset->u32[0];
+
+ for (unsigned j = 0; j < instr->num_components; j++) {
+ bld.MOV(offset(dest, bld, j), offset(src, bld, j));
+ }
+ } else {
+ fs_reg indirect = retype(get_nir_src(instr->src[0]),
+ BRW_REGISTER_TYPE_UD);
+
+ /* We need to pass a size to the MOV_INDIRECT but we don't want it to
+ * go past the end of the uniform. In order to keep the n'th
+ * component from running past, we subtract off the size of all but
+ * one component of the vector.
+ */
+ assert(instr->const_index[1] >=
+ instr->num_components * (int) type_sz(dest.type));
+ unsigned read_size = instr->const_index[1] -
+ (instr->num_components - 1) * type_sz(dest.type);
+
+ bool supports_64bit_indirects =
+ !devinfo->is_cherryview && !devinfo->is_broxton;
+
+ if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
+ for (unsigned j = 0; j < instr->num_components; j++) {
+ bld.emit(SHADER_OPCODE_MOV_INDIRECT,
+ offset(dest, bld, j), offset(src, bld, j),
+ indirect, brw_imm_ud(read_size));
+ }
+ } else {
+ const unsigned num_mov_indirects =
+ type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
+ /* We read a little bit less per MOV INDIRECT, as they are now
+ * 32-bits ones instead of 64-bit. Fix read_size then.
+ */
+ const unsigned read_size_32bit = read_size -
+ (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
+ for (unsigned j = 0; j < instr->num_components; j++) {
+ for (unsigned i = 0; i < num_mov_indirects; i++) {
+ bld.emit(SHADER_OPCODE_MOV_INDIRECT,
+ subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
+ subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
+ indirect, brw_imm_ud(read_size_32bit));
+ }
+ }
+ }
+ }
+ break;
+ }
+
+ case nir_intrinsic_load_ubo: {
+ nir_const_value *const_index = nir_src_as_const_value(instr->src[0]);
+ fs_reg surf_index;
+
+ if (const_index) {
+ const unsigned index = stage_prog_data->binding_table.ubo_start +
+ const_index->u32[0];
+ surf_index = brw_imm_ud(index);
+ brw_mark_surface_used(prog_data, index);
+ } else {
+ /* The block index is not a constant. Evaluate the index expression
+ * per-channel and add the base UBO index; we have to select a value
+ * from any live channel.
+ */
+ surf_index = vgrf(glsl_type::uint_type);
+ bld.ADD(surf_index, get_nir_src(instr->src[0]),
+ brw_imm_ud(stage_prog_data->binding_table.ubo_start));
+ surf_index = bld.emit_uniformize(surf_index);
+
+ /* Assume this may touch any UBO. It would be nice to provide
+ * a tighter bound, but the array information is already lowered away.
+ */
+ brw_mark_surface_used(prog_data,
+ stage_prog_data->binding_table.ubo_start +
+ nir->info->num_ubos - 1);
+ }
+
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+ if (const_offset == NULL) {
+ fs_reg base_offset = retype(get_nir_src(instr->src[1]),
+ BRW_REGISTER_TYPE_UD);
+
+ for (int i = 0; i < instr->num_components; i++)
+ VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
+ base_offset, i * type_sz(dest.type));
+ } else {
+ /* Even if we are loading doubles, a pull constant load will load
+ * a 32-bit vec4, so should only reserve vgrf space for that. If we
+ * need to load a full dvec4 we will have to emit 2 loads. This is
+ * similar to demote_pull_constants(), except that in that case we
+ * see individual accesses to each component of the vector and then
+ * we let CSE deal with duplicate loads. Here we see a vector access
+ * and we have to split it if necessary.
+ */
+ const unsigned type_size = type_sz(dest.type);
+ const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
+ const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
+ const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+
+ for (unsigned c = 0; c < instr->num_components;) {
+ const unsigned base = const_offset->u32[0] + c * type_size;
+ /* Number of usable components in the next block-aligned load. */
+ const unsigned count = MIN2(instr->num_components - c,
+ (block_sz - base % block_sz) / type_size);
+
+ ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+ packed_consts, surf_index,
+ brw_imm_ud(base & ~(block_sz - 1)));
+
+ const fs_reg consts =
+ retype(byte_offset(packed_consts, base & (block_sz - 1)),
+ dest.type);
+
+ for (unsigned d = 0; d < count; d++)
+ bld.MOV(offset(dest, bld, c + d), component(consts, d));
+
+ c += count;
+ }
+ }
+ break;
+ }
+
+ case nir_intrinsic_load_ssbo: {
+ assert(devinfo->gen >= 7);
+
+ nir_const_value *const_uniform_block =
+ nir_src_as_const_value(instr->src[0]);
+
+ fs_reg surf_index;
+ if (const_uniform_block) {
+ unsigned index = stage_prog_data->binding_table.ssbo_start +
+ const_uniform_block->u32[0];
+ surf_index = brw_imm_ud(index);
+ brw_mark_surface_used(prog_data, index);
+ } else {
+ surf_index = vgrf(glsl_type::uint_type);
+ bld.ADD(surf_index, get_nir_src(instr->src[0]),
+ brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
+
+ /* Assume this may touch any SSBO. It would be nice to provide
+ * a tighter bound, but the array information is already lowered away.
+ */
+ brw_mark_surface_used(prog_data,
+ stage_prog_data->binding_table.ssbo_start +
+ nir->info->num_ssbos - 1);
+ }
+
+ fs_reg offset_reg;
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+ if (const_offset) {
+ offset_reg = brw_imm_ud(const_offset->u32[0]);
+ } else {
+ offset_reg = get_nir_src(instr->src[1]);
+ }
+
+ /* Read the vector */
+ do_untyped_vector_read(bld, dest, surf_index, offset_reg,
+ instr->num_components);
+
+ break;
+ }
+
+ case nir_intrinsic_store_ssbo: {
+ assert(devinfo->gen >= 7);
+
+ if (stage == MESA_SHADER_FRAGMENT)
+ brw_wm_prog_data(prog_data)->has_side_effects = true;
+
+ /* Block index */
+ fs_reg surf_index;
+ nir_const_value *const_uniform_block =
+ nir_src_as_const_value(instr->src[1]);
+ if (const_uniform_block) {
+ unsigned index = stage_prog_data->binding_table.ssbo_start +
+ const_uniform_block->u32[0];
+ surf_index = brw_imm_ud(index);
+ brw_mark_surface_used(prog_data, index);
+ } else {
+ surf_index = vgrf(glsl_type::uint_type);
+ bld.ADD(surf_index, get_nir_src(instr->src[1]),
+ brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
+
+ brw_mark_surface_used(prog_data,
+ stage_prog_data->binding_table.ssbo_start +
+ nir->info->num_ssbos - 1);
+ }
+
+ /* Value */
+ fs_reg val_reg = get_nir_src(instr->src[0]);
+
+ /* Writemask */
+ unsigned writemask = instr->const_index[0];
+
+ /* get_nir_src() retypes to integer. Be wary of 64-bit types though
+ * since the untyped writes below operate in units of 32-bits, which
+ * means that we need to write twice as many components each time.
+ * Also, we have to shuffle 64-bit data to be in the appropriate layout
+ * expected by our 32-bit write messages.
+ */
+ unsigned type_size = 4;
+ unsigned bit_size = instr->src[0].is_ssa ?
+ instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
+ if (bit_size == 64) {
+ type_size = 8;
+ fs_reg tmp =
+ fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
+ shuffle_64bit_data_for_32bit_write(bld,
+ retype(tmp, BRW_REGISTER_TYPE_F),
+ retype(val_reg, BRW_REGISTER_TYPE_DF),
+ instr->num_components);
+ val_reg = tmp;
+ }
+
+ unsigned type_slots = type_size / 4;
+
+ /* Combine groups of consecutive enabled channels in one write
+ * message. We use ffs to find the first enabled channel and then ffs on
+ * the bit-inverse, down-shifted writemask to determine the length of
+ * the block of enabled bits.
+ */
+ while (writemask) {
+ unsigned first_component = ffs(writemask) - 1;
+ unsigned length = ffs(~(writemask >> first_component)) - 1;
+
+ /* We can't write more than 2 64-bit components at once. Limit the
+ * length of the write to what we can do and let the next iteration
+ * handle the rest
+ */
+ if (type_size > 4)
+ length = MIN2(2, length);
+
+ fs_reg offset_reg;
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
+ if (const_offset) {
+ offset_reg = brw_imm_ud(const_offset->u32[0] +
+ type_size * first_component);
+ } else {
+ offset_reg = vgrf(glsl_type::uint_type);
+ bld.ADD(offset_reg,
+ retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(type_size * first_component));
+ }
+
+
+ emit_untyped_write(bld, surf_index, offset_reg,
+ offset(val_reg, bld, first_component * type_slots),
+ 1 /* dims */, length * type_slots,
+ BRW_PREDICATE_NONE);
+
+ /* Clear the bits in the writemask that we just wrote, then try
+ * again to see if more channels are left.
+ */
+ writemask &= (15 << (first_component + length));
+ }
+ break;
+ }
+
+ case nir_intrinsic_store_output: {
+ fs_reg src = get_nir_src(instr->src[0]);
+
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+ assert(const_offset && "Indirect output stores not allowed");
+ fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
+ 4 * const_offset->u32[0]), src.type);
+
+ unsigned num_components = instr->num_components;
+ unsigned first_component = nir_intrinsic_component(instr);
+ unsigned bit_size = instr->src[0].is_ssa ?
+ instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
+ if (bit_size == 64) {
+ fs_reg tmp =
+ fs_reg(VGRF, alloc.allocate(2 * num_components),
+ BRW_REGISTER_TYPE_F);
+ shuffle_64bit_data_for_32bit_write(
+ bld, tmp, retype(src, BRW_REGISTER_TYPE_DF), num_components);
+ src = retype(tmp, src.type);
+ num_components *= 2;
+ }
+
+ for (unsigned j = 0; j < num_components; j++) {
+ bld.MOV(offset(new_dest, bld, j + first_component),
+ offset(src, bld, j));
+ }
+ break;
+ }
+
+ case nir_intrinsic_ssbo_atomic_add:
+ nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_imin:
+ nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_umin:
+ nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_imax:
+ nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_umax:
+ nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_and:
+ nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_or:
+ nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_xor:
+ nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_exchange:
+ nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
+ break;
+
+ case nir_intrinsic_get_buffer_size: {
+ nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
+ unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
+
+ /* A resinfo's sampler message is used to get the buffer size. The
+ * SIMD8's writeback message consists of four registers and SIMD16's
+ * writeback message consists of 8 destination registers (two per each
+ * component). Because we are only interested on the first channel of
+ * the first returned component, where resinfo returns the buffer size
+ * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
+ * the dispatch width.
+ */
+ const fs_builder ubld = bld.exec_all().group(8, 0);
+ fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+ fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+
+ /* Set LOD = 0 */
+ ubld.MOV(src_payload, brw_imm_d(0));
+
+ const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
+ fs_inst *inst = ubld.emit(FS_OPCODE_GET_BUFFER_SIZE, ret_payload,
+ src_payload, brw_imm_ud(index));
+ inst->header_size = 0;
+ inst->mlen = 1;
+ inst->size_written = 4 * REG_SIZE;
+
+ bld.MOV(retype(dest, ret_payload.type), component(ret_payload, 0));
+ brw_mark_surface_used(prog_data, index);
+ break;
+ }
+
+ case nir_intrinsic_load_channel_num: {
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
+ dest = retype(dest, BRW_REGISTER_TYPE_UD);
+ const fs_builder allbld8 = bld.group(8, 0).exec_all();
+ allbld8.MOV(tmp, brw_imm_v(0x76543210));
+ if (dispatch_width > 8)
+ allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u));
+ if (dispatch_width > 16) {
+ const fs_builder allbld16 = bld.group(16, 0).exec_all();
+ allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u));
+ }
+ bld.MOV(dest, tmp);
+ break;
+ }
+
+ default:
+ unreachable("unknown intrinsic");
+ }
+}
+
+/**
+ * Emits an untyped atomic operation on an SSBO for a NIR ssbo_atomic_*
+ * intrinsic.
+ *
+ * src[0] of the intrinsic selects the buffer, src[1] is the offset and
+ * src[2] (plus src[3] for BRW_AOP_CMPWR) carries the data operands. The
+ * value returned by the atomic is copied into the intrinsic's destination.
+ *
+ * @param bld   builder the instructions are emitted with
+ * @param op    BRW_AOP_* atomic opcode to perform
+ * @param instr the NIR intrinsic being translated
+ */
+void
+fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
+                                 int op, nir_intrinsic_instr *instr)
+{
+   /* Flag fragment programs that contain SSBO atomics as having side
+    * effects.
+    */
+   if (stage == MESA_SHADER_FRAGMENT)
+      brw_wm_prog_data(prog_data)->has_side_effects = true;
+
+   fs_reg result_reg;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      result_reg = get_nir_dest(instr->dest);
+
+   /* Work out the binding table entry of the buffer. */
+   fs_reg surf;
+   nir_const_value *const block_index =
+      nir_src_as_const_value(instr->src[0]);
+   if (block_index == NULL) {
+      surf = vgrf(glsl_type::uint_type);
+      bld.ADD(surf, get_nir_src(instr->src[0]),
+              brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
+
+      /* The buffer is picked at run time, so assume any SSBO may be
+       * touched, as is done for other UBO/SSBO accesses with a
+       * non-constant surface.
+       */
+      brw_mark_surface_used(prog_data,
+                            stage_prog_data->binding_table.ssbo_start +
+                            nir->info->num_ssbos - 1);
+   } else {
+      const unsigned bti = stage_prog_data->binding_table.ssbo_start +
+                           block_index->u32[0];
+      surf = brw_imm_ud(bti);
+      brw_mark_surface_used(prog_data, bti);
+   }
+
+   const fs_reg addr = get_nir_src(instr->src[1]);
+   const fs_reg operand0 = get_nir_src(instr->src[2]);
+   fs_reg operand1;
+   if (op == BRW_AOP_CMPWR)
+      operand1 = get_nir_src(instr->src[3]);
+
+   /* Perform the atomic and hand its result back to NIR. */
+   const fs_reg atomic_result =
+      emit_untyped_atomic(bld, surf, addr, operand0, operand1,
+                          1 /* dims */, 1 /* rsize */,
+                          op, BRW_PREDICATE_NONE);
+   result_reg.type = atomic_result.type;
+   bld.MOV(result_reg, atomic_result);
+}
+
+/**
+ * Emits an untyped atomic operation on the GEN7_BTI_SLM surface for a NIR
+ * shared-variable atomic intrinsic.
+ *
+ * The address is instr->const_index[0] plus src[0]; src[1] (plus src[2]
+ * for BRW_AOP_CMPWR) carries the data operands. The value returned by the
+ * atomic is copied into the intrinsic's destination.
+ *
+ * @param bld   builder the instructions are emitted with
+ * @param op    BRW_AOP_* atomic opcode to perform
+ * @param instr the NIR intrinsic being translated
+ */
+void
+fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
+                                   int op, nir_intrinsic_instr *instr)
+{
+   fs_reg result_reg;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      result_reg = get_nir_dest(instr->dest);
+
+   const fs_reg slm_surface = brw_imm_ud(GEN7_BTI_SLM);
+
+   const fs_reg operand0 = get_nir_src(instr->src[1]);
+   fs_reg operand1;
+   if (op == BRW_AOP_CMPWR)
+      operand1 = get_nir_src(instr->src[2]);
+
+   /* Build the address: fold a constant offset into an immediate,
+    * otherwise add the base to the dynamic offset at run time.
+    */
+   fs_reg addr;
+   nir_const_value *const imm_offset =
+      nir_src_as_const_value(instr->src[0]);
+   if (imm_offset != NULL) {
+      addr = brw_imm_ud(instr->const_index[0] + imm_offset->u32[0]);
+   } else {
+      addr = vgrf(glsl_type::uint_type);
+      bld.ADD(addr,
+              retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
+              brw_imm_ud(instr->const_index[0]));
+   }
+
+   /* Emit the actual atomic operation */
+   const fs_reg atomic_result =
+      emit_untyped_atomic(bld, slm_surface, addr, operand0, operand1,
+                          1 /* dims */, 1 /* rsize */,
+                          op, BRW_PREDICATE_NONE);
+   result_reg.type = atomic_result.type;
+   bld.MOV(result_reg, atomic_result);
+}
+
+/**
+ * Translates a NIR texture instruction into a logical sampler instruction.
+ *
+ * The NIR sources are sorted into the TEX_LOGICAL_SRC_* slots, the logical
+ * opcode matching instr->op is emitted into a temporary four-component
+ * VGRF, and the components NIR expects are then gathered into the
+ * instruction's destination with LOAD_PAYLOAD.
+ *
+ * nir_texop_samples_identical is handled separately: its result is computed
+ * by comparing the MCS value against zero and the function returns early.
+ *
+ * @param bld   builder the instructions are emitted with
+ * @param instr the NIR texture instruction being translated
+ */
+void
+fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
+{
+   unsigned texture = instr->texture_index;
+   unsigned sampler = instr->sampler_index;
+
+   fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
+
+   /* Start from the constant indices; the *_offset sources below replace
+    * them when the texture or sampler is indexed dynamically.
+    */
+   srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
+   srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);
+
+   int lod_components = 0;
+
+   /* The hardware requires a LOD for buffer textures */
+   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
+      srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
+
+   /* Bits accumulated here end up in inst->offset of the emitted
+    * instruction.
+    */
+   uint32_t header_bits = 0;
+   for (unsigned i = 0; i < instr->num_srcs; i++) {
+      fs_reg src = get_nir_src(instr->src[i].src);
+      switch (instr->src[i].src_type) {
+      case nir_tex_src_bias:
+         srcs[TEX_LOGICAL_SRC_LOD] =
+            retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
+         break;
+      case nir_tex_src_comparator:
+         srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
+         break;
+      case nir_tex_src_coord:
+         /* Texel fetches take integer coordinates, everything else takes
+          * floats.
+          */
+         switch (instr->op) {
+         case nir_texop_txf:
+         case nir_texop_txf_ms:
+         case nir_texop_txf_ms_mcs:
+         case nir_texop_samples_identical:
+            srcs[TEX_LOGICAL_SRC_COORDINATE] =
+               retype(src, BRW_REGISTER_TYPE_D);
+            break;
+         default:
+            srcs[TEX_LOGICAL_SRC_COORDINATE] =
+               retype(src, BRW_REGISTER_TYPE_F);
+            break;
+         }
+         break;
+      case nir_tex_src_ddx:
+         srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
+         lod_components = nir_tex_instr_src_size(instr, i);
+         break;
+      case nir_tex_src_ddy:
+         srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
+         break;
+      case nir_tex_src_lod:
+         /* The register type of the LOD depends on the operation. */
+         switch (instr->op) {
+         case nir_texop_txs:
+            srcs[TEX_LOGICAL_SRC_LOD] =
+               retype(get_nir_src_imm(instr->src[i].src),
+                      BRW_REGISTER_TYPE_UD);
+            break;
+         case nir_texop_txf:
+            srcs[TEX_LOGICAL_SRC_LOD] =
+               retype(get_nir_src_imm(instr->src[i].src),
+                      BRW_REGISTER_TYPE_D);
+            break;
+         default:
+            srcs[TEX_LOGICAL_SRC_LOD] =
+               retype(get_nir_src_imm(instr->src[i].src),
+                      BRW_REGISTER_TYPE_F);
+            break;
+         }
+         break;
+      case nir_tex_src_ms_index:
+         srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] =
+            retype(src, BRW_REGISTER_TYPE_UD);
+         break;
+
+      case nir_tex_src_offset: {
+         nir_const_value *const_offset =
+            nir_src_as_const_value(instr->src[i].src);
+         unsigned offset_bits = 0;
+         /* A constant offset accepted by brw_texture_offset() is folded
+          * into the header bits; any other offset is passed as a source.
+          */
+         if (const_offset &&
+             brw_texture_offset(const_offset->i32,
+                                nir_tex_instr_src_size(instr, i),
+                                &offset_bits)) {
+            header_bits |= offset_bits;
+         } else {
+            srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
+               retype(src, BRW_REGISTER_TYPE_D);
+         }
+         break;
+      }
+
+      case nir_tex_src_projector:
+         unreachable("should be lowered");
+
+      case nir_tex_src_texture_offset: {
+         /* Figure out the highest possible texture index and mark it as
+          * used
+          */
+         uint32_t max_used = texture + instr->texture_array_size - 1;
+         if (instr->op == nir_texop_tg4 && devinfo->gen < 8) {
+            max_used += stage_prog_data->binding_table.gather_texture_start;
+         } else {
+            max_used += stage_prog_data->binding_table.texture_start;
+         }
+         brw_mark_surface_used(prog_data, max_used);
+
+         /* Emit code to evaluate the actual indexing expression */
+         fs_reg tmp = vgrf(glsl_type::uint_type);
+         bld.ADD(tmp, src, brw_imm_ud(texture));
+         srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
+         break;
+      }
+
+      case nir_tex_src_sampler_offset: {
+         /* Emit code to evaluate the actual indexing expression */
+         fs_reg tmp = vgrf(glsl_type::uint_type);
+         bld.ADD(tmp, src, brw_imm_ud(sampler));
+         srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
+         break;
+      }
+
+      case nir_tex_src_ms_mcs:
+         assert(instr->op == nir_texop_txf_ms);
+         srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
+         break;
+
+      case nir_tex_src_plane: {
+         nir_const_value *const_plane =
+            nir_src_as_const_value(instr->src[i].src);
+         /* The plane index is dereferenced unconditionally below, so it
+          * has to be a constant.
+          */
+         assert(const_plane && "Indirect plane index not allowed");
+         const uint32_t plane = const_plane->u32[0];
+         const uint32_t texture_index =
+            instr->texture_index +
+            stage_prog_data->binding_table.plane_start[plane] -
+            stage_prog_data->binding_table.texture_start;
+
+         srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
+         break;
+      }
+
+      default:
+         unreachable("unknown texture source");
+      }
+   }
+
+   /* Multisample fetches need MCS data. If NIR did not supply it, either
+    * fetch it here or fall back to an immediate zero.
+    */
+   if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
+       (instr->op == nir_texop_txf_ms ||
+        instr->op == nir_texop_samples_identical)) {
+      if (devinfo->gen >= 7 &&
+          key_tex->compressed_multisample_layout_mask & (1 << texture)) {
+         srcs[TEX_LOGICAL_SRC_MCS] =
+            emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
+                           instr->coord_components,
+                           srcs[TEX_LOGICAL_SRC_SURFACE]);
+      } else {
+         srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
+      }
+   }
+
+   srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] =
+      brw_imm_d(instr->coord_components);
+   srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
+
+   if (instr->op == nir_texop_query_levels ||
+       (instr->op == nir_texop_tex && stage != MESA_SHADER_FRAGMENT)) {
+      /* textureQueryLevels() and texture() are implemented in terms of TXS
+       * and TXL respectively, so we need to pass a valid LOD argument.
+       */
+      assert(srcs[TEX_LOGICAL_SRC_LOD].file == BAD_FILE);
+      srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0u);
+   }
+
+   /* Pick the logical opcode for the NIR texture operation. */
+   enum opcode opcode;
+   switch (instr->op) {
+   case nir_texop_tex:
+      opcode = (stage == MESA_SHADER_FRAGMENT ? SHADER_OPCODE_TEX_LOGICAL :
+                SHADER_OPCODE_TXL_LOGICAL);
+      break;
+   case nir_texop_txb:
+      opcode = FS_OPCODE_TXB_LOGICAL;
+      break;
+   case nir_texop_txl:
+      opcode = SHADER_OPCODE_TXL_LOGICAL;
+      break;
+   case nir_texop_txd:
+      opcode = SHADER_OPCODE_TXD_LOGICAL;
+      break;
+   case nir_texop_txf:
+      opcode = SHADER_OPCODE_TXF_LOGICAL;
+      break;
+   case nir_texop_txf_ms:
+      if ((key_tex->msaa_16 & (1 << sampler)))
+         opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
+      else
+         opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
+      break;
+   case nir_texop_txf_ms_mcs:
+      opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
+      break;
+   case nir_texop_query_levels:
+   case nir_texop_txs:
+      opcode = SHADER_OPCODE_TXS_LOGICAL;
+      break;
+   case nir_texop_lod:
+      opcode = SHADER_OPCODE_LOD_LOGICAL;
+      break;
+   case nir_texop_tg4:
+      if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
+         opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
+      else
+         opcode = SHADER_OPCODE_TG4_LOGICAL;
+      break;
+   case nir_texop_texture_samples:
+      opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
+      break;
+   case nir_texop_samples_identical: {
+      fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
+
+      /* If mcs is an immediate value, it means there is no MCS.  In that
+       * case just return false.
+       */
+      if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
+         bld.MOV(dst, brw_imm_ud(0u));
+      } else if ((key_tex->msaa_16 & (1 << sampler))) {
+         /* Both MCS components have to be zero in this case, so OR them
+          * together before comparing.
+          */
+         fs_reg tmp = vgrf(glsl_type::uint_type);
+         bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
+                offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
+         bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
+      } else {
+         bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
+                 BRW_CONDITIONAL_EQ);
+      }
+      return;
+   }
+   default:
+      unreachable("unknown texture opcode");
+   }
+
+   if (instr->op == nir_texop_tg4) {
+      if (instr->component == 1 &&
+          key_tex->gather_channel_quirk_mask & (1 << texture)) {
+         /* gather4 sampler is broken for green channel on RG32F --
+          * we must ask for blue instead.
+          */
+         header_bits |= 2 << 16;
+      } else {
+         header_bits |= instr->component << 16;
+      }
+   }
+
+   fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);
+   fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
+   inst->offset = header_bits;
+
+   /* On gen9+ (except for tg4 and query_levels) size_written only covers
+    * the components up to the last one that is read; otherwise it covers
+    * all four.
+    */
+   const unsigned dest_size = nir_tex_instr_dest_size(instr);
+   if (devinfo->gen >= 9 &&
+       instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
+      unsigned write_mask = instr->dest.is_ssa ?
+                            nir_ssa_def_components_read(&instr->dest.ssa):
+                            (1 << dest_size) - 1;
+      assert(write_mask != 0); /* dead code should have been eliminated */
+      inst->size_written = util_last_bit(write_mask) *
+                           inst->dst.component_size(inst->exec_size);
+   } else {
+      inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
+   }
+
+   if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
+      inst->shadow_compare = true;
+
+   if (instr->op == nir_texop_tg4 && devinfo->gen == 6)
+      emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);
+
+   /* Select which components of the sampler result go to NIR. */
+   fs_reg nir_dest[4];
+   for (unsigned i = 0; i < dest_size; i++)
+      nir_dest[i] = offset(dst, bld, i);
+
+   if (instr->op == nir_texop_query_levels) {
+      /* # levels is in .w */
+      nir_dest[0] = offset(dst, bld, 3);
+   } else if (instr->op == nir_texop_txs &&
+              dest_size >= 3 && devinfo->gen < 7) {
+      /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
+      fs_reg depth = offset(dst, bld, 2);
+      nir_dest[2] = vgrf(glsl_type::int_type);
+      bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
+   }
+
+   bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
+}
+
+/**
+ * Emits the control-flow instruction for a NIR jump.
+ *
+ * Only break and continue are expected here; any other jump type,
+ * including nir_jump_return, hits unreachable().
+ */
+void
+fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
+{
+   if (instr->type == nir_jump_break) {
+      bld.emit(BRW_OPCODE_BREAK);
+   } else if (instr->type == nir_jump_continue) {
+      bld.emit(BRW_OPCODE_CONTINUE);
+   } else {
+      unreachable("unknown jump");
+   }
+}
+
+/**
+ * Repacks the result of a load that returned 32-bit elements so that it
+ * can be consumed as 64-bit data.
+ *
+ * The load result is laid out one 32-bit component per row:
+ *
+ *    x x x x x x x x
+ *    y y y y y y y y
+ *    z z z z z z z z
+ *    w w w w w w w w
+ *
+ * and this helper interleaves each pair of rows to produce:
+ *
+ *    x y x y x y x y
+ *    x y x y x y x y
+ *    z w z w z w z w
+ *    z w z w z w z w
+ *
+ * That is the layout needed when the load is reading 64-bit components
+ * such as doubles: x holds the low 32 bits and y the high 32 bits of the
+ * first double, and likewise z and w for the second one. @components is
+ * the number of 64-bit components present in @src, typically at most 2
+ * because a vec4 load result only has room for two doubles.
+ *
+ * @dst and @src are allowed to be the same register.
+ */
+void
+shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld,
+                                        const fs_reg &dst,
+                                        const fs_reg &src,
+                                        uint32_t components)
+{
+   assert(type_sz(src.type) == 4);
+   assert(type_sz(dst.type) == 8);
+
+   /* Every 64-bit component is assembled in a scratch register before it
+    * is copied to dst. Writing to dst directly is not possible because dst
+    * can be (and usually is) the same register as src, in which case
+    * storing the low half would clobber data the high-half copy still has
+    * to read.
+    */
+   const fs_reg scratch = bld.vgrf(dst.type);
+   const fs_reg scratch_lo = subscript(scratch, src.type, 0);
+   const fs_reg scratch_hi = subscript(scratch, src.type, 1);
+
+   for (unsigned comp = 0; comp < components; comp++) {
+      const fs_reg lo_src = offset(src, bld, 2 * comp);
+      const fs_reg hi_src = offset(lo_src, bld, 1);
+
+      bld.MOV(scratch_lo, lo_src);
+      bld.MOV(scratch_hi, hi_src);
+
+      bld.MOV(offset(dst, bld, comp), scratch);
+   }
+}
+
+/**
+ * Inverse of shuffle_32bit_load_result_to_64bit_data().
+ *
+ * Untyped write messages operate on 32-bit components, so 64-bit data has
+ * to be split into its 32-bit halves and arranged in the layout those
+ * messages expect before it can be written.
+ *
+ * Unlike the inverse operation, callers would typically pass different
+ * registers for dst and src, since they would otherwise corrupt the
+ * original 64-bit data they are about to write. The function therefore
+ * asserts that the src and dst regions involved do not overlap.
+ */
+void
+shuffle_64bit_data_for_32bit_write(const fs_builder &bld,
+                                   const fs_reg &dst,
+                                   const fs_reg &src,
+                                   uint32_t components)
+{
+   assert(type_sz(src.type) == 8);
+   assert(type_sz(dst.type) == 4);
+
+   /* dst receives two 32-bit components per 64-bit source component. */
+   assert(!regions_overlap(
+             dst, 2 * components * dst.component_size(bld.dispatch_width()),
+             src, components * src.component_size(bld.dispatch_width())));
+
+   for (unsigned comp = 0; comp < components; comp++) {
+      const fs_reg wide = offset(src, bld, comp);
+      const fs_reg lo_dst = offset(dst, bld, 2 * comp);
+      const fs_reg hi_dst = offset(dst, bld, 2 * comp + 1);
+
+      bld.MOV(lo_dst, subscript(wide, dst.type, 0));
+      bld.MOV(hi_dst, subscript(wide, dst.type, 1));
+   }
+}
+
+/**
+ * Returns a register that provides the double-precision constant @v,
+ * built with whatever mechanism the device in bld.shader->devinfo
+ * supports. Requires gen7 or later.
+ */
+fs_reg
+setup_imm_df(const fs_builder &bld, double v)
+{
+   const struct gen_device_info *devinfo = bld.shader->devinfo;
+   assert(devinfo->gen >= 7);
+
+   /* Gen8 and later simply use a DF immediate. */
+   if (devinfo->gen >= 8)
+      return brw_imm_df(v);
+
+   /* Both remaining paths build the constant with a single-channel
+    * exec_all builder.
+    */
+   const fs_builder scalar_bld = bld.exec_all().group(1, 0);
+
+   /* gen7.5 does not support DF immediates in a straightforward way, but
+    * the DIM instruction allows setting the 64-bit immediate value.
+    */
+   if (devinfo->is_haswell) {
+      const fs_reg dim_dst = scalar_bld.vgrf(BRW_REGISTER_TYPE_DF, 1);
+      scalar_bld.DIM(dim_dst, brw_imm_df(v));
+      return component(dim_dst, 0);
+   }
+
+   /* gen7 does not support DF immediates, so we generate a 64-bit constant
+    * by writing the low 32-bit of the constant to suboffset 0 of a VGRF and
+    * the high 32-bit to suboffset 4 and then applying a stride of 0.
+    *
+    * Alternatively, we could also produce a normal VGRF (without stride 0)
+    * by writing to all the channels in the VGRF, however, that would hit
+    * the gen7 bug where we have to split writes that span more than 1
+    * register into instructions with a width of 4 (otherwise the write to
+    * the second register written runs into an execmask hardware bug) which
+    * isn't very nice.
+    */
+   union {
+      double value;
+      struct {
+         uint32_t dword0;
+         uint32_t dword1;
+      };
+   } halves;
+
+   halves.value = v;
+
+   const fs_reg pair = scalar_bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+   scalar_bld.MOV(pair, brw_imm_ud(halves.dword0));
+   scalar_bld.MOV(horiz_offset(pair, 1), brw_imm_ud(halves.dword1));
+
+   return component(retype(pair, BRW_REGISTER_TYPE_DF), 0);
+}
diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp
new file mode 100644
index 00000000000..5c6f3d490f0
--- /dev/null
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -0,0 +1,992 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Eric Anholt <[email protected]>
+ *
+ */
+
+#include "brw_eu.h"
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "util/register_allocate.h"
+
+using namespace brw;
+
+/**
+ * Rewrites a VGRF register reference in place so that it points at the
+ * hardware register chosen for it.
+ *
+ * \param reg_hw_locations  table indexed by virtual register number giving
+ *                          the first hardware register of that VGRF.
+ * \param reg               register to rewrite; anything that is not in the
+ *                          VGRF file is left untouched.
+ */
+static void
+assign_reg(unsigned *reg_hw_locations, fs_reg *reg)
+{
+   if (reg->file != VGRF)
+      return;
+
+   /* Fold the whole registers contained in the byte offset into the register
+    * number and keep only the sub-register remainder as the offset.
+    */
+   const unsigned whole_regs = reg->offset / REG_SIZE;
+   reg->nr = reg_hw_locations[reg->nr] + whole_regs;
+   reg->offset %= REG_SIZE;
+}
+
+/**
+ * Trivial register allocator: lays every virtual GRF out back to back right
+ * after the payload, without any reuse, and rewrites all instructions
+ * accordingly.  Reports a failure if that does not fit below max_grf.
+ */
+void
+fs_visitor::assign_regs_trivial()
+{
+   unsigned hw_reg_mapping[this->alloc.count + 1];
+   const int reg_width = dispatch_width / 8;
+
+   /* Running prefix sum of the VGRF sizes.  Note that compressed
+    * instructions require alignment to 2 registers, hence the ALIGN of the
+    * starting point.
+    */
+   unsigned next_grf = ALIGN(this->first_non_payload_grf, reg_width);
+   for (unsigned vgrf = 0; vgrf < this->alloc.count; vgrf++) {
+      hw_reg_mapping[vgrf] = next_grf;
+      next_grf += this->alloc.sizes[vgrf];
+   }
+   hw_reg_mapping[this->alloc.count] = next_grf;
+   this->grf_used = next_grf;
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      assign_reg(hw_reg_mapping, &inst->dst);
+      for (unsigned s = 0; s < inst->sources; s++)
+         assign_reg(hw_reg_mapping, &inst->src[s]);
+   }
+
+   if (this->grf_used < max_grf) {
+      this->alloc.count = this->grf_used;
+   } else {
+      fail("Ran out of regs on trivial allocator (%d/%d)\n",
+           this->grf_used, max_grf);
+   }
+}
+
+/**
+ * Builds the register set (register classes, conflicts and q values) for one
+ * dispatch width and stores it in
+ * compiler->fs_reg_sets[log2(dispatch_width / 8)], which is what
+ * fs_visitor::assign_regs() later hands to the register allocator.
+ *
+ * One class is created per VGRF size from 1 to MAX_VGRF_SIZE, plus an
+ * optional "aligned pairs" class used for PLN on Gen <= 6 in SIMD8.
+ *
+ * \param compiler        compiler whose fs_reg_sets[] entry is filled in; it
+ *                        is also the ralloc context of the allocations made
+ *                        here.
+ * \param dispatch_width  SIMD width the set is built for.
+ */
+static void
+brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+   int base_reg_count = BRW_MAX_GRF;
+   const int index = _mesa_logbase2(dispatch_width / 8);
+
+   if (dispatch_width > 8 && devinfo->gen >= 7) {
+      /* For IVB+, we don't need the PLN hacks or the even-reg alignment in
+       * SIMD16.  Therefore, we can use the exact same register sets for
+       * SIMD16 as we do for SIMD8 and we don't need to recalculate them.
+       */
+      compiler->fs_reg_sets[index] = compiler->fs_reg_sets[0];
+      return;
+   }
+
+   /* The registers used to make up almost all values handled in the compiler
+    * are a scalar value occupying a single register (or 2 registers in the
+    * case of SIMD16, which is handled by dividing base_reg_count by 2 and
+    * multiplying allocated register numbers by 2).  Things that were
+    * aggregates of scalar values at the GLSL level were split to scalar
+    * values by split_virtual_grfs().
+    *
+    * However, texture SEND messages return a series of contiguous registers
+    * to write into.  We currently always ask for 4 registers, but we may
+    * convert that to use less some day.
+    *
+    * Additionally, on gen5 we need aligned pairs of registers for the PLN
+    * instruction, and on gen4 we need 8 contiguous regs for workaround simd16
+    * texturing.
+    */
+   const int class_count = MAX_VGRF_SIZE;
+   int class_sizes[MAX_VGRF_SIZE];
+   for (unsigned i = 0; i < MAX_VGRF_SIZE; i++)
+      class_sizes[i] = i + 1;
+
+   memset(compiler->fs_reg_sets[index].class_to_ra_reg_range, 0,
+          sizeof(compiler->fs_reg_sets[index].class_to_ra_reg_range));
+   int *class_to_ra_reg_range = compiler->fs_reg_sets[index].class_to_ra_reg_range;
+
+   /* Number of entries in class_to_ra_reg_range[], taken from the array
+    * itself so the loops below stay in bounds if its size ever changes.
+    */
+   const unsigned range_count =
+      ARRAY_SIZE(compiler->fs_reg_sets[index].class_to_ra_reg_range);
+
+   /* Compute the total number of registers across all classes. */
+   int ra_reg_count = 0;
+   for (int i = 0; i < class_count; i++) {
+      if (devinfo->gen <= 5 && dispatch_width >= 16) {
+         /* From the G45 PRM:
+          *
+          * In order to reduce the hardware complexity, the following
+          * rules and restrictions apply to the compressed instruction:
+          * ...
+          * * Operand Alignment Rule: With the exceptions listed below, a
+          *   source/destination operand in general should be aligned to
+          *   even 256-bit physical register with a region size equal to
+          *   two 256-bit physical register
+          */
+         ra_reg_count += (base_reg_count - (class_sizes[i] - 1)) / 2;
+      } else {
+         ra_reg_count += base_reg_count - (class_sizes[i] - 1);
+      }
+      /* Mark the last register.  We'll fill in the beginnings later.  The
+       * table is indexed by class size, so entry 0 stays at zero.
+       */
+      assert(unsigned(class_sizes[i]) < range_count);
+      class_to_ra_reg_range[class_sizes[i]] = ra_reg_count;
+   }
+
+   /* Fill out the rest of the range markers: any entry that was not set
+    * above inherits the value of its predecessor.
+    */
+   for (unsigned i = 1; i < range_count; ++i) {
+      if (class_to_ra_reg_range[i] == 0)
+         class_to_ra_reg_range[i] = class_to_ra_reg_range[i-1];
+   }
+
+   uint8_t *ra_reg_to_grf = ralloc_array(compiler, uint8_t, ra_reg_count);
+   struct ra_regs *regs = ra_alloc_reg_set(compiler, ra_reg_count, false);
+   if (devinfo->gen >= 6)
+      ra_set_allocate_round_robin(regs);
+   int *classes = ralloc_array(compiler, int, class_count);
+   int aligned_pairs_class = -1;
+
+   /* Allocate space for q values.  We allocate class_count + 1 because we
+    * want to leave room for the aligned pairs class if we have it.
+    */
+   unsigned int **q_values = ralloc_array(compiler, unsigned int *,
+                                          class_count + 1);
+   for (int i = 0; i < class_count + 1; ++i)
+      q_values[i] = ralloc_array(q_values, unsigned int, class_count + 1);
+
+   /* Now, add the registers to their classes, and add the conflicts
+    * between them and the base GRF registers (and also each other).
+    */
+   int reg = 0;
+   int pairs_base_reg = 0;
+   int pairs_reg_count = 0;
+   for (int i = 0; i < class_count; i++) {
+      int class_reg_count;
+      if (devinfo->gen <= 5 && dispatch_width >= 16) {
+         class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2;
+
+         /* See comment below.  The only difference here is that we are
+          * dealing with pairs of registers instead of single registers.
+          * Registers of odd sizes simply get rounded up.
+          */
+         for (int j = 0; j < class_count; j++)
+            q_values[i][j] = (class_sizes[i] + 1) / 2 +
+                             (class_sizes[j] + 1) / 2 - 1;
+      } else {
+         class_reg_count = base_reg_count - (class_sizes[i] - 1);
+
+         /* From register_allocate.c:
+          *
+          * q(B,C) (indexed by C, B is this register class) in
+          * Runeson/Nyström paper.  This is "how many registers of B could
+          * the worst choice register from C conflict with".
+          *
+          * Letting the register allocation algorithm compute these values
+          * itself is extremely expensive.  However, since all of our
+          * registers are laid out, we can very easily compute them
+          * ourselves.  View the register from C as fixed starting at GRF n
+          * somewhere in the middle, and the register from B as sliding back
+          * and forth.  Then the first register to conflict from B is the
+          * one starting at n - class_size[B] + 1 and the last register to
+          * conflict will start at n + class_size[B] - 1.  Therefore, the
+          * number of conflicts from B is class_size[B] + class_size[C] - 1.
+          *
+          *      +-+-+-+-+-+-+     +-+-+-+-+-+-+
+          *    B | | | | | |n| --> | | | | | | |
+          *      +-+-+-+-+-+-+     +-+-+-+-+-+-+
+          *                  +-+-+-+-+-+
+          *    C             |n| | | | |
+          *                  +-+-+-+-+-+
+          */
+         for (int j = 0; j < class_count; j++)
+            q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
+      }
+      classes[i] = ra_alloc_reg_class(regs);
+
+      /* Save this off for the aligned pair class at the end. */
+      if (class_sizes[i] == 2) {
+         pairs_base_reg = reg;
+         pairs_reg_count = class_reg_count;
+      }
+
+      if (devinfo->gen <= 5 && dispatch_width >= 16) {
+         for (int j = 0; j < class_reg_count; j++) {
+            ra_class_add_reg(regs, classes[i], reg);
+
+            ra_reg_to_grf[reg] = j * 2;
+
+            for (int base_reg = j;
+                 base_reg < j + (class_sizes[i] + 1) / 2;
+                 base_reg++) {
+               ra_add_reg_conflict(regs, base_reg, reg);
+            }
+
+            reg++;
+         }
+      } else {
+         for (int j = 0; j < class_reg_count; j++) {
+            ra_class_add_reg(regs, classes[i], reg);
+
+            ra_reg_to_grf[reg] = j;
+
+            for (int base_reg = j;
+                 base_reg < j + class_sizes[i];
+                 base_reg++) {
+               ra_add_reg_conflict(regs, base_reg, reg);
+            }
+
+            reg++;
+         }
+      }
+   }
+   assert(reg == ra_reg_count);
+
+   /* Applying transitivity to all of the base registers gives us the
+    * appropriate register conflict relationships everywhere.
+    */
+   for (int reg = 0; reg < base_reg_count; reg++)
+      ra_make_reg_conflicts_transitive(regs, reg);
+
+   /* Add a special class for aligned pairs, which we'll put delta_xy
+    * in on Gen <= 6 so that we can do PLN.
+    */
+   if (devinfo->has_pln && dispatch_width == 8 && devinfo->gen <= 6) {
+      aligned_pairs_class = ra_alloc_reg_class(regs);
+
+      for (int i = 0; i < pairs_reg_count; i++) {
+         if ((ra_reg_to_grf[pairs_base_reg + i] & 1) == 0) {
+            ra_class_add_reg(regs, aligned_pairs_class, pairs_base_reg + i);
+         }
+      }
+
+      for (int i = 0; i < class_count; i++) {
+         /* These are a little counter-intuitive because the pair registers
+          * are required to be aligned while the register they are
+          * potentially interfering with are not.  In the case where the
+          * size is even, the worst-case is that the register is
+          * odd-aligned.  In the odd-size case, it doesn't matter.
+          */
+         q_values[class_count][i] = class_sizes[i] / 2 + 1;
+         q_values[i][class_count] = class_sizes[i] + 1;
+      }
+      q_values[class_count][class_count] = 1;
+   }
+
+   ra_set_finalize(regs, q_values);
+
+   ralloc_free(q_values);
+
+   /* Publish the result.  Class slots without a matching size stay at -1. */
+   compiler->fs_reg_sets[index].regs = regs;
+   for (unsigned i = 0; i < ARRAY_SIZE(compiler->fs_reg_sets[index].classes); i++)
+      compiler->fs_reg_sets[index].classes[i] = -1;
+   for (int i = 0; i < class_count; i++)
+      compiler->fs_reg_sets[index].classes[class_sizes[i] - 1] = classes[i];
+   compiler->fs_reg_sets[index].ra_reg_to_grf = ra_reg_to_grf;
+   compiler->fs_reg_sets[index].aligned_pairs_class = aligned_pairs_class;
+}
+
+/**
+ * Builds the register sets for every supported dispatch width.
+ *
+ * SIMD8 must come first: brw_alloc_reg_set() may simply copy the SIMD8 set
+ * (fs_reg_sets[0]) for the wider widths.
+ */
+void
+brw_fs_alloc_reg_sets(struct brw_compiler *compiler)
+{
+   static const int dispatch_widths[] = { 8, 16, 32 };
+
+   for (unsigned i = 0; i < ARRAY_SIZE(dispatch_widths); i++)
+      brw_alloc_reg_set(compiler, dispatch_widths[i]);
+}
+
+/**
+ * Given the block containing a DO, returns the end_ip of the block that
+ * holds the matching WHILE, taking nested loops into account.
+ */
+static int
+count_to_loop_end(const bblock_t *block)
+{
+   /* The loop may open and close within one and the same block. */
+   if (block->end()->opcode == BRW_OPCODE_WHILE)
+      return block->end_ip;
+
+   /* Start at the following block, since the DO found by the calling
+    * function is already accounted for by the initial nesting level.
+    */
+   const bblock_t *cur = block->next();
+   int nesting = 1;
+   while (nesting > 0) {
+      if (cur->start()->opcode == BRW_OPCODE_DO)
+         nesting++;
+
+      if (cur->end()->opcode == BRW_OPCODE_WHILE) {
+         if (--nesting == 0)
+            return cur->end_ip;
+      }
+
+      cur = cur->next();
+   }
+   unreachable("not reached");
+}
+
+/**
+ * Computes the ip of the last use of every payload register.
+ *
+ * \param payload_node_count   number of entries in \p payload_last_use_ip.
+ * \param payload_last_use_ip  output array; each entry receives the ip of
+ *                             the last instruction reading that payload
+ *                             register, or -1 if it is never read.  Uses
+ *                             inside a loop are extended to the end of the
+ *                             outermost enclosing loop.
+ */
+void fs_visitor::calculate_payload_ranges(int payload_node_count,
+                                          int *payload_last_use_ip)
+{
+   int loop_depth = 0;
+   int loop_end_ip = 0;
+
+   for (int i = 0; i < payload_node_count; i++)
+      payload_last_use_ip[i] = -1;
+
+   int ip = 0;
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      switch (inst->opcode) {
+      case BRW_OPCODE_DO:
+         loop_depth++;
+
+         /* Since payload regs are deffed only at the start of the shader
+          * execution, any uses of the payload within a loop mean the live
+          * interval extends to the end of the outermost loop.  Find the ip of
+          * the end now.
+          */
+         if (loop_depth == 1)
+            loop_end_ip = count_to_loop_end(block);
+         break;
+      case BRW_OPCODE_WHILE:
+         loop_depth--;
+         break;
+      default:
+         break;
+      }
+
+      int use_ip;
+      if (loop_depth > 0)
+         use_ip = loop_end_ip;
+      else
+         use_ip = ip;
+
+      /* Note that UNIFORM args have been turned into FIXED_GRF by
+       * assign_curbe_setup(), and interpolation uses fixed hardware regs from
+       * the start (see interp_reg()).
+       */
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == FIXED_GRF) {
+            int node_nr = inst->src[i].nr;
+            if (node_nr >= payload_node_count)
+               continue;
+
+            for (unsigned j = 0; j < regs_read(inst, i); j++) {
+               /* A read is not expected to straddle the end of the payload.
+                * Check before touching the array so that a build without
+                * assertions does not write past its end either.
+                */
+               assert(node_nr + j < unsigned(payload_node_count));
+               if (node_nr + j >= unsigned(payload_node_count))
+                  break;
+
+               payload_last_use_ip[node_nr + j] = use_ip;
+            }
+         }
+      }
+
+      /* Special case instructions which have extra implied registers used. */
+      switch (inst->opcode) {
+      case CS_OPCODE_CS_TERMINATE:
+         if (payload_node_count > 0)
+            payload_last_use_ip[0] = use_ip;
+         break;
+
+      default:
+         if (inst->eot) {
+            /* We could omit this for the !inst->header_present case, except
+             * that the simulator apparently incorrectly reads from g0/g1
+             * instead of sideband.  It also really freaks out driver
+             * developers to see g0 used in unusual places, so just always
+             * reserve it.
+             */
+            if (payload_node_count > 0)
+               payload_last_use_ip[0] = use_ip;
+            if (payload_node_count > 1)
+               payload_last_use_ip[1] = use_ip;
+         }
+         break;
+      }
+
+      ip++;
+   }
+}
+
+
+/**
+ * Makes the thread payload registers interfere with the virtual GRFs that
+ * are to be allocated for program temporaries.
+ *
+ * The payload is made reallocatable for virtual GRFs, notably because the
+ * setup coefficients for a full set of 16 FS inputs take up 8 of our 128
+ * registers.
+ *
+ * Layout of the payload registers:
+ *
+ * 0..payload.num_regs-1: fixed function setup (including bary coordinates).
+ * payload.num_regs..payload.num_regs+curb_read_length-1: uniform data
+ * payload.num_regs+curb_read_length..first_non_payload_grf-1: setup
+ * coefficients.
+ *
+ * There are payload_node_count nodes covering these registers in order
+ * (note that in SIMD16, a node is two registers).
+ */
+void
+fs_visitor::setup_payload_interference(struct ra_graph *g,
+                                       int payload_node_count,
+                                       int first_payload_node)
+{
+   int payload_last_use_ip[payload_node_count];
+   calculate_payload_ranges(payload_node_count, payload_last_use_ip);
+
+   for (int node = 0; node < payload_node_count; node++) {
+      const int last_use = payload_last_use_ip[node];
+
+      /* Payload registers that are never read constrain nothing. */
+      if (last_use == -1)
+         continue;
+
+      /* The payload node interferes with every virtual grf that is live
+       * between the start of the program and the last use of the payload
+       * node.
+       */
+      for (unsigned vgrf = 0; vgrf < this->alloc.count; vgrf++) {
+         /* A VGRF starting exactly at the last use still interferes (the
+          * comparison is inclusive, unlike virtual_grf_interferes()), so
+          * that the uniform issue described in calculate_live_intervals()
+          * does not have to be dealt with here.
+          */
+         if (this->virtual_grf_start[vgrf] > last_use)
+            continue;
+
+         ra_add_node_interference(g, first_payload_node + node, vgrf);
+      }
+   }
+
+   /* Pin each payload node to its physical register.  The alternative would
+    * be to have per-physical-register classes, which would just be silly.
+    *
+    * On Gen <= 5 in SIMD16 the register set only has even numbered
+    * registers, so the index is halved.  Some of the payload registers will
+    * be odd, but that's ok because their physical register numbers have
+    * already been assigned; this is only used for interference.
+    */
+   const bool even_regs_only = devinfo->gen <= 5 && dispatch_width >= 16;
+   for (int node = 0; node < payload_node_count; node++) {
+      const int ra_reg = even_regs_only ? node / 2 : node;
+      ra_set_node_reg(g, first_payload_node + node, ra_reg);
+   }
+}
+
+/**
+ * Fills the mrf_used array with one flag per MRF telling whether the shader
+ * IR uses it.
+ *
+ * assign_regs() relies on this to decide which of the GRFs used as MRFs on
+ * gen7 get normally register allocated, and register spilling relies on it
+ * to see whether MRFs can be used for spills without overwriting normal MRF
+ * contents.
+ *
+ * \param v         visitor whose instructions are scanned.
+ * \param mrf_used  output array of BRW_MAX_MRF(gen) entries.
+ */
+static void
+get_used_mrfs(fs_visitor *v, bool *mrf_used)
+{
+   const int reg_width = v->dispatch_width / 8;
+   const int mrf_count = BRW_MAX_MRF(v->devinfo->gen);
+
+   for (int m = 0; m < mrf_count; m++)
+      mrf_used[m] = false;
+
+   foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
+      /* Explicit MRF destinations. */
+      if (inst->dst.file == MRF) {
+         const bool compr4 = (inst->dst.nr & BRW_MRF_COMPR4) != 0;
+         const int first = inst->dst.nr & ~BRW_MRF_COMPR4;
+
+         mrf_used[first] = true;
+
+         /* With two registers per value the second half lands four MRFs
+          * further for COMPR4 destinations and in the next MRF otherwise.
+          */
+         if (reg_width == 2)
+            mrf_used[first + (compr4 ? 4 : 1)] = true;
+      }
+
+      /* MRFs written implicitly by message-sending instructions. */
+      if (inst->mlen > 0) {
+         for (int i = 0; i < v->implied_mrf_writes(inst); i++)
+            mrf_used[inst->base_mrf + i] = true;
+      }
+   }
+}
+
+/**
+ * Sets interference between virtual GRFs and usage of the high GRFs for SEND
+ * messages (treated as MRFs in code generation).
+ *
+ * \param v               visitor being register allocated.
+ * \param g               interference graph to update.
+ * \param first_mrf_node  graph node standing for the first MRF.
+ * \param first_used_mrf  receives the lowest MRF number in use, or
+ *                        BRW_MAX_MRF(gen) if none is.
+ */
+static void
+setup_mrf_hack_interference(fs_visitor *v, struct ra_graph *g,
+                            int first_mrf_node, int *first_used_mrf)
+{
+   const int mrf_count = BRW_MAX_MRF(v->devinfo->gen);
+   bool mrf_used[mrf_count];
+   get_used_mrfs(v, mrf_used);
+
+   *first_used_mrf = mrf_count;
+   for (int mrf = 0; mrf < mrf_count; mrf++) {
+      const int node = first_mrf_node + mrf;
+
+      /* Pin each MRF node to its physical register.  The alternative would
+       * be to have per-physical-register classes, which would just be silly.
+       */
+      ra_set_node_reg(g, node, GEN7_MRF_HACK_START + mrf);
+
+      if (!mrf_used[mrf])
+         continue;
+
+      *first_used_mrf = MIN2(*first_used_mrf, mrf);
+
+      /* There is no live/dead analysis on the MRFs, so every MRF that is
+       * used conflicts with all virtual GRFs.
+       */
+      for (unsigned vgrf = 0; vgrf < v->alloc.count; vgrf++)
+         ra_add_node_interference(g, node, vgrf);
+   }
+}
+
+/**
+ * Graph-coloring register allocation for the shader's virtual GRFs.
+ *
+ * Builds an interference graph containing one node per virtual GRF, one per
+ * payload register and, on Gen7+, one per MRF-hack register, runs the
+ * allocator, and on success rewrites every instruction to use hardware
+ * register numbers.
+ *
+ * \param allow_spilling  if allocation fails, spill one register before
+ *                        returning so that the caller can retry.
+ * \param spill_all       debug aid: spill a register (if any can be spilled)
+ *                        instead of attempting allocation.
+ *
+ * \return true if registers were assigned; false if a register was spilled
+ *         or allocation failed, in which case the caller is expected to
+ *         either call again or give up.
+ */
+bool
+fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
+{
+   /* Most of this allocation was written for a reg_width of 1
+    * (dispatch_width == 8).  In extending to SIMD16, the code was
+    * left in place and it was converted to have the hardware
+    * registers it's allocating be contiguous physical pairs of regs
+    * for reg_width == 2.
+    */
+   int reg_width = dispatch_width / 8;
+   unsigned hw_reg_mapping[this->alloc.count];
+   int payload_node_count = ALIGN(this->first_non_payload_grf, reg_width);
+   int rsi = _mesa_logbase2(reg_width); /* Which compiler->fs_reg_sets[] to use */
+   calculate_live_intervals();
+
+   /* Node layout: [virtual GRFs][payload registers][MRF hack registers]. */
+   int node_count = this->alloc.count;
+   int first_payload_node = node_count;
+   node_count += payload_node_count;
+   int first_mrf_hack_node = node_count;
+   if (devinfo->gen >= 7)
+      node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START;
+   struct ra_graph *g =
+      ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count);
+
+   for (unsigned i = 0; i < this->alloc.count; i++) {
+      unsigned size = this->alloc.sizes[i];
+      int c;
+
+      assert(size <= ARRAY_SIZE(compiler->fs_reg_sets[rsi].classes) &&
+             "Register allocation relies on split_virtual_grfs()");
+      c = compiler->fs_reg_sets[rsi].classes[size - 1];
+
+      /* Special case: on pre-GEN6 hardware that supports PLN, the
+       * second operand of a PLN instruction needs to be an
+       * even-numbered register, so we have a special register class
+       * wm_aligned_pairs_class to handle this case.  pre-GEN6 always
+       * uses this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] as the
+       * second operand of a PLN instruction (since it doesn't support
+       * any other interpolation modes).  So all we need to do is find
+       * that register and set it to the appropriate class.
+       */
+      if (compiler->fs_reg_sets[rsi].aligned_pairs_class >= 0 &&
+          this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].file == VGRF &&
+          this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].nr == i) {
+         c = compiler->fs_reg_sets[rsi].aligned_pairs_class;
+      }
+
+      ra_set_node_class(g, i, c);
+
+      for (unsigned j = 0; j < i; j++) {
+         if (virtual_grf_interferes(i, j)) {
+            ra_add_node_interference(g, i, j);
+         }
+      }
+   }
+
+   /* Certain instructions can't safely use the same register for their
+    * sources and destination.  Add interference.
+    *
+    * Every source the instruction actually has is considered, rather than a
+    * fixed count of three, so instructions with more sources are covered
+    * and slots beyond inst->sources are never looked at.
+    */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
+         for (unsigned i = 0; i < inst->sources; i++) {
+            if (inst->src[i].file == VGRF) {
+               ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
+            }
+         }
+      }
+   }
+
+   setup_payload_interference(g, payload_node_count, first_payload_node);
+   if (devinfo->gen >= 7) {
+      int first_used_mrf = BRW_MAX_MRF(devinfo->gen);
+      setup_mrf_hack_interference(this, g, first_mrf_hack_node,
+                                  &first_used_mrf);
+
+      foreach_block_and_inst(block, fs_inst, inst, cfg) {
+         /* When we do send-from-GRF for FB writes, we need to ensure that
+          * the last write instruction sends from a high register.  This is
+          * because the vertex fetcher wants to start filling the low
+          * payload registers while the pixel data port is still working on
+          * writing out the memory.  If we don't do this, we get rendering
+          * artifacts.
+          *
+          * We could just do "something high".  Instead, we just pick the
+          * highest register that works.
+          */
+         if (inst->eot) {
+            int size = alloc.sizes[inst->src[0].nr];
+            int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1;
+
+            /* If something happened to spill, we want to push the EOT send
+             * register early enough in the register file that we don't
+             * conflict with any used MRF hack registers.
+             */
+            reg -= BRW_MAX_MRF(devinfo->gen) - first_used_mrf;
+
+            ra_set_node_reg(g, inst->src[0].nr, reg);
+            break;
+         }
+      }
+   }
+
+   if (dispatch_width > 8) {
+      /* In 16-wide dispatch we have an issue where a compressed
+       * instruction is actually two instructions executed simultaneously.
+       * It's actually ok to have the source and destination registers be
+       * the same.  In this case, each instruction over-writes its own
+       * source and there's no problem.  The real problem here is if the
+       * source and destination registers are off by one.  Then you can end
+       * up in a scenario where the first instruction over-writes the
+       * source of the second instruction.  Since the compiler doesn't know
+       * about this level of granularity, we simply make the source and
+       * destination interfere.
+       */
+      foreach_block_and_inst(block, fs_inst, inst, cfg) {
+         if (inst->dst.file != VGRF)
+            continue;
+
+         for (int i = 0; i < inst->sources; ++i) {
+            if (inst->src[i].file == VGRF) {
+               ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
+            }
+         }
+      }
+   }
+
+   /* Debug of register spilling: Go spill everything. */
+   if (unlikely(spill_all)) {
+      int reg = choose_spill_reg(g);
+
+      if (reg != -1) {
+         spill_reg(reg);
+         ralloc_free(g);
+         return false;
+      }
+   }
+
+   if (!ra_allocate(g)) {
+      /* Failed to allocate registers.  Spill a reg, and the caller will
+       * loop back into here to try again.
+       */
+      int reg = choose_spill_reg(g);
+
+      if (reg == -1) {
+         fail("no register to spill:\n");
+         dump_instructions(NULL);
+      } else if (allow_spilling) {
+         spill_reg(reg);
+      }
+
+      ralloc_free(g);
+
+      return false;
+   }
+
+   /* Get the chosen virtual registers for each node, and map virtual
+    * regs in the register classes back down to real hardware reg
+    * numbers.
+    */
+   this->grf_used = payload_node_count;
+   for (unsigned i = 0; i < this->alloc.count; i++) {
+      int reg = ra_get_node_reg(g, i);
+
+      hw_reg_mapping[i] = compiler->fs_reg_sets[rsi].ra_reg_to_grf[reg];
+      this->grf_used = MAX2(this->grf_used,
+                            hw_reg_mapping[i] + this->alloc.sizes[i]);
+   }
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      assign_reg(hw_reg_mapping, &inst->dst);
+      for (int i = 0; i < inst->sources; i++) {
+         assign_reg(hw_reg_mapping, &inst->src[i]);
+      }
+   }
+
+   this->alloc.count = this->grf_used;
+
+   ralloc_free(g);
+
+   return true;
+}
+
+namespace {
+   /**
+    * Largest spill block, in 32B units, that we expect to have to handle.
+    *
+    * The value is somewhat arbitrary and doesn't necessarily limit the
+    * maximum variable size that can be spilled.  A higher value lets a
+    * variable of a given size be spilled more efficiently, with a smaller
+    * number of scratch messages, but makes a collision between the MRFs
+    * reserved for spilling and other MRFs used by the program more likely
+    * (and possibly increases GRF register pressure on platforms without
+    * hardware MRFs), which could cause register allocation to fail.
+    *
+    * For the moment just enough space is reserved so that a register of
+    * 32 bit component type and natural region width can be spilled without
+    * splitting into multiple (force_writemask_all) scratch messages.
+    */
+   unsigned
+   spill_max_size(const backend_shader *s)
+   {
+      /* FINISHME - On Gen7+ it should be possible to avoid this limit
+       *            altogether by spilling directly from the temporary GRF
+       *            allocated to hold the result of the instruction (and the
+       *            scratch write header).
+       */
+      /* FINISHME - The shader's dispatch width probably belongs in
+       *            backend_shader (or some nonexistent fs_shader class?)
+       *            rather than in the visitor class.
+       */
+      const fs_visitor *const visitor = static_cast<const fs_visitor *>(s);
+      return visitor->dispatch_width / 8;
+   }
+
+   /**
+    * First MRF register available for spilling: the spill area sits at the
+    * very top of the MRF space and is one register larger than
+    * spill_max_size().
+    */
+   unsigned
+   spill_base_mrf(const backend_shader *s)
+   {
+      const unsigned reserved_mrfs = spill_max_size(s) + 1;
+      return BRW_MAX_MRF(s->devinfo->gen) - reserved_mrfs;
+   }
+}
+
+/**
+ * Emits the scratch reads that reload \p count registers starting at
+ * \p dst from scratch space at byte offset \p spill_offset.
+ *
+ * One read is emitted per component-sized chunk of the builder's dispatch
+ * width; \p count must be a multiple of that chunk size.
+ */
+static void
+emit_unspill(const fs_builder &bld, fs_reg dst,
+             uint32_t spill_offset, unsigned count)
+{
+   const gen_device_info *devinfo = bld.shader->devinfo;
+   const unsigned reg_size = dst.component_size(bld.dispatch_width()) /
+                             REG_SIZE;
+   assert(count % reg_size == 0);
+
+   /* Bytes covered by each scratch read. */
+   const unsigned chunk_bytes = reg_size * REG_SIZE;
+
+   for (unsigned remaining = count / reg_size; remaining > 0; remaining--) {
+      /* The Gen7 descriptor-based offset is 12 bits of HWORD units.  Because
+       * the Gen7-style scratch block read is hardwired to BTI 255, on Gen9+
+       * it would cause the DC to do an IA-coherent read, which largely
+       * outweighs the slight advantage from not having to provide the
+       * address as part of the message header, so we're better off using
+       * plain old oword block reads there.
+       */
+      const bool gen7_read = devinfo->gen >= 7 && devinfo->gen < 9 &&
+                             spill_offset < (1 << 12) * REG_SIZE;
+
+      fs_inst *read = bld.emit(gen7_read ? SHADER_OPCODE_GEN7_SCRATCH_READ
+                                         : SHADER_OPCODE_GEN4_SCRATCH_READ,
+                               dst);
+      read->offset = spill_offset;
+
+      if (!gen7_read) {
+         /* The Gen4-style message carries the offset in a header. */
+         read->base_mrf = spill_base_mrf(bld.shader);
+         read->mlen = 1;
+      }
+
+      dst.offset += chunk_bytes;
+      spill_offset += chunk_bytes;
+   }
+}
+
+/**
+ * Emits the scratch writes that store \p count registers starting at
+ * \p src to scratch space at byte offset \p spill_offset.
+ *
+ * One write is emitted per component-sized chunk of the builder's dispatch
+ * width; \p count must be a multiple of that chunk size.
+ */
+static void
+emit_spill(const fs_builder &bld, fs_reg src,
+           uint32_t spill_offset, unsigned count)
+{
+   const unsigned reg_size = src.component_size(bld.dispatch_width()) /
+                             REG_SIZE;
+   assert(count % reg_size == 0);
+
+   /* Bytes covered by each scratch write. */
+   const unsigned chunk_bytes = reg_size * REG_SIZE;
+   const unsigned first_mrf = spill_base_mrf(bld.shader);
+
+   for (unsigned i = 0; i < count / reg_size; i++) {
+      fs_inst *write =
+         bld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE, bld.null_reg_f(), src);
+
+      write->offset = spill_offset;
+      write->mlen = 1 + reg_size; /* header, value */
+      write->base_mrf = first_mrf;
+
+      src.offset += chunk_bytes;
+      spill_offset += chunk_bytes;
+   }
+}
+
+/**
+ * Estimates a spill cost for every virtual GRF, hands the costs to the
+ * register allocator and returns the node it considers best to spill.
+ *
+ * Registers that are already the source of a scratch write or the
+ * destination of a scratch read get no cost set at all.
+ */
+int
+fs_visitor::choose_spill_reg(struct ra_graph *g)
+{
+   float spill_costs[this->alloc.count];
+   bool no_spill[this->alloc.count];
+   float loop_scale = 1.0;
+
+   for (unsigned vgrf = 0; vgrf < this->alloc.count; vgrf++) {
+      no_spill[vgrf] = false;
+      spill_costs[vgrf] = 0.0;
+   }
+
+   /* Calculate costs for spilling nodes.  Call it a cost of 1 per
+    * spill/unspill we'll have to do, and guess that the insides of
+    * loops run 10 times.
+    */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      /* Each read would need an unspill... */
+      for (unsigned int s = 0; s < inst->sources; s++) {
+         if (inst->src[s].file != VGRF)
+            continue;
+
+         spill_costs[inst->src[s].nr] += loop_scale;
+      }
+
+      /* ...and each write a spill per register written. */
+      if (inst->dst.file == VGRF) {
+         spill_costs[inst->dst.nr] +=
+            DIV_ROUND_UP(inst->size_written, REG_SIZE) * loop_scale;
+      }
+
+      if (inst->opcode == BRW_OPCODE_DO) {
+         loop_scale *= 10;
+      } else if (inst->opcode == BRW_OPCODE_WHILE) {
+         loop_scale /= 10;
+      } else if (inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE) {
+         if (inst->src[0].file == VGRF)
+            no_spill[inst->src[0].nr] = true;
+      } else if (inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ ||
+                 inst->opcode == SHADER_OPCODE_GEN7_SCRATCH_READ) {
+         if (inst->dst.file == VGRF)
+            no_spill[inst->dst.nr] = true;
+      }
+   }
+
+   for (unsigned vgrf = 0; vgrf < this->alloc.count; vgrf++) {
+      if (no_spill[vgrf])
+         continue;
+
+      ra_set_node_spill_cost(g, vgrf, spill_costs[vgrf]);
+   }
+
+   return ra_get_best_spill_node(g);
+}
+
+/**
+ * Spills virtual GRF number spill_reg to scratch space.
+ *
+ * Reserves alloc.sizes[spill_reg] registers worth of scratch space at
+ * last_scratch, then rewrites every instruction that reads or writes the
+ * VGRF so that it uses a freshly allocated temporary VGRF instead: a scratch
+ * read is emitted before each read and a scratch write after each write.
+ *
+ * On the first spill of the shader, the MRFs reserved for spilling are
+ * checked against the MRFs the program already uses; if any of them is in
+ * use the compile is failed and nothing is modified.
+ *
+ * Live intervals are invalidated on the way out.
+ */
+void
+fs_visitor::spill_reg(int spill_reg)
+{
+ int size = alloc.sizes[spill_reg];
+ unsigned int spill_offset = last_scratch;
+ assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */
+
+ /* Spills may use MRFs 13-15 in the SIMD16 case. Our texturing is done
+ * using up to 11 MRFs starting from either m1 or m2, and fb writes can use
+ * up to m13 (gen6+ simd16: 2 header + 8 color + 2 src0alpha + 2 omask) or
+ * m15 (gen4-5 simd16: 2 header + 8 color + 1 aads + 2 src depth + 2 dst
+ * depth), starting from m1. In summary: We may not be able to spill in
+ * SIMD16 mode, because we'd stomp the FB writes.
+ */
+ if (!spilled_any_registers) {
+ bool mrf_used[BRW_MAX_MRF(devinfo->gen)];
+ get_used_mrfs(this, mrf_used);
+
+ /* Only the MRFs from spill_base_mrf() upwards are needed for spilling. */
+ for (int i = spill_base_mrf(this); i < BRW_MAX_MRF(devinfo->gen); i++) {
+ if (mrf_used[i]) {
+ fail("Register spilling not supported with m%d used", i);
+ return;
+ }
+ }
+
+ spilled_any_registers = true;
+ }
+
+ /* Reserve the scratch space for the whole variable. */
+ last_scratch += size * REG_SIZE;
+
+ /* Generate spill/unspill instructions for the objects being
+ * spilled. Right now, we spill or unspill the whole thing to a
+ * virtual grf of the same size. For most instructions, though, we
+ * could just spill/unspill the GRF being accessed.
+ */
+ foreach_block_and_inst (block, fs_inst, inst, cfg) {
+ const fs_builder ibld = fs_builder(this, block, inst);
+
+ for (unsigned int i = 0; i < inst->sources; i++) {
+ if (inst->src[i].file == VGRF &&
+ inst->src[i].nr == spill_reg) {
+ int count = regs_read(inst, i);
+ /* Scratch offset of the first whole register this source reads. */
+ int subset_spill_offset = spill_offset +
+ ROUND_DOWN_TO(inst->src[i].offset, REG_SIZE);
+ fs_reg unspill_dst(VGRF, alloc.allocate(count));
+
+ /* Redirect the source to the temporary, keeping only the
+ * sub-register part of its offset.
+ */
+ inst->src[i].nr = unspill_dst.nr;
+ inst->src[i].offset %= REG_SIZE;
+
+ /* We read the largest power-of-two divisor of the register count
+ * (because only POT scratch read blocks are allowed by the
+ * hardware) up to the maximum supported block size.
+ */
+ const unsigned width =
+ MIN2(32, 1u << (ffs(MAX2(1, count) * 8) - 1));
+
+ /* Set exec_all() on unspill messages under the (rather
+ * pessimistic) assumption that there is no one-to-one
+ * correspondence between channels of the spilled variable in
+ * scratch space and the scratch read message, which operates on
+ * 32 bit channels. It shouldn't hurt in any case because the
+ * unspill destination is a block-local temporary.
+ */
+ emit_unspill(ibld.exec_all().group(width, 0),
+ unspill_dst, subset_spill_offset, count);
+ }
+ }
+
+ if (inst->dst.file == VGRF &&
+ inst->dst.nr == spill_reg) {
+ /* Scratch offset of the first whole register this instruction writes. */
+ int subset_spill_offset = spill_offset +
+ ROUND_DOWN_TO(inst->dst.offset, REG_SIZE);
+ fs_reg spill_src(VGRF, alloc.allocate(regs_written(inst)));
+
+ /* Redirect the destination to the temporary, keeping only the
+ * sub-register part of its offset.
+ */
+ inst->dst.nr = spill_src.nr;
+ inst->dst.offset %= REG_SIZE;
+
+ /* If we're immediately spilling the register, we should not use
+ * destination dependency hints. Doing so will cause the GPU do
+ * try to read and write the register at the same time and may
+ * hang the GPU.
+ */
+ inst->no_dd_clear = false;
+ inst->no_dd_check = false;
+
+ /* Calculate the execution width of the scratch messages (which work
+ * in terms of 32 bit components so we have a fixed number of eight
+ * channels per spilled register). We attempt to write one
+ * exec_size-wide component of the variable at a time without
+ * exceeding the maximum number of (fake) MRF registers reserved for
+ * spills.
+ */
+ const unsigned width = 8 * MIN2(
+ DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE),
+ spill_max_size(this));
+
+ /* Spills should only write data initialized by the instruction for
+ * whichever channels are enabled in the excution mask. If that's
+ * not possible we'll have to emit a matching unspill before the
+ * instruction and set force_writemask_all on the spill.
+ */
+ const bool per_channel =
+ inst->dst.is_contiguous() && type_sz(inst->dst.type) == 4 &&
+ inst->exec_size == width;
+
+ /* Builder used to emit the scratch messages. */
+ const fs_builder ubld = ibld.exec_all(!per_channel).group(width, 0);
+
+ /* If our write is going to affect just part of the
+ * regs_written(inst), then we need to unspill the destination since
+ * we write back out all of the regs_written(). If the original
+ * instruction had force_writemask_all set and is not a partial
+ * write, there should be no need for the unspill since the
+ * instruction will be overwriting the whole destination in any case.
+ */
+ if (inst->is_partial_write() ||
+ (!inst->force_writemask_all && !per_channel))
+ emit_unspill(ubld, spill_src, subset_spill_offset,
+ regs_written(inst));
+
+ /* The spill itself goes right after the instruction. */
+ emit_spill(ubld.at(block, inst->next), spill_src,
+ subset_spill_offset, regs_written(inst));
+ }
+ }
+
+ invalidate_live_intervals();
+}
diff --git a/src/intel/compiler/brw_fs_register_coalesce.cpp b/src/intel/compiler/brw_fs_register_coalesce.cpp
new file mode 100644
index 00000000000..952276faed8
--- /dev/null
+++ b/src/intel/compiler/brw_fs_register_coalesce.cpp
@@ -0,0 +1,295 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_register_coalesce.cpp
+ *
+ * Implements register coalescing: Checks if the two registers involved in a
+ * raw move don't interfere, in which case they can both be stored in the same
+ * place and the MOV removed.
+ *
+ * To do this, all uses of the source of the MOV in the shader are replaced
+ * with the destination of the MOV. For example:
+ *
+ * add vgrf3:F, vgrf1:F, vgrf2:F
+ * mov vgrf4:F, vgrf3:F
+ * mul vgrf5:F, vgrf5:F, vgrf4:F
+ *
+ * becomes
+ *
+ * add vgrf4:F, vgrf1:F, vgrf2:F
+ * mul vgrf5:F, vgrf5:F, vgrf4:F
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_fs_live_variables.h"
+
+/**
+ * Return true when \p inst copies a register region onto itself.
+ *
+ * A MOV qualifies when its destination equals its first source.  A
+ * LOAD_PAYLOAD qualifies when every source equals the slot of the
+ * destination it would be written to, where the slot advances by REG_SIZE
+ * for each header source and by exec_size * stride * type size for each
+ * remaining source.  Any other opcode yields false.
+ */
+static bool
+is_nop_mov(const fs_inst *inst)
+{
+   if (inst->opcode == BRW_OPCODE_MOV)
+      return inst->dst.equals(inst->src[0]);
+
+   if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+      return false;
+
+   /* Walk the payload, tracking where in the destination each source lands. */
+   fs_reg slot = inst->dst;
+   for (int s = 0; s < inst->sources; s++) {
+      if (!slot.equals(inst->src[s]))
+         return false;
+
+      if (s < inst->header_size)
+         slot.offset += REG_SIZE;
+      else
+         slot.offset += inst->exec_size * slot.stride *
+                        type_sz(inst->src[s].type);
+   }
+
+   return true;
+}
+
+/**
+ * Decide whether \p inst is a copy that register coalescing may try to
+ * eliminate.
+ *
+ * The instruction must be a MOV or a LOAD_PAYLOAD that is not a partial
+ * write, copying from a contiguous VGRF source into a VGRF destination of
+ * the same type, with no saturate, negate or abs modifiers.  The source
+ * VGRF must not be larger than the destination VGRF, and a LOAD_PAYLOAD
+ * must additionally pass is_copy_payload().
+ */
+static bool
+is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst)
+{
+   const bool is_payload = inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD;
+
+   /* Reject other opcodes before looking at src[0]. */
+   if (!is_payload && inst->opcode != BRW_OPCODE_MOV)
+      return false;
+
+   if (inst->is_partial_write() || inst->saturate)
+      return false;
+
+   if (inst->src[0].file != VGRF ||
+       inst->src[0].negate || inst->src[0].abs ||
+       !inst->src[0].is_contiguous())
+      return false;
+
+   if (inst->dst.file != VGRF || inst->dst.type != inst->src[0].type)
+      return false;
+
+   /* The source allocation must not exceed the destination allocation. */
+   if (v->alloc.sizes[inst->src[0].nr] > v->alloc.sizes[inst->dst.nr])
+      return false;
+
+   return !is_payload || inst->is_copy_payload(v->alloc);
+}
+/**
+ * Decide whether the live-interval variables \p dst_var and \p src_var
+ * connected by the copy \p inst can be merged.
+ *
+ * Returns true immediately when the two variables do not interfere.
+ * Otherwise it returns false if the intervals overlap without one being
+ * a subset of the other, or if any instruction other than \p inst writes
+ * the destination or source region of \p inst inside the intersection of
+ * the two intervals.  Returns true when no such write is found.
+ */
+static bool
+can_coalesce_vars(brw::fs_live_variables *live_intervals,
+ const cfg_t *cfg, const fs_inst *inst,
+ int dst_var, int src_var)
+{
+ if (!live_intervals->vars_interfere(src_var, dst_var))
+ return true;
+
+ int dst_start = live_intervals->start[dst_var];
+ int dst_end = live_intervals->end[dst_var];
+ int src_start = live_intervals->start[src_var];
+ int src_end = live_intervals->end[src_var];
+
+ /* Variables interfere and one live range isn't a subset of the other. */
+ if ((dst_end > src_end && src_start < dst_start) ||
+ (src_end > dst_end && dst_start < src_start))
+ return false;
+
+ /* Check for a write to either register in the intersection of their live
+ * ranges.
+ */
+ int start_ip = MAX2(dst_start, src_start);
+ int end_ip = MIN2(dst_end, src_end);
+
+ foreach_block(block, cfg) {
+ if (block->end_ip < start_ip) /* block ends before the intersection */
+ continue;
+
+ int scan_ip = block->start_ip - 1; /* incremented before first use below */
+
+ foreach_inst_in_block(fs_inst, scan_inst, block) {
+ scan_ip++;
+
+ /* Ignore anything before the intersection of the live ranges */
+ if (scan_ip < start_ip)
+ continue;
+
+ /* Ignore the copying instruction itself */
+ if (scan_inst == inst)
+ continue;
+
+ if (scan_ip > end_ip)
+ return true; /* registers do not interfere */
+
+ if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+ inst->dst, inst->size_written) ||
+ regions_overlap(scan_inst->dst, scan_inst->size_written,
+ inst->src[0], inst->size_read(0)))
+ return false; /* registers interfere */
+ }
+ }
+
+ return true; /* every block was scanned without finding a conflicting write */
+}
+/**
+ * Register coalescing pass.
+ *
+ * Walks every instruction looking for coalescing candidates (see
+ * is_coalesce_candidate()).  Copies that are already no-ops are turned
+ * into NOPs right away.  For the remaining candidates the pass tracks one
+ * run of copies at a time from a single source VGRF (src_reg) into a single
+ * destination VGRF (dst_reg), recording for every register offset of the
+ * source which copy covers it and where it lands in the destination.
+ *
+ * Once channels_remaining reaches zero the run is checked: the destination
+ * offsets have to be consecutive and can_coalesce_vars() has to accept
+ * every variable pair.  If so, the copies become NOPs, every reference to
+ * src_reg in the program is rewritten to dst_reg at the mapped offset, and
+ * the destination live intervals are widened to include the source ones.
+ *
+ * If anything changed, all NOP instructions are removed at the end and
+ * the live intervals are invalidated.
+ *
+ * \return true if any instruction was changed.
+ */
+bool
+fs_visitor::register_coalesce()
+{
+ bool progress = false;
+
+ calculate_live_intervals();
+
+ int src_size = 0; /* alloc.sizes[] entry of the tracked source VGRF */
+ int channels_remaining = 0; /* starts at src_size, reduced by regs_written() of each copy */
+ int src_reg = -1, dst_reg = -1; /* VGRF numbers of the run being tracked */
+ int dst_reg_offset[MAX_VGRF_SIZE]; /* destination register offset per source register offset */
+ fs_inst *mov[MAX_VGRF_SIZE]; /* copy recorded per source register offset */
+ int dst_var[MAX_VGRF_SIZE];
+ int src_var[MAX_VGRF_SIZE];
+
+ foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ if (!is_coalesce_candidate(this, inst))
+ continue;
+
+ if (is_nop_mov(inst)) {
+ inst->opcode = BRW_OPCODE_NOP;
+ progress = true;
+ continue;
+ }
+ /* A copy from a different source VGRF starts a new run. */
+ if (src_reg != inst->src[0].nr) {
+ src_reg = inst->src[0].nr;
+
+ src_size = alloc.sizes[inst->src[0].nr];
+ assert(src_size <= MAX_VGRF_SIZE);
+
+ channels_remaining = src_size;
+ memset(mov, 0, sizeof(mov));
+
+ dst_reg = inst->dst.nr;
+ }
+ /* Only copies into the destination VGRF of the current run count. */
+ if (dst_reg != inst->dst.nr)
+ continue;
+
+ if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
+ for (int i = 0; i < src_size; i++) {
+ dst_reg_offset[i] = i;
+ }
+ mov[0] = inst;
+ channels_remaining -= regs_written(inst);
+ } else {
+ const int offset = inst->src[0].offset / REG_SIZE;
+ if (mov[offset]) {
+ /* This is the second time that this offset in the register has
+ * been set. This means, in particular, that inst->dst was
+ * live before this instruction and that the live ranges of
+ * inst->dst and inst->src[0] overlap and we can't coalesce the
+ * two variables. Let's ensure that doesn't happen.
+ */
+ channels_remaining = -1;
+ continue;
+ }
+ for (unsigned i = 0; i < MAX2(inst->size_written / REG_SIZE, 1); i++)
+ dst_reg_offset[offset + i] = inst->dst.offset / REG_SIZE + i;
+ mov[offset] = inst;
+ channels_remaining -= regs_written(inst);
+ }
+ /* Keep collecting until the counter for the source VGRF hits zero. */
+ if (channels_remaining)
+ continue;
+
+ bool can_coalesce = true;
+ for (int i = 0; i < src_size; i++) {
+ if (dst_reg_offset[i] != dst_reg_offset[0] + i) {
+ /* Registers are out-of-order. */
+ can_coalesce = false;
+ src_reg = -1;
+ break;
+ }
+
+ dst_var[i] = live_intervals->var_from_vgrf[dst_reg] + dst_reg_offset[i];
+ src_var[i] = live_intervals->var_from_vgrf[src_reg] + i;
+
+ if (!can_coalesce_vars(live_intervals, cfg, inst,
+ dst_var[i], src_var[i])) {
+ can_coalesce = false;
+ src_reg = -1;
+ break;
+ }
+ }
+
+ if (!can_coalesce)
+ continue;
+
+ progress = true;
+ /* Neutralize the copies; the NOPs are removed after the main loop. */
+ for (int i = 0; i < src_size; i++) {
+ if (mov[i]) {
+ mov[i]->opcode = BRW_OPCODE_NOP;
+ mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
+ mov[i]->dst = reg_undef;
+ for (int j = 0; j < mov[i]->sources; j++) {
+ mov[i]->src[j] = reg_undef;
+ }
+ }
+ }
+ /* Rewrite every reference to src_reg so that it names dst_reg instead. */
+ foreach_block_and_inst(block, fs_inst, scan_inst, cfg) {
+ if (scan_inst->dst.file == VGRF &&
+ scan_inst->dst.nr == src_reg) {
+ scan_inst->dst.nr = dst_reg;
+ scan_inst->dst.offset = scan_inst->dst.offset % REG_SIZE +
+ dst_reg_offset[scan_inst->dst.offset / REG_SIZE] * REG_SIZE;
+ }
+
+ for (int j = 0; j < scan_inst->sources; j++) {
+ if (scan_inst->src[j].file == VGRF &&
+ scan_inst->src[j].nr == src_reg) {
+ scan_inst->src[j].nr = dst_reg;
+ scan_inst->src[j].offset = scan_inst->src[j].offset % REG_SIZE +
+ dst_reg_offset[scan_inst->src[j].offset / REG_SIZE] * REG_SIZE;
+ }
+ }
+ }
+ /* Widen the destination intervals to cover the merged source intervals. */
+ for (int i = 0; i < src_size; i++) {
+ live_intervals->start[dst_var[i]] =
+ MIN2(live_intervals->start[dst_var[i]],
+ live_intervals->start[src_var[i]]);
+ live_intervals->end[dst_var[i]] =
+ MAX2(live_intervals->end[dst_var[i]],
+ live_intervals->end[src_var[i]]);
+ }
+ src_reg = -1; /* force a fresh run for the next candidate */
+ }
+
+ if (progress) {
+ foreach_block_and_inst_safe (block, backend_instruction, inst, cfg) {
+ if (inst->opcode == BRW_OPCODE_NOP) { /* includes the copies neutralized above */
+ inst->remove(block);
+ }
+ }
+
+ invalidate_live_intervals();
+ }
+
+ return progress;
+}
diff --git a/src/intel/compiler/brw_fs_saturate_propagation.cpp b/src/intel/compiler/brw_fs_saturate_propagation.cpp
new file mode 100644
index 00000000000..1c97a507d8c
--- /dev/null
+++ b/src/intel/compiler/brw_fs_saturate_propagation.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_fs_live_variables.h"
+#include "brw_cfg.h"
+
+/** @file brw_fs_saturate_propagation.cpp
+ *
+ * Implements a pass that propagates the SAT modifier from a MOV.SAT into the
+ * instruction that produced the source of the MOV.SAT, thereby allowing the
+ * MOV's src and dst to be coalesced and the MOV removed.
+ *
+ * For instance,
+ *
+ * ADD tmp, src0, src1
+ * MOV.SAT dst, tmp
+ *
+ * would be transformed into
+ *
+ * ADD.SAT tmp, src0, src1
+ * MOV dst, tmp
+ */
+/**
+ * Run saturate propagation on a single basic block.
+ *
+ * The block is walked backwards while \c ip tracks the index of the
+ * current instruction.  For every saturating MOV between VGRFs of the same
+ * type (and without abs on its source) the instructions preceding it in
+ * the block are scanned, most recent first, for the one that writes the
+ * MOV's source:
+ *
+ *  - if that instruction already saturates, the MOV's saturate is dropped;
+ *  - otherwise, if the source's live interval ends at the MOV (or the MOV
+ *    copies a register onto itself) and the instruction can saturate, the
+ *    saturate flag moves onto it, retyping it and folding a source negate
+ *    into MUL/MAD/ADD operands where needed.
+ *
+ * The scan gives up when the writer is a partial write, has a different
+ * type that cannot be changed, or when an instruction in between reads the
+ * same source register in any way other than a saturating MOV with no
+ * abs/negate.
+ *
+ * \return true if any instruction was modified.
+ */
+static bool
+opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
+{
+ bool progress = false;
+ int ip = block->end_ip + 1; /* decremented before use, so the last instruction gets end_ip */
+
+ foreach_inst_in_block_reverse(fs_inst, inst, block) {
+ ip--;
+
+ if (inst->opcode != BRW_OPCODE_MOV ||
+ !inst->saturate ||
+ inst->dst.file != VGRF ||
+ inst->dst.type != inst->src[0].type ||
+ inst->src[0].file != VGRF ||
+ inst->src[0].abs)
+ continue;
+
+ int src_var = v->live_intervals->var_from_reg(inst->src[0]);
+ int src_end_ip = v->live_intervals->end[src_var];
+
+ bool interfered = false;
+ foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+ if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+ inst->src[0], inst->size_read(0))) { /* scan_inst writes the MOV's source */
+ if (scan_inst->is_partial_write() ||
+ (scan_inst->dst.type != inst->dst.type &&
+ !scan_inst->can_change_types()))
+ break;
+
+ if (scan_inst->saturate) {
+ inst->saturate = false; /* value is already saturated by its producer */
+ progress = true;
+ } else if (src_end_ip == ip || inst->dst.equals(inst->src[0])) {
+ if (scan_inst->can_do_saturate()) {
+ if (scan_inst->dst.type != inst->dst.type) {
+ scan_inst->dst.type = inst->dst.type;
+ for (int i = 0; i < scan_inst->sources; i++) {
+ scan_inst->src[i].type = inst->dst.type;
+ }
+ }
+
+ if (inst->src[0].negate) { /* fold the MOV's negate into the producer */
+ if (scan_inst->opcode == BRW_OPCODE_MUL) {
+ scan_inst->src[0].negate = !scan_inst->src[0].negate;
+ inst->src[0].negate = false;
+ } else if (scan_inst->opcode == BRW_OPCODE_MAD) {
+ scan_inst->src[0].negate = !scan_inst->src[0].negate;
+ scan_inst->src[1].negate = !scan_inst->src[1].negate;
+ inst->src[0].negate = false;
+ } else if (scan_inst->opcode == BRW_OPCODE_ADD) {
+ if (scan_inst->src[1].file == IMM) {
+ if (!brw_negate_immediate(scan_inst->src[1].type,
+ &scan_inst->src[1].as_brw_reg())) {
+ break;
+ }
+ } else {
+ scan_inst->src[1].negate = !scan_inst->src[1].negate;
+ }
+ scan_inst->src[0].negate = !scan_inst->src[0].negate;
+ inst->src[0].negate = false;
+ } else {
+ break; /* no negate folding is implemented for other opcodes */
+ }
+ }
+
+ scan_inst->saturate = true;
+ inst->saturate = false;
+ progress = true;
+ }
+ }
+ break; /* the producer was found; stop scanning either way */
+ }
+ for (int i = 0; i < scan_inst->sources; i++) { /* does scan_inst read the MOV's source register? */
+ if (scan_inst->src[i].file == VGRF &&
+ scan_inst->src[i].nr == inst->src[0].nr &&
+ scan_inst->src[i].offset / REG_SIZE ==
+ inst->src[0].offset / REG_SIZE) {
+ if (scan_inst->opcode != BRW_OPCODE_MOV ||
+ !scan_inst->saturate ||
+ scan_inst->src[0].abs ||
+ scan_inst->src[0].negate ||
+ scan_inst->src[0].abs != inst->src[0].abs ||
+ scan_inst->src[0].negate != inst->src[0].negate) {
+ interfered = true;
+ break;
+ }
+ }
+ }
+
+ if (interfered)
+ break;
+ }
+ }
+
+ return progress;
+}
+
+/**
+ * Run saturate propagation over every basic block of the program.
+ *
+ * Live intervals are calculated up front because the per-block pass reads
+ * them.  Returns true if the pass reported progress on any block.
+ */
+bool
+fs_visitor::opt_saturate_propagation()
+{
+   calculate_live_intervals();
+
+   bool changed = false;
+   foreach_block (block, cfg) {
+      if (opt_saturate_propagation_local(this, block))
+         changed = true;
+   }
+
+   /* Live intervals are still valid. */
+   return changed;
+}
diff --git a/src/intel/compiler/brw_fs_sel_peephole.cpp b/src/intel/compiler/brw_fs_sel_peephole.cpp
new file mode 100644
index 00000000000..8cd897f72e0
--- /dev/null
+++ b/src/intel/compiler/brw_fs_sel_peephole.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+
+/** @file brw_fs_sel_peephole.cpp
+ *
+ * This file contains the opt_peephole_sel() optimization pass that replaces
+ * MOV instructions to the same destination in the "then" and "else" bodies of
+ * an if statement with SEL instructions.
+ */
+
+/* Four MOVs seems to be pretty typical, so I picked the next power of two in
+ * the hopes that it would handle almost anything possible in a single
+ * pass.
+ */
+#define MAX_MOVS 8 /**< The maximum number of MOVs to attempt to match. */
+
+using namespace brw;
+
+/**
+ * Collect the run of MOV instructions found at the start of \p block.
+ *
+ * Stores at most MAX_MOVS instruction pointers in \p movs, stopping at the
+ * first instruction that is not a MOV.
+ *
+ * \return the number of pointers stored.
+ */
+static int
+collect_leading_movs(fs_inst *movs[MAX_MOVS], bblock_t *block)
+{
+   int count = 0;
+
+   foreach_inst_in_block(fs_inst, inst, block) {
+      if (inst->opcode != BRW_OPCODE_MOV || count == MAX_MOVS)
+         break;
+
+      movs[count++] = inst;
+   }
+
+   return count;
+}
+
+/**
+ * Gathers the consecutive MOV instructions that open the "then" and "else"
+ * blocks of an if statement.
+ *
+ * The MOVs leading \p then_block are written to <then_mov> and those
+ * leading \p else_block to <else_mov>; each array receives at most MAX_MOVS
+ * entries.
+ *
+ * \return the smaller of the two MOV counts, which is zero when either
+ *         block does not start with a MOV.
+ *
+ * E.g.:
+ *    IF ...
+ *       then_mov[0] = MOV g4, ...
+ *       then_mov[1] = MOV g5, ...
+ *       then_mov[2] = MOV g6, ...
+ *    ELSE ...
+ *       else_mov[0] = MOV g4, ...
+ *       else_mov[1] = MOV g5, ...
+ *       else_mov[2] = MOV g7, ...
+ *    ENDIF
+ * returns 3.
+ */
+static int
+count_movs_from_if(fs_inst *then_mov[MAX_MOVS], fs_inst *else_mov[MAX_MOVS],
+                   bblock_t *then_block, bblock_t *else_block)
+{
+   const int then_movs = collect_leading_movs(then_mov, then_block);
+   const int else_movs = collect_leading_movs(else_mov, else_block);
+
+   return MIN2(then_movs, else_movs);
+}
+
+/**
+ * Try to replace IF/MOV+/ELSE/MOV+/ENDIF with SEL.
+ *
+ * Many GLSL shaders contain the following pattern:
+ *
+ * x = condition ? foo : bar
+ *
+ * or
+ *
+ * if (...) a.xyzw = foo.xyzw;
+ * else a.xyzw = bar.xyzw;
+ *
+ * The compiler emits an ir_if tree for this, since each subexpression might be
+ * a complex tree that could have side-effects or short-circuit logic.
+ *
+ * However, the common case is to simply select one of two constants or
+ * variable values---which is exactly what SEL is for. In this case, the
+ * assembly looks like:
+ *
+ * (+f0) IF
+ * MOV dst src0
+ * ...
+ * ELSE
+ * MOV dst src1
+ * ...
+ * ENDIF
+ *
+ * where each pair of MOVs to a common destination can be easily translated
+ * into
+ *
+ * (+f0) SEL dst src0 src1
+ *
+ * If src0 is an immediate value, we promote it to a temporary GRF.
+ *
+ * Implementation outline: for every block that ends in an IF and has an
+ * ELSE, up to MAX_MOVS leading MOVs of both branches are paired up by
+ * position.  Pairing stops at the first pair whose destinations, execution
+ * sizes, groups, force_writemask_all flags or source types differ, or
+ * where either MOV is a partial write or carries a conditional modifier.
+ * Each accepted pair is removed from its branch and replaced by one
+ * instruction built at the IF: a plain MOV when both sources are equal,
+ * otherwise a SEL carrying the IF's predicate.
+ *
+ * \return true if any pair was replaced; live intervals are invalidated in
+ *         that case.
+ */
+bool
+fs_visitor::opt_peephole_sel()
+{
+ bool progress = false;
+
+ foreach_block (block, cfg) {
+ /* IF instructions, by definition, can only be found at the ends of
+ * basic blocks.
+ */
+ fs_inst *if_inst = (fs_inst *)block->end();
+ if (if_inst->opcode != BRW_OPCODE_IF)
+ continue;
+
+ fs_inst *else_mov[MAX_MOVS] = { NULL };
+ fs_inst *then_mov[MAX_MOVS] = { NULL };
+ /* Find the else block: the first child that is not the then block, provided the block before it ends in ELSE. */
+ bblock_t *then_block = block->next();
+ bblock_t *else_block = NULL;
+ foreach_list_typed(bblock_link, child, link, &block->children) {
+ if (child->block != then_block) {
+ if (child->block->prev()->end()->opcode == BRW_OPCODE_ELSE) {
+ else_block = child->block;
+ }
+ break;
+ }
+ }
+ if (else_block == NULL) /* IF without a matching ELSE: nothing to do */
+ continue;
+
+ int movs = count_movs_from_if(then_mov, else_mov, then_block, else_block);
+
+ if (movs == 0)
+ continue;
+
+ /* Generate SEL instructions for pairs of MOVs to a common destination. */
+ for (int i = 0; i < movs; i++) {
+ if (!then_mov[i] || !else_mov[i])
+ break;
+
+ /* Check that the MOVs are the right form. */
+ if (!then_mov[i]->dst.equals(else_mov[i]->dst) ||
+ then_mov[i]->exec_size != else_mov[i]->exec_size ||
+ then_mov[i]->group != else_mov[i]->group ||
+ then_mov[i]->force_writemask_all != else_mov[i]->force_writemask_all ||
+ then_mov[i]->is_partial_write() ||
+ else_mov[i]->is_partial_write() ||
+ then_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE ||
+ else_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE) {
+ movs = i; /* only the pairs before this one are usable */
+ break;
+ }
+
+ /* Check that source types for mov operations match. */
+ if (then_mov[i]->src[0].type != else_mov[i]->src[0].type) {
+ movs = i;
+ break;
+ }
+ }
+ /* movs may have been lowered to the index of the first mismatching pair. */
+ if (movs == 0)
+ continue;
+
+ for (int i = 0; i < movs; i++) {
+ const fs_builder ibld = fs_builder(this, then_block, then_mov[i])
+ .at(block, if_inst); /* builder positioned at the IF instruction */
+
+ if (then_mov[i]->src[0].equals(else_mov[i]->src[0])) {
+ ibld.MOV(then_mov[i]->dst, then_mov[i]->src[0]); /* both branches copy the same value */
+ } else {
+ /* Only the last source register can be a constant, so if the MOV
+ * in the "then" clause uses a constant, we need to put it in a
+ * temporary.
+ */
+ fs_reg src0(then_mov[i]->src[0]);
+ if (src0.file == IMM) {
+ src0 = vgrf(glsl_type::float_type);
+ src0.type = then_mov[i]->src[0].type;
+ ibld.MOV(src0, then_mov[i]->src[0]);
+ }
+
+ set_predicate_inv(if_inst->predicate, if_inst->predicate_inverse,
+ ibld.SEL(then_mov[i]->dst, src0,
+ else_mov[i]->src[0]));
+ }
+
+ then_mov[i]->remove(then_block);
+ else_mov[i]->remove(else_block);
+ }
+
+ progress = true;
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
diff --git a/src/intel/compiler/brw_fs_surface_builder.cpp b/src/intel/compiler/brw_fs_surface_builder.cpp
new file mode 100644
index 00000000000..8990a5ca710
--- /dev/null
+++ b/src/intel/compiler/brw_fs_surface_builder.cpp
@@ -0,0 +1,1194 @@
+/*
+ * Copyright © 2013-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "isl/isl.h"
+#include "brw_fs_surface_builder.h"
+#include "brw_fs.h"
+
+using namespace brw;
+
+namespace brw {
+ namespace surface_access {
+ namespace {
+ /**
+  * Emit the logical send \p opcode for a surface message and hand back
+  * the register that receives its result.
+  *
+  * The instruction is given five sources, in order: \p addr, \p src, the
+  * uniformized \p surface index, and \p dims and \p arg as UD
+  * immediates.  The destination is a new UD VGRF requested with
+  * \p rsize, size_written is \p rsize times the destination's component
+  * size, and the instruction is predicated with \p pred.
+  */
+ fs_reg
+ emit_send(const fs_builder &bld, enum opcode opcode,
+           const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
+           unsigned dims, unsigned arg, unsigned rsize,
+           brw_predicate pred = BRW_PREDICATE_NONE)
+ {
+    /* Reduce the dynamically uniform surface index to a single scalar
+     * and line it up with the remaining message operands.
+     */
+    const fs_reg payload[] = {
+       addr, src, bld.emit_uniformize(surface),
+       brw_imm_ud(dims), brw_imm_ud(arg)
+    };
+    const fs_reg result = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
+
+    fs_inst *send = bld.emit(opcode, result, payload, ARRAY_SIZE(payload));
+    send->predicate = pred;
+    send->size_written = rsize * result.component_size(send->exec_size);
+
+    return result;
+ }
+ }
+
+ /**
+  * Emit an untyped surface read.
+  *
+  * \p dims is the number of address components and \p size the number
+  * of components of the value read, which is returned in a new register.
+  * \p pred is the predication mode applied to the message.
+  */
+ fs_reg
+ emit_untyped_read(const fs_builder &bld,
+                   const fs_reg &surface, const fs_reg &addr,
+                   unsigned dims, unsigned size,
+                   brw_predicate pred)
+ {
+    /* A read has no data operand, hence the default-constructed source. */
+    const fs_reg no_data = fs_reg();
+
+    return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
+                     addr, no_data, surface, dims, size, size, pred);
+ }
+
+ /**
+  * Emit an untyped surface write.
+  *
+  * \p dims is the number of address components and \p size the number
+  * of components of \p src to be written.  \p pred is the predication
+  * mode applied to the message.
+  */
+ void
+ emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
+                    const fs_reg &addr, const fs_reg &src,
+                    unsigned dims, unsigned size,
+                    brw_predicate pred)
+ {
+    /* A write returns nothing, so ask for a zero-sized result. */
+    const unsigned result_size = 0;
+
+    emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
+              addr, src, surface, dims, size, result_size, pred);
+ }
+
+ /**
+  * Emit an untyped surface atomic.
+  *
+  * \p dims is the number of address components and \p rsize the number
+  * of components of the returned value (either zero or one).  \p src0
+  * and \p src1 are the operands; each one whose file is not BAD_FILE
+  * adds one component to the payload.  \p op is forwarded as the message
+  * argument and \p pred as its predication mode.
+  */
+ fs_reg
+ emit_untyped_atomic(const fs_builder &bld,
+                     const fs_reg &surface, const fs_reg &addr,
+                     const fs_reg &src0, const fs_reg &src1,
+                     unsigned dims, unsigned rsize, unsigned op,
+                     brw_predicate pred)
+ {
+    /* FINISHME: Factor out this frequently recurring pattern into a
+     * helper function.
+     */
+    unsigned num_operands = 0;
+    if (src0.file != BAD_FILE)
+       num_operands++;
+    if (src1.file != BAD_FILE)
+       num_operands++;
+
+    const fs_reg operands[] = { src0, src1 };
+    const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, num_operands);
+    bld.LOAD_PAYLOAD(payload, operands, num_operands, 0);
+
+    return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
+                     addr, payload, surface, dims, op, rsize, pred);
+ }
+
+ /**
+  * Emit a typed surface read.
+  *
+  * \p dims is the number of address components and \p size the number
+  * of components of the value read, which is returned in a new register.
+  * The message is emitted with emit_send()'s default predication.
+  */
+ fs_reg
+ emit_typed_read(const fs_builder &bld, const fs_reg &surface,
+                 const fs_reg &addr, unsigned dims, unsigned size)
+ {
+    /* A read has no data operand, hence the default-constructed source. */
+    const fs_reg no_data = fs_reg();
+
+    return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
+                     addr, no_data, surface, dims, size, size);
+ }
+
+ /**
+  * Emit a typed surface write.
+  *
+  * \p dims is the number of address components and \p size the number
+  * of components of \p src to be written.  The message is emitted with
+  * emit_send()'s default predication.
+  */
+ void
+ emit_typed_write(const fs_builder &bld, const fs_reg &surface,
+                  const fs_reg &addr, const fs_reg &src,
+                  unsigned dims, unsigned size)
+ {
+    /* A write returns nothing, so ask for a zero-sized result. */
+    const unsigned result_size = 0;
+
+    emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
+              addr, src, surface, dims, size, result_size);
+ }
+
+ /**
+  * Emit a typed surface atomic opcode.  \p dims determines the number of
+  * components of the address and \p rsize the number of components of
+  * the returned value (either zero or one).
+  *
+  * \p src0 and \p src1 are the operands; each one whose file is not
+  * BAD_FILE adds one component to the payload.  \p op is forwarded as the
+  * message argument and \p pred as the predication mode of the message.
+  */
+ fs_reg
+ emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
+                   const fs_reg &addr,
+                   const fs_reg &src0, const fs_reg &src1,
+                   unsigned dims, unsigned rsize, unsigned op,
+                   brw_predicate pred)
+ {
+    /* FINISHME: Factor out this frequently recurring pattern into a
+     * helper function.
+     */
+    const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
+    const fs_reg srcs[] = { src0, src1 };
+    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
+    bld.LOAD_PAYLOAD(tmp, srcs, n, 0);
+
+    /* Forward the caller's predicate, as emit_untyped_atomic() does.
+     * Without it emit_send() falls back to BRW_PREDICATE_NONE and the
+     * atomic is emitted unpredicated regardless of what the caller
+     * requested.
+     */
+    return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
+                     addr, tmp, surface, dims, op, rsize, pred);
+ }
+ }
+}
+
+namespace {
+ namespace image_format_info {
+ /* The higher compiler layers use the GL enums for image formats even if
+ * they come in from SPIR-V or Vulkan. We need to turn them into an ISL
+ * enum before we can use them.
+ *
+ * Maps the GL image format enum \p gl_format to the ISL format listed in
+ * the table below. GL_NONE maps to ISL_FORMAT_UNSUPPORTED. Any value not
+ * listed trips the assertion and, when assertions are compiled out,
+ * also yields ISL_FORMAT_UNSUPPORTED.
+ */
+ enum isl_format
+ isl_format_for_gl_format(uint32_t gl_format)
+ {
+ switch (gl_format) {
+ case GL_R8: return ISL_FORMAT_R8_UNORM;
+ case GL_R8_SNORM: return ISL_FORMAT_R8_SNORM;
+ case GL_R8UI: return ISL_FORMAT_R8_UINT;
+ case GL_R8I: return ISL_FORMAT_R8_SINT;
+ case GL_RG8: return ISL_FORMAT_R8G8_UNORM;
+ case GL_RG8_SNORM: return ISL_FORMAT_R8G8_SNORM;
+ case GL_RG8UI: return ISL_FORMAT_R8G8_UINT;
+ case GL_RG8I: return ISL_FORMAT_R8G8_SINT;
+ case GL_RGBA8: return ISL_FORMAT_R8G8B8A8_UNORM;
+ case GL_RGBA8_SNORM: return ISL_FORMAT_R8G8B8A8_SNORM;
+ case GL_RGBA8UI: return ISL_FORMAT_R8G8B8A8_UINT;
+ case GL_RGBA8I: return ISL_FORMAT_R8G8B8A8_SINT;
+ case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
+ case GL_RGB10_A2: return ISL_FORMAT_R10G10B10A2_UNORM;
+ case GL_RGB10_A2UI: return ISL_FORMAT_R10G10B10A2_UINT;
+ case GL_R16: return ISL_FORMAT_R16_UNORM;
+ case GL_R16_SNORM: return ISL_FORMAT_R16_SNORM;
+ case GL_R16F: return ISL_FORMAT_R16_FLOAT;
+ case GL_R16UI: return ISL_FORMAT_R16_UINT;
+ case GL_R16I: return ISL_FORMAT_R16_SINT;
+ case GL_RG16: return ISL_FORMAT_R16G16_UNORM;
+ case GL_RG16_SNORM: return ISL_FORMAT_R16G16_SNORM;
+ case GL_RG16F: return ISL_FORMAT_R16G16_FLOAT;
+ case GL_RG16UI: return ISL_FORMAT_R16G16_UINT;
+ case GL_RG16I: return ISL_FORMAT_R16G16_SINT;
+ case GL_RGBA16: return ISL_FORMAT_R16G16B16A16_UNORM;
+ case GL_RGBA16_SNORM: return ISL_FORMAT_R16G16B16A16_SNORM;
+ case GL_RGBA16F: return ISL_FORMAT_R16G16B16A16_FLOAT;
+ case GL_RGBA16UI: return ISL_FORMAT_R16G16B16A16_UINT;
+ case GL_RGBA16I: return ISL_FORMAT_R16G16B16A16_SINT;
+ case GL_R32F: return ISL_FORMAT_R32_FLOAT;
+ case GL_R32UI: return ISL_FORMAT_R32_UINT;
+ case GL_R32I: return ISL_FORMAT_R32_SINT;
+ case GL_RG32F: return ISL_FORMAT_R32G32_FLOAT;
+ case GL_RG32UI: return ISL_FORMAT_R32G32_UINT;
+ case GL_RG32I: return ISL_FORMAT_R32G32_SINT;
+ case GL_RGBA32F: return ISL_FORMAT_R32G32B32A32_FLOAT;
+ case GL_RGBA32UI: return ISL_FORMAT_R32G32B32A32_UINT;
+ case GL_RGBA32I: return ISL_FORMAT_R32G32B32A32_SINT;
+ case GL_NONE: return ISL_FORMAT_UNSUPPORTED;
+ default:
+ assert(!"Invalid image format");
+ return ISL_FORMAT_UNSUPPORTED; /* reached only when the assert is compiled out */
+ }
+ }
+
+ /**
+  * Four unsigned scalars, one per color channel (r, g, b, a), used to
+  * pass per-component values around.
+  */
+ struct color_u {
+    /** Set all four channels to \p v (zero when omitted). */
+    color_u(unsigned v = 0) : r(v), g(v), b(v), a(v)
+    {
+    }
+
+    /** Set each channel individually. */
+    color_u(unsigned red, unsigned green, unsigned blue, unsigned alpha) :
+       r(red), g(green), b(blue), a(alpha)
+    {
+    }
+
+    /** Channel lookup by index: 0 = r, 1 = g, 2 = b, 3 = a. */
+    unsigned
+    operator[](unsigned i) const
+    {
+       const unsigned channels[] = { r, g, b, a };
+       return channels[i];
+    }
+
+    unsigned r, g, b, a;
+ };
+
+ /**
+  * Look up the bit width of the r, g, b and a channels of \p format in
+  * its ISL format layout.
+  */
+ inline color_u
+ get_bit_widths(isl_format format)
+ {
+    const isl_format_layout *layout = isl_format_get_layout(format);
+    const unsigned red = layout->channels.r.bits;
+    const unsigned green = layout->channels.g.bits;
+    const unsigned blue = layout->channels.b.bits;
+    const unsigned alpha = layout->channels.a.bits;
+
+    return color_u(red, green, blue, alpha);
+ }
+
+ /**
+  * Compute the bit offset of each channel of \p format: every channel
+  * starts where the widths of the channels before it (in r, g, b, a
+  * order) add up to.
+  */
+ inline color_u
+ get_bit_shifts(isl_format format)
+ {
+    const color_u bits = get_bit_widths(format);
+    const unsigned g_shift = bits.r;
+    const unsigned b_shift = g_shift + bits.g;
+    const unsigned a_shift = b_shift + bits.b;
+
+    return color_u(0, g_shift, b_shift, a_shift);
+ }
+
+ /**
+  * Tell whether every channel of \p format that is present (non-zero
+  * width) is as wide as the red channel.
+  */
+ inline bool
+ is_homogeneous(isl_format format)
+ {
+    const color_u bits = get_bit_widths(format);
+
+    /* Index 0 is the red channel itself, so start comparing at green. */
+    for (unsigned c = 1; c < 4; c++) {
+       if (bits[c] != 0 && bits[c] != bits.r)
+          return false;
+    }
+
+    return true;
+ }
+
+ /**
+  * Tell whether converting \p format amounts to a plain copy: either the
+  * format is homogeneous with a 32-bit red channel, or lowering it for
+  * storage image use on \p devinfo yields the same format.
+  */
+ inline bool
+ is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
+ {
+    if (get_bit_widths(format).r == 32 && is_homogeneous(format))
+       return true;
+
+    return isl_lower_storage_image_format(devinfo, format) == format;
+ }
+
+ /**
+  * Tell whether \p format and the format it is lowered to on \p devinfo
+  * have the same bit width in every channel, i.e. the bitfield layouts
+  * agree even if the data types may not.
+  */
+ inline bool
+ has_supported_bit_layout(const gen_device_info *devinfo,
+                          isl_format format)
+ {
+    const color_u native = get_bit_widths(format);
+    const color_u lowered = get_bit_widths(
+       isl_lower_storage_image_format(devinfo, format));
+
+    for (unsigned c = 0; c < 4; c++) {
+       if (native[c] != lowered[c])
+          return false;
+    }
+
+    return true;
+ }
+
+ /**
+  * Tell whether the format that \p format is lowered to on \p devinfo
+  * has more channels than \p format itself, which means individual
+  * components get spread over several components of the lowered format
+  * (RG32 and friends implemented as RGBA16UI).
+  */
+ inline bool
+ has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
+ {
+    const isl_format lowered =
+       isl_lower_storage_image_format(devinfo, format);
+
+    return isl_format_get_num_channels(lowered) >
+           isl_format_get_num_channels(format);
+ }
+
+ /**
+  * Tell whether the unused high bits of each component read back from
+  * \p format may hold garbage.  This is reported for gen 7 devices other
+  * than Haswell when the format lowers to R16_UINT or R8_UINT: on IVB we
+  * rely on the undocumented behavior that typed reads from surfaces of
+  * the unsupported R8 and R16 formats return useful data in their least
+  * significant bits.
+  */
+ inline bool
+ has_undefined_high_bits(const gen_device_info *devinfo,
+                         isl_format format)
+ {
+    const isl_format lowered =
+       isl_lower_storage_image_format(devinfo, format);
+    const bool gen7_not_hsw = devinfo->gen == 7 && !devinfo->is_haswell;
+
+    if (!gen7_not_hsw)
+       return false;
+
+    return lowered == ISL_FORMAT_R16_UINT ||
+           lowered == ISL_FORMAT_R8_UINT;
+ }
+
+ /**
+  * Tell whether \p format has an SNORM or SINT channel, i.e. stores
+  * signed values that require sign extension when unpacking.
+  */
+ inline bool
+ needs_sign_extension(isl_format format)
+ {
+    if (isl_format_has_snorm_channel(format))
+       return true;
+
+    return isl_format_has_sint_channel(format);
+ }
+ }
+
+ namespace image_validity {
+ /**
+  * Check whether the bound image is suitable for untyped access.
+  *
+  * On gen 7 devices other than Haswell this emits a comparison of the
+  * image's stride parameter against four, predicated with \p pred, and
+  * returns BRW_PREDICATE_NORMAL.  On every other device nothing is
+  * emitted and \p pred is returned unchanged.
+  */
+ brw_predicate
+ emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
+                          brw_predicate pred)
+ {
+    const gen_device_info *devinfo = bld.shader->devinfo;
+    const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
+    const bool gen7_not_hsw = devinfo->gen == 7 && !devinfo->is_haswell;
+
+    /* More recent generations handle the format mismatch gracefully. */
+    if (!gen7_not_hsw)
+       return pred;
+
+    /* Check whether the first stride component (i.e. the Bpp value) is
+     * greater than four, which on Gen7 indicates that a surface of type
+     * RAW has been bound for untyped access.  Reading or writing to a
+     * surface of type other than RAW using untyped surface messages
+     * causes a hang on IVB and VLV.
+     */
+    fs_inst *cmp = bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
+                           BRW_CONDITIONAL_G);
+    set_predicate(pred, cmp);
+
+    return BRW_PREDICATE_NORMAL;
+ }
+
+ /**
+  * Check whether there is an image bound at the given index and write
+  * the comparison result to f0.0.  Returns the predication mode to use
+  * on subsequent image operations: BRW_PREDICATE_NORMAL when a check was
+  * emitted, BRW_PREDICATE_NONE otherwise.
+  */
+ brw_predicate
+ emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
+ {
+    const gen_device_info *const info = bld.shader->devinfo;
+    const fs_reg extent = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
+    const bool needs_check = (info->gen == 7 && !info->is_haswell);
+
+    if (needs_check) {
+       /* Look at the first component of the size field to find out if the
+        * image is bound.  Necessary on IVB for typed atomics because they
+        * don't seem to respect null surfaces and will happily corrupt or
+        * read random memory when no image is bound.
+        */
+       bld.CMP(bld.null_reg_ud(),
+               retype(extent, BRW_REGISTER_TYPE_UD),
+               brw_imm_d(0), BRW_CONDITIONAL_NZ);
+    }
+
+    /* More recent platforms implement compliant behavior when a null
+     * surface is bound, so no predication is needed there.
+     */
+    return needs_check ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
+ }
+
+ /**
+  * Check whether the provided coordinates are within the image bounds
+  * and write the comparison result to f0.0.  Returns an appropriate
+  * predication mode to use on subsequent image operations.
+  *
+  * One unsigned "less than" comparison against the image size is emitted
+  * per coordinate, for the first \p dims coordinates of \p addr.
+  */
+ brw_predicate
+ emit_bounds_check(const fs_builder &bld, const fs_reg &image,
+                   const fs_reg &addr, unsigned dims)
+ {
+    const fs_reg extent = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
+    const fs_reg coord = retype(addr, BRW_REGISTER_TYPE_UD);
+
+    /* The first comparison is emitted unpredicated; every later one is
+     * predicated with BRW_PREDICATE_NORMAL.
+     */
+    brw_predicate chain = BRW_PREDICATE_NONE;
+
+    for (unsigned c = 0; c != dims; ++c) {
+       set_predicate(chain,
+                     bld.CMP(bld.null_reg_ud(),
+                             offset(coord, bld, c),
+                             offset(extent, bld, c),
+                             BRW_CONDITIONAL_L));
+       chain = BRW_PREDICATE_NORMAL;
+    }
+
+    return BRW_PREDICATE_NORMAL;
+ }
+ }
+
+ namespace image_coordinates {
+ /**
+  * Return the total number of coordinates needed to address a texel of
+  * the surface, which may be more than the sum of \p surf_dims and \p
+  * arr_dims if padding is required.
+  */
+ unsigned
+ num_image_coordinates(const fs_builder &bld,
+                       unsigned surf_dims, unsigned arr_dims,
+                       isl_format format)
+ {
+    /* HSW in vec4 mode and our software coordinate handling for untyped
+     * reads want the array index to be at the Z component.
+     */
+    bool array_index_at_z = false;
+
+    if (format != ISL_FORMAT_UNSUPPORTED)
+       array_index_at_z = !isl_has_matching_typed_storage_image_format(
+          bld.shader->devinfo, format);
+
+    unsigned total = surf_dims + arr_dims;
+
+    /* A one-dimensional array needs one extra (zero) coordinate so that
+     * the array index ends up in Z.
+     */
+    if (surf_dims == 1 && arr_dims == 1 && array_index_at_z)
+       ++total;
+
+    return total;
+ }
+
+ /**
+  * Transform image coordinates into the form expected by the
+  * implementation.  \p addr is returned untouched unless padding is
+  * required, in which case a freshly allocated three-component vector is
+  * returned instead.
+  */
+ fs_reg
+ emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
+                        unsigned surf_dims, unsigned arr_dims,
+                        isl_format format)
+ {
+    const unsigned dims =
+       num_image_coordinates(bld, surf_dims, arr_dims, format);
+
+    /* Nothing to do when no padding coordinate is needed. */
+    if (dims <= surf_dims + arr_dims)
+       return addr;
+
+    assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
+
+    /* The array index is required to be passed in as the Z component,
+     * so build (x, 0, index) to shift it to the right position.
+     *
+     * FINISHME: Factor out this frequently recurring pattern into a
+     * helper function.
+     */
+    const fs_reg components[] = {
+       addr, brw_imm_d(0), offset(addr, bld, 1)
+    };
+    const fs_reg padded = bld.vgrf(addr.type, dims);
+
+    bld.LOAD_PAYLOAD(padded, components, dims, 0);
+
+    return padded;
+ }
+
+ /**
+ * Calculate the offset in memory of the texel given by \p coord.
+ *
+ * This is meant to be used with untyped surface messages to access a
+ * tiled surface, which involves taking into account the tiling and
+ * swizzling modes of the surface manually so it will hopefully not
+ * happen very often.
+ *
+ * The tiling algorithm implemented here matches either the X or Y
+ * tiling layouts supported by the hardware depending on the tiling
+ * coefficients passed to the program as uniforms. See Volume 1 Part 2
+ * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
+ * explanation of the hardware tiling format.
+ *
+ * The offset, stride, tiling and swizzling parameters are read from
+ * \p image at the corresponding BRW_IMAGE_PARAM_*_OFFSET positions.
+ * Only the first \p dims components of \p coord are read; missing X/Y
+ * components are treated as zero. The result is returned in a newly
+ * allocated register of type UD.
+ */
+ fs_reg
+ emit_address_calculation(const fs_builder &bld, const fs_reg &image,
+ const fs_reg &coord, unsigned dims)
+ {
+ const gen_device_info *devinfo = bld.shader->devinfo;
+ const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
+ const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
+ const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
+ const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
+ const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+ const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+ const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+ const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+ const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+ /* Shift the coordinates by the fixed surface offset. It may be
+ * non-zero if the image is a single slice of a higher-dimensional
+ * surface, or if a non-zero mipmap level of the surface is bound to
+ * the pipeline. The offset needs to be applied here rather than at
+ * surface state set-up time because the desired slice-level may
+ * start mid-tile, so simply shifting the surface base address
+ * wouldn't give a well-formed tiled surface in the general case.
+ */
+ for (unsigned c = 0; c < 2; ++c)
+ bld.ADD(offset(addr, bld, c), offset(off, bld, c),
+ (c < dims ?
+ offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
+ fs_reg(brw_imm_d(0))));
+
+ /* The layout of 3-D textures in memory is sort-of like a tiling
+ * format. At each miplevel, the slices are arranged in rows of
+ * 2^level slices per row. The slice row is stored in tmp.y and
+ * the slice within the row is stored in tmp.x.
+ *
+ * The layout of 2-D array textures and cubemaps is much simpler:
+ * Depending on whether the ARYSPC_LOD0 layout is in use it will be
+ * stored in memory as an array of slices, each one being a 2-D
+ * arrangement of miplevels, or as a 2D arrangement of miplevels,
+ * each one being an array of slices. In either case the separation
+ * between slices of the same LOD is equal to the qpitch value
+ * provided as stride.w.
+ *
+ * This code can be made to handle both 2D arrays and 3D textures
+ * by passing in the miplevel as tile.z for 3-D textures and 0 in
+ * tile.z for 2-D array textures.
+ *
+ * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
+ * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
+ * of the hardware 3D texture and 2D array layouts.
+ */
+ if (dims > 2) {
+ /* Decompose z into a major (tmp.y) and a minor (tmp.x)
+ * index.
+ */
+ bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
+ offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
+ bld.SHR(offset(tmp, bld, 1),
+ offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
+ offset(tile, bld, 2));
+
+ /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
+ * slice offset.
+ */
+ for (unsigned c = 0; c < 2; ++c) {
+ bld.MUL(offset(tmp, bld, c),
+ offset(stride, bld, 2 + c), offset(tmp, bld, c));
+ bld.ADD(offset(addr, bld, c),
+ offset(addr, bld, c), offset(tmp, bld, c));
+ }
+ }
+
+ if (dims > 1) {
+ /* Calculate the major/minor x and y indices. In order to
+ * accommodate both X and Y tiling, the Y-major tiling format is
+ * treated as being a bunch of narrow X-tiles placed next to each
+ * other. This means that the tile width for Y-tiling is actually
+ * the width of one sub-column of the Y-major tile where each 4K
+ * tile has 8 512B sub-columns.
+ *
+ * The major Y value is the row of tiles in which the pixel lives.
+ * The major X value is the tile sub-column in which the pixel
+ * lives; for X tiling, this is the same as the tile column, for Y
+ * tiling, each tile has 8 sub-columns. The minor X and Y indices
+ * are the position within the sub-column.
+ */
+ for (unsigned c = 0; c < 2; ++c) {
+ /* Calculate the minor x and y indices. */
+ bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
+ brw_imm_d(0), offset(addr, bld, c));
+
+ /* Calculate the major x and y indices. */
+ bld.SHR(offset(major, bld, c),
+ offset(addr, bld, c), offset(tile, bld, c));
+ }
+
+ /* Calculate the texel index from the start of the tile row and
+ * the vertical coordinate of the row.
+ * Equivalent to:
+ * tmp.x = (major.x << tile.y << tile.x) +
+ * (minor.y << tile.x) + minor.x
+ * tmp.y = major.y << tile.y
+ */
+ bld.SHL(tmp, major, offset(tile, bld, 1));
+ bld.ADD(tmp, tmp, offset(minor, bld, 1));
+ bld.SHL(tmp, tmp, offset(tile, bld, 0));
+ bld.ADD(tmp, tmp, minor);
+ bld.SHL(offset(tmp, bld, 1),
+ offset(major, bld, 1), offset(tile, bld, 1));
+
+ /* Add it to the start of the tile row. */
+ bld.MUL(offset(tmp, bld, 1),
+ offset(tmp, bld, 1), offset(stride, bld, 1));
+ bld.ADD(tmp, tmp, offset(tmp, bld, 1));
+
+ /* Multiply by the Bpp value. */
+ bld.MUL(dst, tmp, stride);
+
+ if (devinfo->gen < 8 && !devinfo->is_baytrail) {
+ /* Take into account the two dynamically specified shifts.
+ * Both are used to implement swizzling of X-tiled
+ * surfaces. For Y-tiled surfaces only one bit needs to be
+ * XOR-ed with bit 6 of the memory address, so a swz value of
+ * 0xff (actually interpreted as 31 by the hardware) will be
+ * provided to cause the relevant bit of tmp.y to be zero and
+ * turn the first XOR into the identity. For linear surfaces
+ * or platforms lacking address swizzling both shifts will be
+ * 0xff causing the relevant bits of both tmp.x and .y to be
+ * zero, which effectively disables swizzling.
+ */
+ for (unsigned c = 0; c < 2; ++c)
+ bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));
+
+ /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
+ bld.XOR(tmp, tmp, offset(tmp, bld, 1));
+ bld.AND(tmp, tmp, brw_imm_d(1 << 6));
+ bld.XOR(dst, dst, tmp);
+ }
+
+ } else {
+ /* Multiply by the Bpp/stride value. Note that the addr.y may be
+ * non-zero even if the image is one-dimensional because a
+ * vertical offset may have been applied above to select a
+ * non-zero slice or level of a higher-dimensional texture.
+ */
+ bld.MUL(offset(addr, bld, 1),
+ offset(addr, bld, 1), offset(stride, bld, 1));
+ bld.ADD(addr, addr, offset(addr, bld, 1));
+ bld.MUL(dst, addr, stride);
+ }
+
+ return dst;
+ }
+ }
+
+ namespace image_format_conversion {
+ using image_format_info::color_u;
+
+ namespace {
+    /**
+     * Maximum representable value in an unsigned integer with the given
+     * number of bits.
+     *
+     * The computation is done entirely in unsigned arithmetic: the
+     * previous signed form `(1 << n) - 1` overflowed a signed int for
+     * n == 31 and shifted by the full width of the type (undefined
+     * behavior) for n == 32.  Widths greater than or equal to the width
+     * of `unsigned` saturate to the all-ones value.
+     */
+    inline unsigned
+    scale(unsigned n)
+    {
+       const unsigned bits = 8 * sizeof(unsigned);
+
+       return n >= bits ? ~0u : (1u << n) - 1u;
+    }
+ }
+
+ /**
+  * Pack the vector \p src in a bitfield given the per-component bit
+  * shifts and widths.  Note that bitfield components are not allowed to
+  * cross 32-bit boundaries.
+  *
+  * Components with a zero width are skipped.  The result is a freshly
+  * allocated four-component UD vector.
+  */
+ fs_reg
+ emit_pack(const fs_builder &bld, const fs_reg &src,
+           const color_u &shifts, const color_u &widths)
+ {
+    const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+    /* Bit i is set once dword i of dst has received its first field. */
+    unsigned written = 0;
+
+    for (unsigned c = 0; c < 4; ++c) {
+       if (!widths[c])
+          continue;
+
+       const unsigned dword = shifts[c] / 32;
+       const fs_reg slot = offset(dst, bld, dword);
+       const fs_reg field = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+       /* Move the component to its bit position within the dword. */
+       bld.SHL(field, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));
+
+       /* The first field of a dword initializes it, later fields are
+        * merged in.
+        */
+       if (written & (1u << dword)) {
+          bld.OR(slot, slot, field);
+       } else {
+          bld.MOV(slot, field);
+          written |= 1u << dword;
+       }
+    }
+
+    return dst;
+ }
+
+ /**
+  * Unpack a vector from the bitfield \p src given the per-component bit
+  * shifts and widths.  Note that bitfield components are not allowed to
+  * cross 32-bit boundaries.
+  *
+  * Components with a zero width are left unwritten.  The result has the
+  * same register type as \p src.
+  */
+ fs_reg
+ emit_unpack(const fs_builder &bld, const fs_reg &src,
+             const color_u &shifts, const color_u &widths)
+ {
+    const fs_reg dst = bld.vgrf(src.type, 4);
+
+    for (unsigned c = 0; c < 4; ++c) {
+       if (!widths[c])
+          continue;
+
+       const unsigned dword = shifts[c] / 32;
+       const unsigned lsb = shifts[c] % 32;
+       const fs_reg out = offset(dst, bld, c);
+
+       /* Push the field up against bit 31, discarding whatever sits
+        * above it.
+        */
+       bld.SHL(out, offset(src, bld, dword),
+               brw_imm_ud(32 - lsb - widths[c]));
+
+       /* Bring it back down to bit 0 with an arithmetic shift so that
+        * signed types get sign extension.
+        */
+       bld.ASR(out, out, brw_imm_ud(32 - widths[c]));
+    }
+
+    return dst;
+ }
+
+ /**
+  * Convert an integer vector into another integer vector of the
+  * specified bit widths, properly handling overflow.
+  *
+  * \p src is expected to already have the register type matching
+  * \p is_signed (D when signed, UD otherwise).
+  */
+ fs_reg
+ emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
+                         const color_u &widths, bool is_signed)
+ {
+    const unsigned s = (is_signed ? 1 : 0);
+    const fs_reg dst = bld.vgrf(
+       is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
+    assert(src.type == dst.type);
+
+    for (unsigned c = 0; c < 4; ++c) {
+       if (!widths[c])
+          continue;
+
+       const fs_reg out = offset(dst, bld, c);
+
+       /* Upper clamp. */
+       bld.emit_minmax(out, offset(src, bld, c),
+                       brw_imm_d((int)scale(widths[c] - s)),
+                       BRW_CONDITIONAL_L);
+
+       if (is_signed) {
+          /* Lower clamp. */
+          bld.emit_minmax(out, out,
+                          brw_imm_d(-(int)scale(widths[c] - s) - 1),
+                          BRW_CONDITIONAL_GE);
+
+          /* Mask off all but the bits we actually want.  Otherwise, if
+           * we pass a negative number into the hardware when it's
+           * expecting something like UINT8, it will happily clamp it to
+           * +255 for us.
+           */
+          if (widths[c] < 32)
+             bld.AND(out, out, brw_imm_d(scale(widths[c])));
+       }
+    }
+
+    return dst;
+ }
+
+ /**
+  * Convert a normalized fixed-point vector of the specified signedness
+  * and bit widths into a floating point vector.  Components with a zero
+  * width are left unwritten.
+  */
+ fs_reg
+ emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
+                          const color_u &widths, bool is_signed)
+ {
+    const unsigned s = (is_signed ? 1 : 0);
+    const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+
+    for (unsigned c = 0; c < 4; ++c) {
+       if (!widths[c])
+          continue;
+
+       const fs_reg out = offset(dst, bld, c);
+       /* Reciprocal of the largest magnitude the component can hold. */
+       const float norm = 1.0f / scale(widths[c] - s);
+
+       /* Integer to float conversion. */
+       bld.MOV(out, offset(src, bld, c));
+
+       /* Normalize. */
+       bld.MUL(out, out, brw_imm_f(norm));
+
+       /* Signed values are clamped from below at -1.0. */
+       if (is_signed)
+          bld.emit_minmax(out, out, brw_imm_f(-1.0f),
+                          BRW_CONDITIONAL_GE);
+    }
+
+    return dst;
+ }
+
+ /**
+  * Convert a floating-point vector into a normalized fixed-point vector
+  * of the specified signedness and bit widths.  Components with a zero
+  * width are left unwritten.
+  */
+ fs_reg
+ emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
+                        const color_u &widths, bool is_signed)
+ {
+    const unsigned s = (is_signed ? 1 : 0);
+    const fs_reg dst = bld.vgrf(
+       is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
+    const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
+
+    for (unsigned c = 0; c < 4; ++c) {
+       if (!widths[c])
+          continue;
+
+       const fs_reg out_i = offset(dst, bld, c);
+       const fs_reg out_f = offset(fdst, bld, c);
+       const fs_reg comp = offset(src, bld, c);
+
+       /* Bring the input into the normalized range first. */
+       if (!is_signed) {
+          set_saturate(true, bld.MOV(out_f, comp));
+       } else {
+          bld.emit_minmax(out_f, comp,
+                          brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);
+
+          bld.emit_minmax(out_f, out_f,
+                          brw_imm_f(1.0f), BRW_CONDITIONAL_L);
+       }
+
+       /* Scale up by the normalization constant. */
+       bld.MUL(out_f, out_f, brw_imm_f((float)scale(widths[c] - s)));
+
+       /* Round, then convert to integer. */
+       bld.RNDE(out_f, out_f);
+       bld.MOV(out_i, out_f);
+
+       /* Mask off all but the bits we actually want.  Otherwise, if we
+        * pass a negative number into the hardware when it's expecting
+        * something like UINT8, it will happily clamp it to +255 for us.
+        */
+       if (is_signed && widths[c] < 32)
+          bld.AND(out_i, out_i, brw_imm_d(scale(widths[c])));
+    }
+
+    return dst;
+ }
+
+ /**
+  * Convert a floating point vector of the specified bit widths into a
+  * 32-bit floating point vector.  Components with a zero width are left
+  * unwritten.
+  */
+ fs_reg
+ emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
+                         const color_u &widths)
+ {
+    const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+    const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
+
+    for (unsigned c = 0; c < 4; ++c) {
+       if (!widths[c])
+          continue;
+
+       const fs_reg bits = offset(dst, bld, c);
+
+       bld.MOV(bits, offset(src, bld, c));
+
+       /* 10-bit and 11-bit floating point numbers are widened to 15
+        * bits.  This works because they have a 5-bit exponent just like
+        * the 16-bit floating point format, and they have no sign bit.
+        */
+       if (widths[c] < 16)
+          bld.SHL(bits, bits, brw_imm_ud(15 - widths[c]));
+
+       /* Half to single precision. */
+       bld.F16TO32(offset(fdst, bld, c), bits);
+    }
+
+    return fdst;
+ }
+
+ /**
+  * Convert a vector into a floating point vector of the specified bit
+  * widths.  Components with a zero width are left unwritten; the result
+  * is returned as a UD vector holding the raw bits.
+  */
+ fs_reg
+ emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
+                       const color_u &widths)
+ {
+    const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+    const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
+
+    for (unsigned c = 0; c < 4; ++c) {
+       if (!widths[c])
+          continue;
+
+       const fs_reg bits = offset(dst, bld, c);
+       const fs_reg value = offset(fdst, bld, c);
+       /* Formats narrower than 16 bits have no sign bit. */
+       const bool narrow = widths[c] < 16;
+
+       bld.MOV(value, offset(src, bld, c));
+
+       /* Clamp to the minimum value. */
+       if (narrow)
+          bld.emit_minmax(value, value,
+                          brw_imm_f(0.0f), BRW_CONDITIONAL_GE);
+
+       /* Single to half precision. */
+       bld.F32TO16(bits, value);
+
+       /* Drop the least significant bits to get a floating point number
+        * of the requested width.  This works because the 10-bit and
+        * 11-bit floating point formats have a 5-bit exponent just like
+        * the 16-bit format, and they have no sign bit.
+        */
+       if (narrow)
+          bld.SHR(bits, bits, brw_imm_ud(15 - widths[c]));
+    }
+
+    return dst;
+ }
+
+ /**
+  * Fill missing components of a vector with 0, 0, 0, 1.  A component is
+  * considered missing when its entry in \p widths is zero.
+  */
+ fs_reg
+ emit_pad(const fs_builder &bld, const fs_reg &src,
+          const color_u &widths)
+ {
+    const fs_reg dst = bld.vgrf(src.type, 4);
+
+    for (unsigned c = 0; c < 4; ++c) {
+       const fs_reg out = offset(dst, bld, c);
+
+       if (widths[c]) {
+          bld.MOV(out, offset(src, bld, c));
+       } else {
+          /* Only the last component defaults to one. */
+          bld.MOV(out, fs_reg(brw_imm_ud(c == 3 ? 1u : 0u)));
+       }
+    }
+
+    return dst;
+ }
+ }
+}
+
+namespace brw {
+ namespace image_access {
+ /**
+ * Load a vector from a surface of the given format and dimensionality
+ * at the given coordinates. \p surf_dims and \p arr_dims give the
+ * number of non-array and array coordinates of the image respectively.
+ *
+ * \p gl_format is translated with isl_format_for_gl_format(). A typed
+ * read is used when the hardware has a matching typed storage format;
+ * otherwise the data is fetched with a bounds-checked untyped read and
+ * unpacked and converted in the shader. The value returned is the
+ * result of emit_pad(), i.e. components missing from the format are
+ * filled in with 0, 0, 0, 1.
+ */
+ fs_reg
+ emit_image_load(const fs_builder &bld,
+ const fs_reg &image, const fs_reg &addr,
+ unsigned surf_dims, unsigned arr_dims,
+ unsigned gl_format)
+ {
+ using namespace image_format_info;
+ using namespace image_format_conversion;
+ using namespace image_validity;
+ using namespace image_coordinates;
+ using namespace surface_access;
+ const gen_device_info *devinfo = bld.shader->devinfo;
+ const isl_format format = isl_format_for_gl_format(gl_format);
+ const isl_format lower_format =
+ isl_lower_storage_image_format(devinfo, format);
+ fs_reg tmp;
+
+ /* Transform the image coordinates into actual surface coordinates. */
+ const fs_reg saddr =
+ emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
+ const unsigned dims =
+ num_image_coordinates(bld, surf_dims, arr_dims, format);
+
+ if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
+ /* Hopefully we get here most of the time... */
+ tmp = emit_typed_read(bld, image, saddr, dims,
+ isl_format_get_num_channels(lower_format));
+ } else {
+ /* Untyped surface reads return 32 bits of the surface per
+ * component, without any sort of unpacking or type conversion,
+ */
+ const unsigned size = isl_format_get_layout(format)->bpb / 32;
+ /* they don't properly handle out of bounds access, so we have to
+ * check manually if the coordinates are valid and predicate the
+ * surface read on the result,
+ */
+ const brw_predicate pred =
+ emit_untyped_image_check(bld, image,
+ emit_bounds_check(bld, image,
+ saddr, dims));
+
+ /* and they don't know about surface coordinates, so we need to
+ * convert them to a raw memory offset.
+ */
+ const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);
+
+ tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);
+
+ /* An out of bounds surface access should give zero as result. */
+ for (unsigned c = 0; c < size; ++c)
+ set_predicate(pred, bld.SEL(offset(tmp, bld, c),
+ offset(tmp, bld, c), brw_imm_d(0)));
+ }
+
+ /* Set the register type to D instead of UD if the data type is
+ * represented as a signed integer in memory so that sign extension
+ * is handled correctly by unpack.
+ */
+ if (needs_sign_extension(format))
+ tmp = retype(tmp, BRW_REGISTER_TYPE_D);
+
+ if (!has_supported_bit_layout(devinfo, format)) {
+ /* Unpack individual vector components from the bitfield if the
+ * hardware is unable to do it for us. Formats with a split bit
+ * layout come back from the hardware as more channels than the
+ * format has, so those are packed back together using the layout
+ * of the lowered format instead.
+ */
+ if (has_split_bit_layout(devinfo, format))
+ tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
+ get_bit_widths(lower_format));
+ else
+ tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
+ get_bit_widths(format));
+
+ } else if ((needs_sign_extension(format) &&
+ !is_conversion_trivial(devinfo, format)) ||
+ has_undefined_high_bits(devinfo, format)) {
+ /* Perform a trivial unpack even though the bit layout matches in
+ * order to get the most significant bits of each component
+ * initialized properly.
+ */
+ tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
+ get_bit_widths(format));
+ }
+
+ if (!isl_format_has_int_channel(format)) {
+ if (is_conversion_trivial(devinfo, format)) {
+ /* Just need to cast the vector to the target type. */
+ tmp = retype(tmp, BRW_REGISTER_TYPE_F);
+ } else {
+ /* Do the right sort of type conversion to float. */
+ if (isl_format_has_float_channel(format))
+ tmp = emit_convert_from_float(
+ bld, tmp, get_bit_widths(format));
+ else
+ tmp = emit_convert_from_scaled(
+ bld, tmp, get_bit_widths(format),
+ isl_format_has_snorm_channel(format));
+ }
+ }
+
+ /* Initialize missing components of the result. */
+ return emit_pad(bld, tmp, get_bit_widths(format));
+ }
+
+ /**
+ * Store a vector in a surface of the given format and dimensionality at
+ * the given coordinates. \p surf_dims and \p arr_dims give the number
+ * of non-array and array coordinates of the image respectively.
+ *
+ * A \p gl_format of GL_NONE results in a plain four-component typed
+ * write of \p src. For any other format \p src is converted and packed
+ * in the shader as needed, then written with a typed write when the
+ * hardware has a matching typed storage format, or with a
+ * bounds-checked untyped write otherwise.
+ */
+ void
+ emit_image_store(const fs_builder &bld, const fs_reg &image,
+ const fs_reg &addr, const fs_reg &src,
+ unsigned surf_dims, unsigned arr_dims,
+ unsigned gl_format)
+ {
+ using namespace image_format_info;
+ using namespace image_format_conversion;
+ using namespace image_validity;
+ using namespace image_coordinates;
+ using namespace surface_access;
+ const isl_format format = isl_format_for_gl_format(gl_format);
+ const gen_device_info *devinfo = bld.shader->devinfo;
+
+ /* Transform the image coordinates into actual surface coordinates. */
+ const fs_reg saddr =
+ emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
+ const unsigned dims =
+ num_image_coordinates(bld, surf_dims, arr_dims, format);
+
+ if (gl_format == GL_NONE) {
+ /* We don't know what the format is, but that's fine because it
+ * implies write-only access, and typed surface writes are always
+ * able to take care of type conversion and packing for us.
+ */
+ emit_typed_write(bld, image, saddr, src, dims, 4);
+
+ } else {
+ const isl_format lower_format =
+ isl_lower_storage_image_format(devinfo, format);
+ fs_reg tmp = src;
+
+ if (!is_conversion_trivial(devinfo, format)) {
+ /* Do the right sort of type conversion. */
+ if (isl_format_has_float_channel(format))
+ tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));
+
+ else if (isl_format_has_int_channel(format))
+ tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
+ isl_format_has_sint_channel(format));
+
+ else
+ tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
+ isl_format_has_snorm_channel(format));
+ }
+
+ /* We're down to bit manipulation at this point. */
+ tmp = retype(tmp, BRW_REGISTER_TYPE_UD);
+
+ if (!has_supported_bit_layout(devinfo, format)) {
+ /* Pack the vector components into a bitfield if the hardware
+ * is unable to do it for us. Formats with a split bit layout
+ * are written through a lowered format with more channels than
+ * the format has, so those are split up using the layout of the
+ * lowered format instead.
+ */
+ if (has_split_bit_layout(devinfo, format))
+ tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
+ get_bit_widths(lower_format));
+
+ else
+ tmp = emit_pack(bld, tmp, get_bit_shifts(format),
+ get_bit_widths(format));
+ }
+
+ if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
+ /* Hopefully we get here most of the time... */
+ emit_typed_write(bld, image, saddr, tmp, dims,
+ isl_format_get_num_channels(lower_format));
+
+ } else {
+ /* Untyped surface writes store 32 bits of the surface per
+ * component, without any sort of packing or type conversion,
+ */
+ const unsigned size = isl_format_get_layout(format)->bpb / 32;
+
+ /* they don't properly handle out of bounds access, so we have
+ * to check manually if the coordinates are valid and predicate
+ * the surface write on the result,
+ */
+ const brw_predicate pred =
+ emit_untyped_image_check(bld, image,
+ emit_bounds_check(bld, image,
+ saddr, dims));
+
+ /* and, phew, they don't know about surface coordinates, so we
+ * need to convert them to a raw memory offset.
+ */
+ const fs_reg laddr = emit_address_calculation(
+ bld, image, saddr, dims);
+
+ emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
+ }
+ }
+ }
+
+ /**
+  * Perform an atomic read-modify-write operation in a surface of the
+  * given dimensionality at the given coordinates.  \p surf_dims and \p
+  * arr_dims give the number of non-array and array coordinates of the
+  * image respectively.  Main building block of the imageAtomic GLSL
+  * built-ins.
+  *
+  * The result is returned retyped to the register type of \p src0.
+  */
+ fs_reg
+ emit_image_atomic(const fs_builder &bld,
+                   const fs_reg &image, const fs_reg &addr,
+                   const fs_reg &src0, const fs_reg &src1,
+                   unsigned surf_dims, unsigned arr_dims,
+                   unsigned rsize, unsigned op)
+ {
+    using namespace image_validity;
+    using namespace image_coordinates;
+    using namespace surface_access;
+
+    /* Format used for the purpose of coordinate handling. */
+    const isl_format coord_format = ISL_FORMAT_R32_UINT;
+
+    /* Guard against performing the atomic on an unbound surface. */
+    const brw_predicate pred = emit_typed_atomic_check(bld, image);
+
+    /* Image coordinates -> actual surface coordinates. */
+    const fs_reg saddr =
+       emit_image_coordinates(bld, addr, surf_dims, arr_dims, coord_format);
+    const unsigned dims =
+       num_image_coordinates(bld, surf_dims, arr_dims, coord_format);
+
+    /* Thankfully we can do without untyped atomics here. */
+    const fs_reg result = emit_typed_atomic(bld, image, saddr, src0, src1,
+                                            dims, rsize, op, pred);
+
+    /* An unbound surface access should give zero as result. */
+    if (rsize && pred)
+       set_predicate(pred, bld.SEL(result, result, brw_imm_d(0)));
+
+    return retype(result, src0.type);
+ }
+ }
+}
diff --git a/src/intel/compiler/brw_fs_surface_builder.h b/src/intel/compiler/brw_fs_surface_builder.h
new file mode 100644
index 00000000000..32b56d387f6
--- /dev/null
+++ b/src/intel/compiler/brw_fs_surface_builder.h
@@ -0,0 +1,88 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2013-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_FS_SURFACE_BUILDER_H
+#define BRW_FS_SURFACE_BUILDER_H
+
+#include "brw_fs_builder.h"
+
+namespace brw {
+   /**
+    * Low-level surface message helpers.  The implementations are not part
+    * of this header; the notes below only restate what the signatures
+    * show.  The predicated variants default to BRW_PREDICATE_NONE.
+    */
+   namespace surface_access {
+      /** Untyped surface read; optionally predicated on \p pred. */
+      fs_reg emit_untyped_read(const fs_builder &bld,
+                               const fs_reg &surface,
+                               const fs_reg &addr,
+                               unsigned dims,
+                               unsigned size,
+                               brw_predicate pred = BRW_PREDICATE_NONE);
+
+      /** Untyped surface write of \p src; optionally predicated. */
+      void emit_untyped_write(const fs_builder &bld,
+                              const fs_reg &surface,
+                              const fs_reg &addr,
+                              const fs_reg &src,
+                              unsigned dims,
+                              unsigned size,
+                              brw_predicate pred = BRW_PREDICATE_NONE);
+
+      /** Untyped atomic operation \p op; optionally predicated. */
+      fs_reg emit_untyped_atomic(const fs_builder &bld,
+                                 const fs_reg &surface,
+                                 const fs_reg &addr,
+                                 const fs_reg &src0,
+                                 const fs_reg &src1,
+                                 unsigned dims,
+                                 unsigned rsize,
+                                 unsigned op,
+                                 brw_predicate pred = BRW_PREDICATE_NONE);
+
+      /** Typed surface read (takes no predicate argument). */
+      fs_reg emit_typed_read(const fs_builder &bld,
+                             const fs_reg &surface,
+                             const fs_reg &addr,
+                             unsigned dims,
+                             unsigned size);
+
+      /** Typed surface write of \p src (takes no predicate argument). */
+      void emit_typed_write(const fs_builder &bld,
+                            const fs_reg &surface,
+                            const fs_reg &addr,
+                            const fs_reg &src,
+                            unsigned dims,
+                            unsigned size);
+
+      /** Typed atomic operation \p op; optionally predicated. */
+      fs_reg emit_typed_atomic(const fs_builder &bld,
+                               const fs_reg &surface,
+                               const fs_reg &addr,
+                               const fs_reg &src0,
+                               const fs_reg &src1,
+                               unsigned dims,
+                               unsigned rsize,
+                               unsigned op,
+                               brw_predicate pred = BRW_PREDICATE_NONE);
+   }
+
+   /**
+    * Image load/store/atomic helpers.  \p surf_dims and \p arr_dims give
+    * the number of non-array and array coordinates of the image
+    * respectively.
+    */
+   namespace image_access {
+      /** Load a vector from an image of format \p gl_format. */
+      fs_reg emit_image_load(const fs_builder &bld,
+                             const fs_reg &image,
+                             const fs_reg &addr,
+                             unsigned surf_dims,
+                             unsigned arr_dims,
+                             unsigned gl_format);
+
+      /** Store the vector \p src in an image of format \p gl_format. */
+      void emit_image_store(const fs_builder &bld,
+                            const fs_reg &image,
+                            const fs_reg &addr,
+                            const fs_reg &src,
+                            unsigned surf_dims,
+                            unsigned arr_dims,
+                            unsigned gl_format);
+
+      /** Atomic read-modify-write operation \p op on an image. */
+      fs_reg emit_image_atomic(const fs_builder &bld,
+                               const fs_reg &image,
+                               const fs_reg &addr,
+                               const fs_reg &src0,
+                               const fs_reg &src1,
+                               unsigned surf_dims,
+                               unsigned arr_dims,
+                               unsigned rsize,
+                               unsigned op);
+   }
+}
+#endif
diff --git a/src/intel/compiler/brw_fs_validate.cpp b/src/intel/compiler/brw_fs_validate.cpp
new file mode 100644
index 00000000000..676942c19c0
--- /dev/null
+++ b/src/intel/compiler/brw_fs_validate.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_validate.cpp
+ *
+ * Implements a pass that validates various invariants of the IR. The current
+ * pass only validates that GRF's uses are sane. More can be added later.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+
+#define fsv_assert(cond) \
+ if (!(cond)) { \
+ fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", stage_abbrev); \
+ dump_instruction(inst, stderr); \
+ fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, #cond); \
+ abort(); \
+ }
+
+void
+fs_visitor::validate()
+{
+   /* Every access to a VGRF, whether through the destination or through
+    * any of the sources, has to fit within the size allocated for that
+    * register.
+    */
+   foreach_block_and_inst (block, fs_inst, inst, cfg) {
+      if (inst->dst.file == VGRF) {
+         fsv_assert(inst->dst.offset / REG_SIZE + regs_written(inst) <=
+                    alloc.sizes[inst->dst.nr]);
+      }
+
+      for (unsigned i = 0; i < inst->sources; ++i) {
+         if (inst->src[i].file != VGRF)
+            continue;
+
+         fsv_assert(inst->src[i].offset / REG_SIZE + regs_read(inst, i) <=
+                    alloc.sizes[inst->src[i].nr]);
+      }
+   }
+}
diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp
new file mode 100644
index 00000000000..cea38d86237
--- /dev/null
+++ b/src/intel/compiler/brw_fs_visitor.cpp
@@ -0,0 +1,953 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_visitor.cpp
+ *
+ * This file supports generating the FS LIR from the GLSL IR. The LIR
+ * makes it easier to do backend-specific optimizations than doing so
+ * in the GLSL IR or in the native code.
+ */
+#include "brw_fs.h"
+#include "compiler/glsl_types.h"
+
+using namespace brw;
+
+/**
+ * Build the ATTR register through which a vertex shader reads the given
+ * system value, and flag the corresponding "uses_*" bit in the VS program
+ * data.
+ *
+ * \param location  one of the SYSTEM_VALUE_* enums handled below
+ * \return a D-typed ATTR register allocated out of mem_ctx
+ */
+fs_reg *
+fs_visitor::emit_vs_system_value(int location)
+{
+   struct brw_vs_prog_data *vs_data = brw_vs_prog_data(prog_data);
+
+   /* The register number starts at four per vertex input the shader reads. */
+   const unsigned first_sysval_nr =
+      4 * _mesa_bitcount_64(nir->info->inputs_read);
+   fs_reg *sysval = new(this->mem_ctx)
+      fs_reg(ATTR, first_sysval_nr, BRW_REGISTER_TYPE_D);
+
+   switch (location) {
+   case SYSTEM_VALUE_BASE_VERTEX:
+      vs_data->uses_basevertex = true;
+      sysval->offset = 0;
+      break;
+
+   case SYSTEM_VALUE_BASE_INSTANCE:
+      vs_data->uses_baseinstance = true;
+      sysval->offset = REG_SIZE;
+      break;
+
+   case SYSTEM_VALUE_VERTEX_ID:
+      unreachable("should have been lowered");
+   case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
+      vs_data->uses_vertexid = true;
+      sysval->offset = 2 * REG_SIZE;
+      break;
+
+   case SYSTEM_VALUE_INSTANCE_ID:
+      vs_data->uses_instanceid = true;
+      sysval->offset = 3 * REG_SIZE;
+      break;
+
+   case SYSTEM_VALUE_DRAW_ID: {
+      /* The draw ID moves four registers further when any of the other
+       * system values above is read as well.
+       */
+      const uint64_t other_sysvals =
+         BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) |
+         BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
+         BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
+         BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID);
+
+      if (nir->info->system_values_read & other_sysvals)
+         sysval->nr += 4;
+      sysval->offset = 0;
+      vs_data->uses_drawid = true;
+      break;
+   }
+
+   default:
+      unreachable("not reached");
+   }
+
+   return sysval;
+}
+
+/**
+ * Sample from the MCS surface attached to this multisample texture.
+ *
+ * \param coordinate  texel coordinate to fetch at
+ * \param components  number of components passed as the coordinate count
+ * \param texture     surface to fetch from; also used as the sampler source
+ * \return the uvec4-typed VGRF that receives the fetched data
+ */
+fs_reg
+fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
+                           const fs_reg &texture)
+{
+   const fs_reg mcs_data = vgrf(glsl_type::uvec4_type);
+
+   fs_reg tex_srcs[TEX_LOGICAL_NUM_SRCS];
+   tex_srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
+   tex_srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components);
+   tex_srcs[TEX_LOGICAL_SRC_SAMPLER] = texture;
+   tex_srcs[TEX_LOGICAL_SRC_SURFACE] = texture;
+   tex_srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate;
+
+   fs_inst *const fetch = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, mcs_data,
+                                   tex_srcs, ARRAY_SIZE(tex_srcs));
+
+   /* Only one or two registers of the response are of interest, but the
+    * sampler always writes 4/8.
+    */
+   fetch->size_written = 4 * mcs_data.component_size(fetch->exec_size);
+
+   return mcs_data;
+}
+
+/**
+ * Apply workarounds for Gen6 gather with UINT/SINT.
+ *
+ * \param wa   workaround flags (WA_8BIT, WA_SIGN); zero means nothing to do
+ * \param dst  first of the four result components, patched up in place
+ */
+void
+fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
+{
+   if (wa == 0)
+      return;
+
+   int width;
+   if (wa & WA_8BIT)
+      width = 8;
+   else
+      width = 16;
+
+   const bool sign_extend = (wa & WA_SIGN) != 0;
+   const int shift = 32 - width;
+
+   /* Walk the four components, stepping dst by one component each time. */
+   for (int chan = 0; chan < 4; chan++, dst = offset(dst, bld, 1)) {
+      const fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
+
+      /* Convert from UNORM to UINT */
+      bld.MUL(dst_f, dst_f, brw_imm_f((1 << width) - 1));
+      bld.MOV(dst, dst_f);
+
+      if (!sign_extend)
+         continue;
+
+      /* Reinterpret the UINT value as a signed INT value: shift the sign
+       * bit into place, then shift back with an arithmetic shift so the
+       * sign is preserved.
+       */
+      bld.SHL(dst, dst, brw_imm_d(shift));
+      bld.ASR(dst, dst, brw_imm_d(shift));
+   }
+}
+
+/**
+ * Emits a dummy fragment shader for bringup purposes: it writes a constant
+ * magenta color with a single end-of-thread FB write and sets up the program
+ * data for a shader without uniforms.
+ */
+void
+fs_visitor::emit_dummy_fs()
+{
+   const int reg_width = dispatch_width / 8;
+
+   /* Everyone's favorite color. */
+   const float magenta[4] = { 1.0, 0.0, 1.0, 0.0 };
+   for (int chan = 0; chan < 4; chan++) {
+      const fs_reg color_mrf(MRF, 2 + chan * reg_width, BRW_REGISTER_TYPE_F);
+      bld.MOV(color_mrf, brw_imm_f(magenta[chan]));
+   }
+
+   fs_inst *const fb_write = bld.emit(FS_OPCODE_FB_WRITE);
+   fb_write->eot = true;
+   if (devinfo->gen < 6) {
+      /* Two header registers precede the color payload. */
+      fb_write->header_size = 2;
+      fb_write->base_mrf = 0;
+      fb_write->mlen = 2 + 4 * reg_width;
+   } else {
+      fb_write->base_mrf = 2;
+      fb_write->mlen = 4 * reg_width;
+   }
+
+   /* Tell the SF we don't have any inputs.  Gen4-5 require at least one
+    * varying to avoid GPU hangs, so set that.
+    */
+   struct brw_wm_prog_data *wm_data = brw_wm_prog_data(this->prog_data);
+   if (devinfo->gen < 6)
+      wm_data->num_varying_inputs = 1;
+   else
+      wm_data->num_varying_inputs = 0;
+   memset(wm_data->urb_setup, -1,
+          sizeof(wm_data->urb_setup[0]) * VARYING_SLOT_MAX);
+
+   /* We don't have any uniforms. */
+   stage_prog_data->nr_params = 0;
+   stage_prog_data->nr_pull_params = 0;
+   stage_prog_data->curb_read_length = 0;
+   stage_prog_data->dispatch_grf_start_reg = 2;
+   wm_data->dispatch_grf_start_reg_2 = 2;
+
+   /* Gen4-5 don't allow zero GRF blocks */
+   grf_used = 1;
+
+   calculate_cfg();
+}
+
+/**
+ * Return the register holding the setup data for one channel of a varying.
+ *
+ * The register location here is relative to the start of the URB data.  It
+ * will get adjusted to be a real location before generate_code() time.
+ *
+ * \param location  varying slot; must have an entry in urb_setup[]
+ * \param channel   component of the varying
+ */
+struct brw_reg
+fs_visitor::interp_reg(int location, int channel)
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
+
+   assert(prog_data->urb_setup[location] != -1);
+
+   /* Two registers per setup slot, two channels per register: even channels
+    * use subregister 0 and odd channels use subregister 4.
+    */
+   const int grf_nr = prog_data->urb_setup[location] * 2 + channel / 2;
+   const int subreg = (channel & 1) ? 4 : 0;
+
+   return brw_vec1_grf(grf_nr, subreg);
+}
+
+/**
+ * Emits the interpolation setup for the varying inputs (gen4 variant):
+ * integer pixel centers, per-pixel deltas from the start coordinates held in
+ * g1, the interpolated position W and its reciprocal.
+ */
+void
+fs_visitor::emit_interpolation_setup_gen4()
+{
+   const struct brw_reg g1_uw =
+      retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
+
+   /* Pixel centers: UW-typed VGRFs computed from the words of g1. */
+   const fs_builder center_bld = bld.annotate("compute pixel centers");
+   this->pixel_x = retype(vgrf(glsl_type::uint_type), BRW_REGISTER_TYPE_UW);
+   this->pixel_y = retype(vgrf(glsl_type::uint_type), BRW_REGISTER_TYPE_UW);
+
+   const fs_reg x_origin(stride(suboffset(g1_uw, 4), 2, 4, 0));
+   const fs_reg y_origin(stride(suboffset(g1_uw, 5), 2, 4, 0));
+   center_bld.ADD(this->pixel_x, x_origin, fs_reg(brw_imm_v(0x10101010)));
+   center_bld.ADD(this->pixel_y, y_origin, fs_reg(brw_imm_v(0x11001100)));
+
+   /* Deltas: pixel center minus the start coordinates in g1.0 / g1.1. */
+   const fs_builder delta_bld = bld.annotate("compute pixel deltas from v0");
+   this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] =
+      vgrf(glsl_type::vec2_type);
+   const fs_reg &delta = this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL];
+   const fs_reg neg_x0(negate(brw_vec1_grf(1, 0)));
+   const fs_reg neg_y0(negate(brw_vec1_grf(1, 1)));
+
+   const bool split_halves = devinfo->has_pln && dispatch_width == 16;
+   if (!split_halves) {
+      delta_bld.ADD(offset(delta, delta_bld, 0), this->pixel_x, neg_x0);
+      delta_bld.ADD(offset(delta, delta_bld, 1), this->pixel_y, neg_y0);
+   } else {
+      /* With PLN in SIMD16, X and Y deltas are interleaved per half: each
+       * half of the dispatch gets its own X/Y pair.
+       */
+      for (unsigned h = 0; h < 2; h++) {
+         const fs_builder half_bld = delta_bld.half(h);
+         const fs_reg delta_pair = offset(delta, delta_bld, h);
+
+         half_bld.ADD(half(delta_pair, 0), half(this->pixel_x, h), neg_x0);
+         half_bld.ADD(half(delta_pair, 1), half(this->pixel_y, h), neg_y0);
+      }
+   }
+
+   const fs_builder w_bld = bld.annotate("compute pos.w and 1/pos.w");
+
+   /* Compute wpos.w.  It's always in our setup, since it's needed to
+    * interpolate the other attributes.
+    */
+   this->wpos_w = vgrf(glsl_type::float_type);
+   w_bld.emit(FS_OPCODE_LINTERP, wpos_w, delta,
+              interp_reg(VARYING_SLOT_POS, 3));
+
+   /* Compute the pixel 1/W value from wpos.w. */
+   this->pixel_w = vgrf(glsl_type::float_type);
+   w_bld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
+}
+
+/**
+ * Emits the interpolation setup for the varying inputs (gen6 variant).
+ *
+ * Computes float pixel centers into pixel_x/pixel_y, takes pixel_w from the
+ * payload and computes wpos_w as its reciprocal, points delta_xy[] at the
+ * payload barycentric registers and, where the device needs it, patches
+ * centroid barycentrics of unlit channels.
+ */
+void
+fs_visitor::emit_interpolation_setup_gen6()
+{
+ struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
+
+ fs_builder abld = bld.annotate("compute pixel centers");
+ if (devinfo->gen >= 8 || dispatch_width == 8) {
+ /* The "Register Region Restrictions" page says for BDW (and newer,
+ * presumably):
+ *
+ * "When destination spans two registers, the source may be one or
+ * two registers. The destination elements must be evenly split
+ * between the two registers."
+ *
+ * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to
+ * compute our pixel centers.
+ */
+ fs_reg int_pixel_xy(VGRF, alloc.allocate(dispatch_width / 8),
+ BRW_REGISTER_TYPE_UW);
+
+ const fs_builder dbld = abld.exec_all().group(dispatch_width * 2, 0);
+ dbld.ADD(int_pixel_xy,
+ fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
+ fs_reg(brw_imm_v(0x11001010)));
+
+ this->pixel_x = vgrf(glsl_type::float_type);
+ this->pixel_y = vgrf(glsl_type::float_type);
+ abld.emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
+ abld.emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
+ } else {
+ /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
+ *
+ * "When destination spans two registers, the source MUST span two
+ * registers."
+ *
+ * Since the GRF source of the ADD will only read a single register, we
+ * must do two separate ADDs in SIMD16.
+ */
+ fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
+ fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
+ int_pixel_x.type = BRW_REGISTER_TYPE_UW;
+ int_pixel_y.type = BRW_REGISTER_TYPE_UW;
+ abld.ADD(int_pixel_x,
+ fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
+ fs_reg(brw_imm_v(0x10101010)));
+ abld.ADD(int_pixel_y,
+ fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
+ fs_reg(brw_imm_v(0x11001100)));
+
+ /* As of gen6, we can no longer mix float and int sources. We have
+ * to turn the integer pixel centers into floats for their actual
+ * use.
+ */
+ this->pixel_x = vgrf(glsl_type::float_type);
+ this->pixel_y = vgrf(glsl_type::float_type);
+ abld.MOV(this->pixel_x, int_pixel_x);
+ abld.MOV(this->pixel_y, int_pixel_y);
+ }
+
+ /* pixel_w is read straight from the payload; wpos_w is its reciprocal. */
+ abld = bld.annotate("compute pos.w");
+ this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
+ this->wpos_w = vgrf(glsl_type::float_type);
+ abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
+
+ /* Only the two centroid barycentric modes are candidates for the unlit
+ * centroid workaround below.
+ */
+ struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data);
+ uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes &
+ (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID |
+ 1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
+
+ for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
+ uint8_t reg = payload.barycentric_coord_reg[i];
+ this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0));
+
+ if (devinfo->needs_unlit_centroid_workaround &&
+ (centroid_modes & (1 << i))) {
+ /* Get the pixel/sample mask into f0 so that we know which
+ * pixels are lit. Then, for each channel that is unlit,
+ * replace the centroid data with non-centroid data.
+ */
+ bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+
+ /* Mode i - 1 is used as the replacement data; this relies on each
+ * centroid mode directly following its per-pixel counterpart in the
+ * barycentric mode enum (NOTE(review): enum not visible here).
+ */
+ uint8_t pixel_reg = payload.barycentric_coord_reg[i - 1];
+
+ set_predicate_inv(BRW_PREDICATE_NORMAL, true,
+ bld.half(0).MOV(brw_vec8_grf(reg, 0),
+ brw_vec8_grf(pixel_reg, 0)));
+ set_predicate_inv(BRW_PREDICATE_NORMAL, true,
+ bld.half(0).MOV(brw_vec8_grf(reg + 1, 0),
+ brw_vec8_grf(pixel_reg + 1, 0)));
+ if (dispatch_width == 16) {
+ set_predicate_inv(BRW_PREDICATE_NORMAL, true,
+ bld.half(1).MOV(brw_vec8_grf(reg + 2, 0),
+ brw_vec8_grf(pixel_reg + 2, 0)));
+ set_predicate_inv(BRW_PREDICATE_NORMAL, true,
+ bld.half(1).MOV(brw_vec8_grf(reg + 3, 0),
+ brw_vec8_grf(pixel_reg + 3, 0)));
+ }
+ assert(dispatch_width != 32); /* not implemented yet */
+ }
+ }
+}
+
+/**
+ * Translate a GL alpha-test comparison function into the matching EU
+ * conditional modifier.  GL_ALWAYS and GL_NEVER have no equivalent and must
+ * be handled by the caller.
+ */
+static enum brw_conditional_mod
+cond_for_alpha_func(GLenum func)
+{
+   switch (func) {
+   case GL_GREATER:  return BRW_CONDITIONAL_G;
+   case GL_GEQUAL:   return BRW_CONDITIONAL_GE;
+   case GL_LESS:     return BRW_CONDITIONAL_L;
+   case GL_LEQUAL:   return BRW_CONDITIONAL_LE;
+   case GL_EQUAL:    return BRW_CONDITIONAL_EQ;
+   case GL_NOTEQUAL: return BRW_CONDITIONAL_NEQ;
+   default:
+      unreachable("Not reached");
+   }
+}
+
+/**
+ * Alpha test support for when we compile it into the shader instead
+ * of using the normal fixed-function alpha test.
+ *
+ * Emits a predicated CMP on flag subregister 1; nothing is emitted for
+ * GL_ALWAYS.
+ */
+void
+fs_visitor::emit_alpha_test()
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   const brw_wm_prog_key *wm_key = (const brw_wm_prog_key *) this->key;
+
+   /* GL_ALWAYS passes every fragment: no instruction is needed. */
+   if (wm_key->alpha_test_func == GL_ALWAYS)
+      return;
+
+   const fs_builder abld = bld.annotate("Alpha test");
+
+   fs_inst *test;
+   if (wm_key->alpha_test_func != GL_NEVER) {
+      /* RT0 alpha */
+      const fs_reg alpha = offset(outputs[0], bld, 3);
+
+      /* f0.1 &= func(alpha, ref) */
+      test = abld.CMP(bld.null_reg_f(), alpha,
+                      brw_imm_f(wm_key->alpha_test_ref),
+                      cond_for_alpha_func(wm_key->alpha_test_func));
+   } else {
+      /* f0.1 = 0: a register compared against itself is never "not equal" */
+      const fs_reg any_reg = fs_reg(retype(brw_vec8_grf(0, 0),
+                                           BRW_REGISTER_TYPE_UW));
+      test = abld.CMP(bld.null_reg_f(), any_reg, any_reg,
+                      BRW_CONDITIONAL_NEQ);
+   }
+
+   test->predicate = BRW_PREDICATE_NORMAL;
+   test->flag_subreg = 1;
+}
+
+/**
+ * Emit one logical framebuffer write.
+ *
+ * \param bld         builder used to emit the instruction
+ * \param color0      primary color source
+ * \param color1      second color source
+ * \param src0_alpha  alpha source, may be an invalid register
+ * \param components  component count passed as the last logical source
+ * \return the emitted FS_OPCODE_FB_WRITE_LOGICAL instruction; it is
+ *         predicated on flag subregister 1 when the program uses kill
+ */
+fs_inst *
+fs_visitor::emit_single_fb_write(const fs_builder &bld,
+                                 fs_reg color0, fs_reg color1,
+                                 fs_reg src0_alpha, unsigned components)
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
+   const uint64_t written = nir->info->outputs_written;
+
+   /* Destination depth is taken from the payload when it is present. */
+   fs_reg dst_depth;
+   if (payload.dest_depth_reg)
+      dst_depth = fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0));
+
+   /* Hand over gl_FragDepth or the payload depth. */
+   fs_reg src_depth;
+   if (source_depth_to_render_target) {
+      src_depth = (written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
+         ? frag_depth
+         : fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
+   }
+
+   fs_reg src_stencil;
+   if (written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
+      src_stencil = frag_stencil;
+
+   fs_reg omask;
+   if (prog_data->uses_omask)
+      omask = sample_mask;
+
+   const fs_reg sources[] = {
+      color0, color1, src0_alpha, src_depth, dst_depth, src_stencil,
+      omask, brw_imm_ud(components)
+   };
+   assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS);
+
+   fs_inst *const fb_write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(),
+                                      sources, ARRAY_SIZE(sources));
+
+   if (prog_data->uses_kill) {
+      fb_write->flag_subreg = 1;
+      fb_write->predicate = BRW_PREDICATE_NORMAL;
+   }
+
+   return fb_write;
+}
+
+/**
+ * Emit the framebuffer writes for every written color region, or a single
+ * alpha-only write when no color write was emitted.  The last write emitted
+ * is marked end-of-thread.
+ */
+void
+fs_visitor::emit_fb_writes()
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
+   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+
+   const bool writes_stencil =
+      (nir->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) != 0;
+
+   if (devinfo->gen == 6 && source_depth_to_render_target) {
+      /* For outputting oDepth on gen6, SIMD8 writes have to be used.  This
+       * would require SIMD8 moves of each half to message regs, e.g. by using
+       * the SIMD lowering pass.  Unfortunately this is more difficult than it
+       * sounds because the SIMD8 single-source message lacks channel selects
+       * for the second and third subspans.
+       */
+      limit_dispatch_width(8, "Depth writes unsupported in SIMD16+ mode.\n");
+   }
+
+   if (writes_stencil) {
+      /* From the 'Render Target Write message' section of the docs:
+       * "Output Stencil is not supported with SIMD16 Render Target Write
+       * Messages."
+       */
+      limit_dispatch_width(8, "gl_FragStencilRefARB unsupported "
+                              "in SIMD16+ mode.\n");
+   }
+
+   fs_inst *last_write = NULL;
+
+   for (int target = 0; target < key->nr_color_regions; target++) {
+      const fs_reg &color = this->outputs[target];
+
+      /* Skip over outputs that weren't written. */
+      if (color.file == BAD_FILE)
+         continue;
+
+      const fs_builder abld = bld.annotate(
+         ralloc_asprintf(this->mem_ctx, "FB write target %d", target));
+
+      /* Targets other than 0 get RT0's alpha when alpha is replicated. */
+      fs_reg src0_alpha;
+      if (target != 0 && devinfo->gen >= 6 && key->replicate_alpha)
+         src0_alpha = offset(outputs[0], bld, 3);
+
+      last_write = emit_single_fb_write(abld, color, this->dual_src_output,
+                                        src0_alpha, 4);
+      last_write->target = target;
+   }
+
+   prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE);
+   assert(!prog_data->dual_src_blend || key->nr_color_regions == 1);
+
+   if (last_write == NULL) {
+      /* Even if there's no color buffers enabled, we still need to send
+       * alpha out the pipeline to our null renderbuffer to support
+       * alpha-testing, alpha-to-coverage, and so on.
+       */
+      /* FINISHME: Factor out this frequently recurring pattern into a
+       * helper function.
+       */
+      const fs_reg alpha_only[] = { reg_undef, reg_undef,
+                                    reg_undef, offset(this->outputs[0], bld, 3) };
+      const fs_reg color_payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+      bld.LOAD_PAYLOAD(color_payload, alpha_only, 4, 0);
+
+      last_write = emit_single_fb_write(bld, color_payload, reg_undef,
+                                        reg_undef, 4);
+      last_write->target = 0;
+   }
+
+   last_write->eot = true;
+}
+
+/**
+ * Allocate four uniform slots per user clip plane and point them at the
+ * plane coefficients; userplane[i] records the first uniform of plane i.
+ *
+ * \param clip_planes  plane coefficients, indexed [plane][component]
+ */
+void
+fs_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
+{
+   const struct brw_vs_prog_key *vs_key =
+      (const struct brw_vs_prog_key *) this->key;
+   const int num_planes = vs_key->nr_userclip_plane_consts;
+
+   for (int plane = 0; plane < num_planes; plane++) {
+      this->userplane[plane] = fs_reg(UNIFORM, uniforms);
+
+      /* One uniform per coefficient, consumed in order. */
+      for (int comp = 0; comp < 4; comp++) {
+         stage_prog_data->param[uniforms++] =
+            (gl_constant_value *) &clip_planes[plane][comp];
+      }
+   }
+}
+
+/**
+ * Lower legacy fixed-function and gl_ClipVertex clipping to clip distances.
+ *
+ * This does nothing if no user clip plane constants are enabled in the key,
+ * or if the vertex to clip against was not written.
+ *
+ * \param clip_planes  user clip plane coefficients, uploaded as uniforms
+ */
+void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
+{
+   const struct brw_vue_prog_data *vue_data = brw_vue_prog_data(prog_data);
+   const struct brw_vs_prog_key *vs_key =
+      (const struct brw_vs_prog_key *) this->key;
+   const int num_planes = vs_key->nr_userclip_plane_consts;
+
+   /* Bail unless some sort of legacy clipping is enabled */
+   if (num_planes == 0)
+      return;
+
+   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
+    *
+    *   "If a linked set of shaders forming the vertex stage contains no
+    *    static write to gl_ClipVertex or gl_ClipDistance, but the
+    *    application has requested clipping against user clip planes through
+    *    the API, then the coordinate written to gl_Position is used for
+    *    comparison against the user clip planes."
+    *
+    * This function is only called if the shader didn't write to
+    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
+    * if the user wrote to it; otherwise we use gl_Position.
+    */
+   const gl_varying_slot clip_vertex =
+      (vue_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)
+         ? VARYING_SLOT_CLIP_VERTEX
+         : VARYING_SLOT_POS;
+
+   /* If the clip vertex isn't written, skip this.  Typically this means
+    * the GS will set up clipping.
+    */
+   const fs_reg &vertex = outputs[clip_vertex];
+   if (vertex.file == BAD_FILE)
+      return;
+
+   setup_uniform_clipplane_values(clip_planes);
+
+   const fs_builder abld = bld.annotate("user clip distances");
+
+   this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
+   this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
+
+   for (int plane = 0; plane < num_planes; plane++) {
+      /* Four distances per CLIP_DIST slot. */
+      const fs_reg dist = offset(outputs[VARYING_SLOT_CLIP_DIST0 + plane / 4],
+                                 bld, plane % 4);
+
+      /* Dot product of the vertex with the plane: one MUL, then three MADs
+       * stepping through the plane's consecutive uniforms.
+       */
+      fs_reg coeff = userplane[plane];
+      abld.MUL(dist, vertex, coeff);
+      for (int comp = 1; comp < 4; comp++) {
+         coeff.nr = userplane[plane].nr + comp;
+         abld.MAD(dist, dist, offset(vertex, bld, comp), coeff);
+      }
+   }
+}
+
+/**
+ * Emit the URB write messages that store this stage's outputs.
+ *
+ * Walks the VUE map slot by slot, queueing four registers per written slot
+ * and flushing a SHADER_OPCODE_URB_WRITE_SIMD8 (or _PER_SLOT for geometry
+ * shaders) message whenever eight registers are queued, an unwritten slot
+ * interrupts the run, or the last written slot is reached.  The message for
+ * the last slot carries EOT except in geometry shaders.
+ *
+ * \param gs_vertex_count  only read for geometry shaders, where it is scaled
+ *                         by the output vertex size to form the per-slot
+ *                         offsets
+ */
+void
+fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
+{
+ int slot, urb_offset, length;
+ int starting_urb_offset = 0;
+ const struct brw_vue_prog_data *vue_prog_data =
+ brw_vue_prog_data(this->prog_data);
+ const struct brw_vs_prog_key *vs_key =
+ (const struct brw_vs_prog_key *) this->key;
+ const GLbitfield64 psiz_mask =
+ VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
+ const struct brw_vue_map *vue_map = &vue_prog_data->vue_map;
+ bool flush;
+ fs_reg sources[8];
+ fs_reg urb_handle;
+
+ if (stage == MESA_SHADER_TESS_EVAL)
+ urb_handle = fs_reg(retype(brw_vec8_grf(4, 0), BRW_REGISTER_TYPE_UD));
+ else
+ urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+
+ /* If we don't have any valid slots to write, just do a minimal urb write
+ * send to terminate the shader. This includes 1 slot of undefined data,
+ * because it's invalid to write 0 data:
+ *
+ * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
+ * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
+ * Write Data Payload:
+ *
+ * "The write data payload can be between 1 and 8 message phases long."
+ */
+ if (vue_map->slots_valid == 0) {
+ /* For GS, just turn EmitVertex() into a no-op. We don't want it to
+ * end the thread, and emit_gs_thread_end() already emits a SEND with
+ * EOT at the end of the program for us.
+ */
+ if (stage == MESA_SHADER_GEOMETRY)
+ return;
+
+ fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
+ bld.exec_all().MOV(payload, urb_handle);
+
+ fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+ inst->eot = true;
+ inst->mlen = 2;
+ inst->offset = 1;
+ return;
+ }
+
+ opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
+ int header_size = 1;
+ fs_reg per_slot_offsets;
+
+ if (stage == MESA_SHADER_GEOMETRY) {
+ const struct brw_gs_prog_data *gs_prog_data =
+ brw_gs_prog_data(this->prog_data);
+
+ /* We need to increment the Global Offset to skip over the control data
+ * header and the extra "Vertex Count" field (1 HWord) at the beginning
+ * of the VUE. We're counting in OWords, so the units are doubled.
+ */
+ starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
+ if (gs_prog_data->static_vertex_count == -1)
+ starting_urb_offset += 2;
+
+ /* We also need to use per-slot offsets. The per-slot offset is the
+ * Vertex Count. SIMD8 mode processes 8 different primitives at a
+ * time; each may output a different number of vertices.
+ */
+ opcode = SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT;
+ header_size++;
+
+ /* The URB offset is in 128-bit units, so we need to multiply by 2 */
+ const int output_vertex_size_owords =
+ gs_prog_data->output_vertex_size_hwords * 2;
+
+ if (gs_vertex_count.file == IMM) {
+ per_slot_offsets = brw_imm_ud(output_vertex_size_owords *
+ gs_vertex_count.ud);
+ } else {
+ per_slot_offsets = vgrf(glsl_type::int_type);
+ bld.MUL(per_slot_offsets, gs_vertex_count,
+ brw_imm_ud(output_vertex_size_owords));
+ }
+ }
+
+ length = 0;
+ urb_offset = starting_urb_offset;
+ flush = false;
+
+ /* SSO shaders can have VUE slots allocated which are never actually
+ * written to, so ignore them when looking for the last (written) slot.
+ */
+ int last_slot = vue_map->num_slots - 1;
+ while (last_slot > 0 &&
+ (vue_map->slot_to_varying[last_slot] == BRW_VARYING_SLOT_PAD ||
+ outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) {
+ last_slot--;
+ }
+
+ for (slot = 0; slot < vue_map->num_slots; slot++) {
+ int varying = vue_map->slot_to_varying[slot];
+ switch (varying) {
+ case VARYING_SLOT_PSIZ: {
+ /* The point size varying slot is the vue header and is always in the
+ * vue map. But often none of the special varyings that live there
+ * are written and in that case we can skip writing to the vue
+ * header, provided the corresponding state properly clamps the
+ * values further down the pipeline. */
+ if ((vue_map->slots_valid & psiz_mask) == 0) {
+ assert(length == 0);
+ urb_offset++;
+ break;
+ }
+
+ fs_reg zero(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+ bld.MOV(zero, brw_imm_ud(0u));
+
+ /* Header layout queued here: zero, layer, viewport, point size;
+ * fields that were not written are filled with zero.
+ */
+ sources[length++] = zero;
+ if (vue_map->slots_valid & VARYING_BIT_LAYER)
+ sources[length++] = this->outputs[VARYING_SLOT_LAYER];
+ else
+ sources[length++] = zero;
+
+ if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
+ sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
+ else
+ sources[length++] = zero;
+
+ if (vue_map->slots_valid & VARYING_BIT_PSIZ)
+ sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
+ else
+ sources[length++] = zero;
+ break;
+ }
+ case BRW_VARYING_SLOT_NDC:
+ case VARYING_SLOT_EDGE:
+ unreachable("unexpected scalar vs output");
+ break;
+
+ default:
+ /* gl_Position is always in the vue map, but isn't always written by
+ * the shader. Other varyings (clip distances) get added to the vue
+ * map but don't always get written. In those cases, the
+ * corresponding this->outputs[] slot will be invalid and we can skip
+ * the urb write for the varying. If we've already queued up a vue
+ * slot for writing we flush a mlen 5 urb write, otherwise we just
+ * advance the urb_offset.
+ */
+ if (varying == BRW_VARYING_SLOT_PAD ||
+ this->outputs[varying].file == BAD_FILE) {
+ if (length > 0)
+ flush = true;
+ else
+ urb_offset++;
+ break;
+ }
+
+ if (stage == MESA_SHADER_VERTEX && vs_key->clamp_vertex_color &&
+ (varying == VARYING_SLOT_COL0 ||
+ varying == VARYING_SLOT_COL1 ||
+ varying == VARYING_SLOT_BFC0 ||
+ varying == VARYING_SLOT_BFC1)) {
+ /* We need to clamp these guys, so do a saturating MOV into a
+ * temp register and use that for the payload.
+ */
+ for (int i = 0; i < 4; i++) {
+ fs_reg reg = fs_reg(VGRF, alloc.allocate(1), outputs[varying].type);
+ fs_reg src = offset(this->outputs[varying], bld, i);
+ set_saturate(true, bld.MOV(reg, src));
+ sources[length++] = reg;
+ }
+ } else {
+ for (unsigned i = 0; i < 4; i++)
+ sources[length++] = offset(this->outputs[varying], bld, i);
+ }
+ break;
+ }
+
+ const fs_builder abld = bld.annotate("URB write");
+
+ /* If we've queued up 8 registers of payload (2 VUE slots), if this is
+ * the last slot or if we need to flush (see BAD_FILE varying case
+ * above), emit a URB write send now to flush out the data.
+ */
+ if (length == 8 || slot == last_slot)
+ flush = true;
+ if (flush) {
+ /* Payload = header (URB handle, plus per-slot offsets for the
+ * per-slot opcode) followed by the queued data registers.
+ */
+ fs_reg *payload_sources =
+ ralloc_array(mem_ctx, fs_reg, length + header_size);
+ fs_reg payload = fs_reg(VGRF, alloc.allocate(length + header_size),
+ BRW_REGISTER_TYPE_F);
+ payload_sources[0] = urb_handle;
+
+ if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT)
+ payload_sources[1] = per_slot_offsets;
+
+ memcpy(&payload_sources[header_size], sources,
+ length * sizeof sources[0]);
+
+ abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size,
+ header_size);
+
+ fs_inst *inst = abld.emit(opcode, reg_undef, payload);
+ inst->eot = slot == last_slot && stage != MESA_SHADER_GEOMETRY;
+ inst->mlen = length + header_size;
+ inst->offset = urb_offset;
+ /* The next message starts at the slot following the one just
+ * processed.
+ */
+ urb_offset = starting_urb_offset + slot + 1;
+ length = 0;
+ flush = false;
+ }
+ }
+}
+
+/**
+ * Emit the end-of-thread message that terminates a compute shader thread.
+ */
+void
+fs_visitor::emit_cs_terminate()
+{
+   assert(devinfo->gen >= 7);
+
+   /* We are getting the thread ID from the compute shader header */
+   assert(stage == MESA_SHADER_COMPUTE);
+
+   /* We can't directly send from g0, since sends with EOT have to use
+    * g112-127.  So, copy it to a virtual register; the register allocator
+    * will make sure it uses the appropriate register range.
+    */
+   const struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
+   const fs_reg g0_copy(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+   bld.group(8, 0).exec_all().MOV(g0_copy, g0);
+
+   /* Send a message to the thread spawner to terminate the thread. */
+   fs_inst *const terminate =
+      bld.exec_all().emit(CS_OPCODE_CS_TERMINATE, reg_undef, g0_copy);
+   terminate->eot = true;
+}
+
+/**
+ * Emit a compute shader barrier: build a payload carrying the barrier ID
+ * taken from r0.2 and emit SHADER_OPCODE_BARRIER with it.
+ */
+void
+fs_visitor::emit_barrier()
+{
+   assert(devinfo->gen >= 7);
+
+   /* The barrier ID field mask differs from gen9 onwards. */
+   uint32_t barrier_id_mask;
+   if (devinfo->gen >= 9)
+      barrier_id_mask = 0x8f000000u;
+   else
+      barrier_id_mask = 0x0f000000u;
+
+   /* We are getting the barrier ID from the compute shader header */
+   assert(stage == MESA_SHADER_COMPUTE);
+
+   const fs_reg msg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+   const fs_builder pbld = bld.exec_all().group(8, 0);
+
+   /* Clear the message payload */
+   pbld.MOV(msg, brw_imm_ud(0u));
+
+   /* Copy the barrier id from r0.2 to the message payload reg.2 */
+   const fs_reg r0_2 =
+      fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD));
+   pbld.AND(component(msg, 2), r0_2, brw_imm_ud(barrier_id_mask));
+
+   /* Emit a gateway "barrier" message using the payload we set up, followed
+    * by a wait instruction.
+    */
+   bld.exec_all().emit(SHADER_OPCODE_BARRIER, reg_undef, msg);
+}
+
+/**
+ * General-purpose constructor: stores the key, program data, program and
+ * input VUE map, creates the builder for the requested dispatch width and
+ * runs init().  gs_compile is left NULL.
+ */
+fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
+ void *mem_ctx,
+ const void *key,
+ struct brw_stage_prog_data *prog_data,
+ struct gl_program *prog,
+ const nir_shader *shader,
+ unsigned dispatch_width,
+ int shader_time_index,
+ const struct brw_vue_map *input_vue_map)
+ : backend_shader(compiler, log_data, mem_ctx, shader, prog_data),
+ key(key), gs_compile(NULL), prog_data(prog_data), prog(prog),
+ input_vue_map(input_vue_map),
+ dispatch_width(dispatch_width),
+ shader_time_index(shader_time_index),
+ bld(fs_builder(this, dispatch_width).at_end())
+{
+ init();
+}
+
+/**
+ * Geometry shader constructor: takes the key from the GS compile state,
+ * always uses a dispatch width of 8 and has no gl_program.
+ *
+ * input_vue_map is explicitly set to NULL here so that the member never
+ * holds an indeterminate pointer; the other constructor receives it as an
+ * argument and init() does not touch it.
+ */
+fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
+                       void *mem_ctx,
+                       struct brw_gs_compile *c,
+                       struct brw_gs_prog_data *prog_data,
+                       const nir_shader *shader,
+                       int shader_time_index)
+   : backend_shader(compiler, log_data, mem_ctx, shader,
+                    &prog_data->base.base),
+     key(&c->key), gs_compile(c),
+     prog_data(&prog_data->base.base), prog(NULL),
+     input_vue_map(NULL),
+     dispatch_width(8),
+     shader_time_index(shader_time_index),
+     bld(fs_builder(this, dispatch_width).at_end())
+{
+   init();
+}
+
+
+/**
+ * Shared constructor tail: selects the per-stage texture key, derives the
+ * minimum dispatch width and resets the visitor's bookkeeping state.
+ */
+void
+fs_visitor::init()
+{
+   /* Every stage's program key embeds its sampler key under the name "tex". */
+   switch (stage) {
+   case MESA_SHADER_FRAGMENT:
+      key_tex = &((const brw_wm_prog_key *) key)->tex;
+      break;
+   case MESA_SHADER_VERTEX:
+      key_tex = &((const brw_vs_prog_key *) key)->tex;
+      break;
+   case MESA_SHADER_TESS_CTRL:
+      key_tex = &((const brw_tcs_prog_key *) key)->tex;
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      key_tex = &((const brw_tes_prog_key *) key)->tex;
+      break;
+   case MESA_SHADER_GEOMETRY:
+      key_tex = &((const brw_gs_prog_key *) key)->tex;
+      break;
+   case MESA_SHADER_COMPUTE:
+      key_tex = &((const brw_cs_prog_key*) key)->tex;
+      break;
+   default:
+      unreachable("unhandled shader stage");
+   }
+
+   if (stage == MESA_SHADER_COMPUTE) {
+      /* The local workgroup is spread over at most max_cs_threads threads,
+       * so each thread has to cover at least this many invocations; round
+       * that up to a supported dispatch width.
+       */
+      const struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
+      unsigned size = cs_prog_data->local_size[0] *
+                      cs_prog_data->local_size[1] *
+                      cs_prog_data->local_size[2];
+      size = DIV_ROUND_UP(size, devinfo->max_cs_threads);
+      min_dispatch_width = size > 16 ? 32 : (size > 8 ? 16 : 8);
+   } else {
+      min_dispatch_width = 8;
+   }
+
+   this->max_dispatch_width = 32;
+   this->prog_data = this->stage_prog_data;
+
+   this->failed = false;
+
+   this->nir_locals = NULL;
+   this->nir_ssa_values = NULL;
+
+   memset(&this->payload, 0, sizeof(this->payload));
+   this->source_depth_to_render_target = false;
+   this->runtime_check_aads_emit = false;
+   this->first_non_payload_grf = 0;
+   this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
+
+   this->virtual_grf_start = NULL;
+   this->virtual_grf_end = NULL;
+   this->live_intervals = NULL;
+   this->regs_live_at_ip = NULL;
+
+   this->uniforms = 0;
+   this->last_scratch = 0;
+   this->pull_constant_loc = NULL;
+   this->push_constant_loc = NULL;
+
+   /* Terminated with ';' (was ','), so that this no longer forms a single
+    * comma expression with the assignment that follows.
+    */
+   this->promoted_constants = 0;
+
+   this->spilled_any_registers = false;
+}
+
+/* The destructor body is empty: nothing is released explicitly here. */
+fs_visitor::~fs_visitor()
+{
+}
diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h
new file mode 100644
index 00000000000..a0b8fb66dd6
--- /dev/null
+++ b/src/intel/compiler/brw_inst.h
@@ -0,0 +1,866 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file brw_inst.h
+ *
+ * A representation of i965 EU assembly instructions, with helper methods to
+ * get and set various fields. This is the actual hardware format.
+ */
+
+#ifndef BRW_INST_H
+#define BRW_INST_H
+
+#include <assert.h>
+#include <stdint.h>
+
+#include "brw_eu_defines.h"
+#include "common/gen_device_info.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* One native instruction, stored as two 64-bit words; the accessors in this
+ * header address it with bit indices 0..127.
+ *
+ * brw_context.h has a forward declaration of brw_inst, so name the struct.
+ *
+ * brw_inst_bits() and brw_inst_set_bits() are only declared here because the
+ * field accessor macros that follow call them before their definitions,
+ * which appear further down in this header.
+ */
+typedef struct brw_inst {
+ uint64_t data[2];
+} brw_inst;
+
+static inline uint64_t brw_inst_bits(const brw_inst *inst,
+ unsigned high, unsigned low);
+static inline void brw_inst_set_bits(brw_inst *inst,
+ unsigned high, unsigned low,
+ uint64_t value);
+
+/* Define brw_inst_<name>() and brw_inst_set_<name>() for the field that
+ * occupies bits [high:low].  "assertions" is an expression checked with
+ * assert() on every access; it may refer to devinfo.
+ */
+#define FC(name, high, low, assertions)                       \
+static inline uint64_t                                        \
+brw_inst_##name(const struct gen_device_info *devinfo,        \
+                const brw_inst *inst)                         \
+{                                                             \
+   assert(assertions);                                        \
+   (void) devinfo;                                            \
+   return brw_inst_bits(inst, high, low);                     \
+}                                                             \
+static inline void                                            \
+brw_inst_set_##name(const struct gen_device_info *devinfo,    \
+                    brw_inst *inst, uint64_t value)           \
+{                                                             \
+   assert(assertions);                                        \
+   (void) devinfo;                                            \
+   brw_inst_set_bits(inst, high, low, value);                 \
+}
+
+/* Shorthand for fields that sit at the same bits on all generations and so
+ * need no generation check.
+ */
+#define F(name, high, low) FC(name, high, low, true)
+
+/* Declares the locals "high" and "low" and sets them to the bit range that
+ * applies to devinfo: Gen8+, Gen7, Gen6, Gen5, G4X (devinfo->is_g4x) or
+ * original Gen4, tested in that order.  A bound of -1 marks a generation on
+ * which the field does not exist; the final assert rejects such accesses.
+ */
+#define BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8) \
+ unsigned high, low; \
+ if (devinfo->gen >= 8) { \
+ high = hi8; low = lo8; \
+ } else if (devinfo->gen >= 7) { \
+ high = hi7; low = lo7; \
+ } else if (devinfo->gen >= 6) { \
+ high = hi6; low = lo6; \
+ } else if (devinfo->gen >= 5) { \
+ high = hi5; low = lo5; \
+ } else if (devinfo->is_g4x) { \
+ high = hi45; low = lo45; \
+ } else { \
+ high = hi4; low = lo4; \
+ } \
+ assert(((int) high) != -1 && ((int) low) != -1); \
+
+/* A general macro for cases where the field has moved to several different
+ * bit locations across generations. GCC appears to combine cases where the
+ * bits are identical, removing some of the inefficiency.
+ *
+ * Defines brw_inst_set_<name>() and brw_inst_<name>(); both use BOUNDS() to
+ * pick the bit range for the running generation.
+ */
+#define FF(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8)\
+static inline void \
+brw_inst_set_##name(const struct gen_device_info *devinfo, \
+ brw_inst *inst, uint64_t value) \
+{ \
+ BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8) \
+ brw_inst_set_bits(inst, high, low, value); \
+} \
+static inline uint64_t \
+brw_inst_##name(const struct gen_device_info *devinfo, const brw_inst *inst) \
+{ \
+ BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8) \
+ return brw_inst_bits(inst, high, low); \
+}
+
+/* A macro for fields which moved as of Gen8+: Gen4 through Gen7 share one
+ * bit range and Gen8+ uses another.
+ *
+ * There is deliberately no trailing semicolon: FF() expands to function
+ * definitions, so a ';' here would leave a stray empty declaration at file
+ * scope after every F8() use.
+ */
+#define F8(name, gen4_high, gen4_low, gen8_high, gen8_low) \
+FF(name,                                                    \
+   /* 4:   */ gen4_high, gen4_low,                          \
+   /* 4.5: */ gen4_high, gen4_low,                          \
+   /* 5:   */ gen4_high, gen4_low,                          \
+   /* 6:   */ gen4_high, gen4_low,                          \
+   /* 7:   */ gen4_high, gen4_low,                          \
+   /* 8:   */ gen8_high, gen8_low)
+
+/* Instruction fields: source and destination regions, register files and
+ * types, and the instruction control bits.  Every entry defines a
+ * brw_inst_<name>() getter and a brw_inst_set_<name>() setter for the listed
+ * [high, low] bit range.  F8/FF entries list one range per generation, with
+ * -1 marking generations on which the field does not exist; FC entries
+ * assert the given generation condition.  Several names share the same bits.
+ */
+F(src1_vstride, 120, 117)
+F(src1_width, 116, 114)
+F(src1_da16_swiz_w, 115, 114)
+F(src1_da16_swiz_z, 113, 112)
+F(src1_hstride, 113, 112)
+F(src1_address_mode, 111, 111)
+/** Src1.SrcMod @{ */
+F(src1_negate, 110, 110)
+F(src1_abs, 109, 109)
+/** @} */
+F8(src1_ia_subreg_nr, /* 4+ */ 108, 106, /* 8+ */ 108, 105)
+F(src1_da_reg_nr, 108, 101)
+F(src1_da16_subreg_nr, 100, 100)
+F(src1_da1_subreg_nr, 100, 96)
+F(src1_da16_swiz_y, 99, 98)
+F(src1_da16_swiz_x, 97, 96)
+F8(src1_reg_type, /* 4+ */ 46, 44, /* 8+ */ 94, 91)
+F8(src1_reg_file, /* 4+ */ 43, 42, /* 8+ */ 90, 89)
+F(src0_vstride, 88, 85)
+F(src0_width, 84, 82)
+F(src0_da16_swiz_w, 83, 82)
+F(src0_da16_swiz_z, 81, 80)
+F(src0_hstride, 81, 80)
+F(src0_address_mode, 79, 79)
+/** Src0.SrcMod @{ */
+F(src0_negate, 78, 78)
+F(src0_abs, 77, 77)
+/** @} */
+F8(src0_ia_subreg_nr, /* 4+ */ 76, 74, /* 8+ */ 76, 73)
+F(src0_da_reg_nr, 76, 69)
+F(src0_da16_subreg_nr, 68, 68)
+F(src0_da1_subreg_nr, 68, 64)
+F(src0_da16_swiz_y, 67, 66)
+F(src0_da16_swiz_x, 65, 64)
+F(dst_address_mode, 63, 63)
+F(dst_hstride, 62, 61)
+F8(dst_ia_subreg_nr, /* 4+ */ 60, 58, /* 8+ */ 60, 57)
+F(dst_da_reg_nr, 60, 53)
+F(dst_da16_subreg_nr, 52, 52)
+F(dst_da1_subreg_nr, 52, 48)
+F(da16_writemask, 51, 48) /* Dst.ChanEn */
+F8(src0_reg_type, /* 4+ */ 41, 39, /* 8+ */ 46, 43)
+F8(src0_reg_file, /* 4+ */ 38, 37, /* 8+ */ 42, 41)
+F8(dst_reg_type, /* 4+ */ 36, 34, /* 8+ */ 40, 37)
+F8(dst_reg_file, /* 4+ */ 33, 32, /* 8+ */ 36, 35)
+F8(mask_control, /* 4+ */ 9, 9, /* 8+ */ 34, 34)
+FF(flag_reg_nr,
+ /* 4-6: doesn't exist */ -1, -1, -1, -1, -1, -1, -1, -1,
+ /* 7: */ 90, 90,
+ /* 8: */ 33, 33)
+F8(flag_subreg_nr, /* 4+ */ 89, 89, /* 8+ */ 32, 32)
+F(saturate, 31, 31)
+F(debug_control, 30, 30)
+F(cmpt_control, 29, 29)
+FC(branch_control, 28, 28, devinfo->gen >= 8)
+FC(acc_wr_control, 28, 28, devinfo->gen >= 6)
+FC(mask_control_ex, 28, 28, devinfo->is_g4x || devinfo->gen == 5)
+F(cond_modifier, 27, 24)
+FC(math_function, 27, 24, devinfo->gen >= 6)
+F(exec_size, 23, 21)
+F(pred_inv, 20, 20)
+F(pred_control, 19, 16)
+F(thread_control, 15, 14)
+F(qtr_control, 13, 12)
+FF(nib_control,
+ /* 4-6: doesn't exist */ -1, -1, -1, -1, -1, -1, -1, -1,
+ /* 7: */ 47, 47,
+ /* 8: */ 11, 11)
+F8(no_dd_check, /* 4+ */ 11, 11, /* 8+ */ 10, 10)
+F8(no_dd_clear, /* 4+ */ 10, 10, /* 8+ */ 9, 9)
+F(access_mode, 8, 8)
+/* Bit 7 is Reserved (for future Opcode expansion) */
+F(opcode, 6, 0)
+
+/**
+ * Three-source instructions:
+ *
+ * Accessors are named brw_inst_3src_<field>() and
+ * brw_inst_set_3src_<field>().  F8 entries list the pre-Gen8 bit range first
+ * and the Gen8+ range second.
+ * @{
+ */
+F(3src_src2_reg_nr, 125, 118)
+F(3src_src2_subreg_nr, 117, 115) /* Extra discontiguous bit on CHV? */
+F(3src_src2_swizzle, 114, 107)
+F(3src_src2_rep_ctrl, 106, 106)
+F(3src_src1_reg_nr, 104, 97)
+F(3src_src1_subreg_nr, 96, 94) /* Extra discontiguous bit on CHV? */
+F(3src_src1_swizzle, 93, 86)
+F(3src_src1_rep_ctrl, 85, 85)
+F(3src_src0_reg_nr, 83, 76)
+F(3src_src0_subreg_nr, 75, 73) /* Extra discontiguous bit on CHV? */
+F(3src_src0_swizzle, 72, 65)
+F(3src_src0_rep_ctrl, 64, 64)
+F(3src_dst_reg_nr, 63, 56)
+F(3src_dst_subreg_nr, 55, 53)
+F(3src_dst_writemask, 52, 49)
+F8(3src_nib_ctrl, 47, 47, 11, 11) /* only exists on IVB+ */
+F8(3src_dst_type, 45, 44, 48, 46) /* only exists on IVB+ */
+F8(3src_src_type, 43, 42, 45, 43)
+F8(3src_src2_negate, 41, 41, 42, 42)
+F8(3src_src2_abs, 40, 40, 41, 41)
+F8(3src_src1_negate, 39, 39, 40, 40)
+F8(3src_src1_abs, 38, 38, 39, 39)
+F8(3src_src0_negate, 37, 37, 38, 38)
+F8(3src_src0_abs, 36, 36, 37, 37)
+F8(3src_flag_reg_nr, 34, 34, 33, 33)
+F8(3src_flag_subreg_nr, 33, 33, 32, 32)
+FF(3src_dst_reg_file,
+ /* 4-5: doesn't exist - no 3-source instructions */ -1, -1, -1, -1, -1, -1,
+ /* 6: */ 32, 32,
+ /* 7-8: doesn't exist - no MRFs */ -1, -1, -1, -1)
+F(3src_saturate, 31, 31)
+F(3src_debug_control, 30, 30)
+F(3src_cmpt_control, 29, 29)
+F(3src_acc_wr_control, 28, 28)
+F(3src_cond_modifier, 27, 24)
+F(3src_exec_size, 23, 21)
+F(3src_pred_inv, 20, 20)
+F(3src_pred_control, 19, 16)
+F(3src_thread_control, 15, 14)
+F(3src_qtr_control, 13, 12)
+F8(3src_no_dd_check, 11, 11, 10, 10)
+F8(3src_no_dd_clear, 10, 10, 9, 9)
+F8(3src_mask_control, 9, 9, 34, 34)
+F(3src_access_mode, 8, 8)
+/* Bit 7 is Reserved (for future Opcode expansion) */
+F(3src_opcode, 6, 0)
+/** @} */
+
+/**
+ * Flow control instruction bits:
+ * @{
+ */
+/**
+ * Store the UIP field.  Only valid on Gen6+.
+ *
+ * Gen8+ keeps it as a 32-bit value in bits 95:64.  Earlier generations keep
+ * it in the 16-bit field 127:112, which brw_inst_uip() reads back through
+ * int16_t, so the value has to fit a signed 16-bit integer.  This is the
+ * same range brw_inst_set_jip() enforces; the previous bounds of +/-(1 << 16)
+ * let through values that could not be read back unchanged.
+ */
+static inline void
+brw_inst_set_uip(const struct gen_device_info *devinfo,
+                 brw_inst *inst, int32_t value)
+{
+   assert(devinfo->gen >= 6);
+
+   if (devinfo->gen >= 8) {
+      brw_inst_set_bits(inst, 95, 64, (uint32_t)value);
+   } else {
+      assert(value <= (1 << 15) - 1);
+      assert(value >= -(1 << 15));
+      brw_inst_set_bits(inst, 127, 112, (uint16_t)value);
+   }
+}
+
+/**
+ * Read the UIP field.  Only valid on Gen6+.
+ *
+ * Before Gen8 the field is the 16 bits at 127:112, sign-extended through
+ * int16_t; on Gen8+ it is the 32 bits at 95:64.
+ */
+static inline int32_t
+brw_inst_uip(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   assert(devinfo->gen >= 6);
+
+   if (devinfo->gen < 8)
+      return (int16_t)brw_inst_bits(inst, 127, 112);
+
+   return brw_inst_bits(inst, 95, 64);
+}
+
+/**
+ * Store the JIP field.  Only valid on Gen6+.
+ *
+ * Gen8+ writes all 32 bits to 127:96.  Earlier generations write 16 bits to
+ * 111:96 and assert that the value fits a signed 16-bit integer.
+ */
+static inline void
+brw_inst_set_jip(const struct gen_device_info *devinfo,
+                 brw_inst *inst, int32_t value)
+{
+   assert(devinfo->gen >= 6);
+
+   if (devinfo->gen >= 8) {
+      brw_inst_set_bits(inst, 127, 96, (uint32_t)value);
+      return;
+   }
+
+   assert(value >= -(1 << 15));
+   assert(value <= (1 << 15) - 1);
+   brw_inst_set_bits(inst, 111, 96, (uint16_t)value);
+}
+
+/**
+ * Read the JIP field.  Only valid on Gen6+.
+ *
+ * Before Gen8 the field is the 16 bits at 111:96, sign-extended through
+ * int16_t; on Gen8+ it is the 32 bits at 127:96.
+ */
+static inline int32_t
+brw_inst_jip(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   assert(devinfo->gen >= 6);
+
+   if (devinfo->gen < 8)
+      return (int16_t)brw_inst_bits(inst, 111, 96);
+
+   return brw_inst_bits(inst, 127, 96);
+}
+
+/** Like FC, but using int16_t to handle negative jump targets: the setter
+ * takes an int16_t and stores it as uint16_t, and the getter converts the
+ * stored bits back to int16_t.
+ */
+#define FJ(name, high, low, assertions)                                      \
+static inline int16_t                                                        \
+brw_inst_##name(const struct gen_device_info *devinfo, const brw_inst *inst) \
+{                                                                            \
+   assert(assertions);                                                       \
+   (void) devinfo;                                                           \
+   return brw_inst_bits(inst, high, low);                                    \
+}                                                                            \
+static inline void                                                           \
+brw_inst_set_##name(const struct gen_device_info *devinfo, brw_inst *inst,   \
+                    int16_t value)                                           \
+{                                                                            \
+   assert(assertions);                                                       \
+   (void) devinfo;                                                           \
+   brw_inst_set_bits(inst, high, low, (uint16_t) value);                     \
+}
+
+FJ(gen6_jump_count, 63, 48, devinfo->gen == 6)
+FJ(gen4_jump_count, 111, 96, devinfo->gen < 6)
+FC(gen4_pop_count, 115, 112, devinfo->gen < 6)
+/** @} */
+
+/* Message descriptor bits: MD(x) converts bit x of the message descriptor
+ * into an instruction bit index by adding 96.
+ */
+#define MD(x) ((x) + 96)
+
+/**
+ * Fields for SEND messages:
+ *
+ * FF entries list one [high, low] range per generation; -1 marks a
+ * generation on which the field does not exist.
+ * @{
+ */
+F(eot, 127, 127)
+FF(mlen,
+ /* 4: */ 119, 116,
+ /* 4.5: */ 119, 116,
+ /* 5: */ 124, 121,
+ /* 6: */ 124, 121,
+ /* 7: */ 124, 121,
+ /* 8: */ 124, 121);
+FF(rlen,
+ /* 4: */ 115, 112,
+ /* 4.5: */ 115, 112,
+ /* 5: */ 120, 116,
+ /* 6: */ 120, 116,
+ /* 7: */ 120, 116,
+ /* 8: */ 120, 116);
+FF(header_present,
+ /* 4: doesn't exist */ -1, -1, -1, -1,
+ /* 5: */ 115, 115,
+ /* 6: */ 115, 115,
+ /* 7: */ 115, 115,
+ /* 8: */ 115, 115)
+F(gateway_notify, MD(16), MD(15))
+FF(function_control,
+ /* 4: */ 111, 96,
+ /* 4.5: */ 111, 96,
+ /* 5: */ 114, 96,
+ /* 6: */ 114, 96,
+ /* 7: */ 114, 96,
+ /* 8: */ 114, 96)
+FF(gateway_subfuncid,
+ /* 4: */ MD(1), MD(0),
+ /* 4.5: */ MD(1), MD(0),
+ /* 5: */ MD(1), MD(0), /* 2:0, but bit 2 is reserved MBZ */
+ /* 6: */ MD(2), MD(0),
+ /* 7: */ MD(2), MD(0),
+ /* 8: */ MD(2), MD(0))
+FF(sfid,
+ /* 4: */ 123, 120, /* called msg_target */
+ /* 4.5 */ 123, 120,
+ /* 5: */ 95, 92,
+ /* 6: */ 27, 24,
+ /* 7: */ 27, 24,
+ /* 8: */ 27, 24)
+FC(base_mrf, 27, 24, devinfo->gen < 6);
+/** @} */
+
+/**
+ * URB message function control bits:
+ *
+ * Positions are message descriptor bits, written with MD().  FC entries
+ * assert the generation range in which the field applies; the -1 entries
+ * mark generations on which the field does not exist.
+ * @{
+ */
+FF(urb_per_slot_offset,
+ /* 4-6: */ -1, -1, -1, -1, -1, -1, -1, -1,
+ /* 7: */ MD(16), MD(16),
+ /* 8: */ MD(17), MD(17))
+FC(urb_channel_mask_present, MD(15), MD(15), devinfo->gen >= 8)
+FC(urb_complete, MD(15), MD(15), devinfo->gen < 8)
+FC(urb_used, MD(14), MD(14), devinfo->gen < 7)
+FC(urb_allocate, MD(13), MD(13), devinfo->gen < 7)
+FF(urb_swizzle_control,
+ /* 4: */ MD(11), MD(10),
+ /* 4.5: */ MD(11), MD(10),
+ /* 5: */ MD(11), MD(10),
+ /* 6: */ MD(11), MD(10),
+ /* 7: */ MD(14), MD(14),
+ /* 8: */ MD(15), MD(15))
+FF(urb_global_offset,
+ /* 4: */ MD( 9), MD(4),
+ /* 4.5: */ MD( 9), MD(4),
+ /* 5: */ MD( 9), MD(4),
+ /* 6: */ MD( 9), MD(4),
+ /* 7: */ MD(13), MD(3),
+ /* 8: */ MD(14), MD(4))
+FF(urb_opcode,
+ /* 4: */ MD( 3), MD(0),
+ /* 4.5: */ MD( 3), MD(0),
+ /* 5: */ MD( 3), MD(0),
+ /* 6: */ MD( 3), MD(0),
+ /* 7: */ MD( 2), MD(0),
+ /* 8: */ MD( 3), MD(0))
+/** @} */
+
+/**
+ * Gen4-5 math messages:
+ *
+ * Every accessor in this group asserts devinfo->gen < 6.
+ * @{
+ */
+FC(math_msg_data_type, MD(7), MD(7), devinfo->gen < 6)
+FC(math_msg_saturate, MD(6), MD(6), devinfo->gen < 6)
+FC(math_msg_precision, MD(5), MD(5), devinfo->gen < 6)
+FC(math_msg_signed_int, MD(4), MD(4), devinfo->gen < 6)
+FC(math_msg_function, MD(3), MD(0), devinfo->gen < 6)
+/** @} */
+
+/**
+ * Sampler message function control bits:
+ *
+ * sampler_return_format is only accepted on original Gen4 (gen == 4 and not
+ * G4X); sampler_simd_mode does not exist there.
+ * @{
+ */
+FF(sampler_simd_mode,
+ /* 4: doesn't exist */ -1, -1, -1, -1,
+ /* 5: */ MD(17), MD(16),
+ /* 6: */ MD(17), MD(16),
+ /* 7: */ MD(18), MD(17),
+ /* 8: */ MD(18), MD(17))
+FF(sampler_msg_type,
+ /* 4: */ MD(15), MD(14),
+ /* 4.5: */ MD(15), MD(12),
+ /* 5: */ MD(15), MD(12),
+ /* 6: */ MD(15), MD(12),
+ /* 7: */ MD(16), MD(12),
+ /* 8: */ MD(16), MD(12))
+FC(sampler_return_format, MD(13), MD(12), devinfo->gen == 4 && !devinfo->is_g4x)
+F(sampler, MD(11), MD(8))
+F(binding_table_index, MD( 7), MD(0)) /* also used by other messages */
+/** @} */
+
+/**
+ * Data port message function control bits:
+ *
+ * The dp_read_* and dp_write_* accessors cover every generation.  The
+ * generic dp_msg_type exists only on Gen6+, and dp_msg_control is marked
+ * nonexistent (-1) on G4X and Gen5, where the read/write specific accessors
+ * must be used instead.
+ * @{
+ */
+FC(dp_category, MD(18), MD(18), devinfo->gen >= 7)
+
+/* Gen4-5 store fields in different bits for read/write messages. */
+FF(dp_read_msg_type,
+ /* 4: */ MD(13), MD(12),
+ /* 4.5: */ MD(13), MD(11),
+ /* 5: */ MD(13), MD(11),
+ /* 6: */ MD(16), MD(13),
+ /* 7: */ MD(17), MD(14),
+ /* 8: */ MD(17), MD(14))
+FF(dp_write_msg_type,
+ /* 4: */ MD(14), MD(12),
+ /* 4.5: */ MD(14), MD(12),
+ /* 5: */ MD(14), MD(12),
+ /* 6: */ MD(16), MD(13),
+ /* 7: */ MD(17), MD(14),
+ /* 8: */ MD(17), MD(14))
+FF(dp_read_msg_control,
+ /* 4: */ MD(11), MD( 8),
+ /* 4.5: */ MD(10), MD( 8),
+ /* 5: */ MD(10), MD( 8),
+ /* 6: */ MD(12), MD( 8),
+ /* 7: */ MD(13), MD( 8),
+ /* 8: */ MD(13), MD( 8))
+FF(dp_write_msg_control,
+ /* 4: */ MD(11), MD( 8),
+ /* 4.5: */ MD(11), MD( 8),
+ /* 5: */ MD(11), MD( 8),
+ /* 6: */ MD(12), MD( 8),
+ /* 7: */ MD(13), MD( 8),
+ /* 8: */ MD(13), MD( 8))
+FC(dp_read_target_cache, MD(15), MD(14), devinfo->gen < 6);
+
+FF(dp_write_commit,
+ /* 4: */ MD(15), MD(15),
+ /* 4.5: */ MD(15), MD(15),
+ /* 5: */ MD(15), MD(15),
+ /* 6: */ MD(17), MD(17),
+ /* 7+: does not exist */ -1, -1, -1, -1)
+
+/* Gen6+ use the same bit locations for everything. */
+FF(dp_msg_type,
+ /* 4-5: use dp_read_msg_type or dp_write_msg_type instead */
+ -1, -1, -1, -1, -1, -1,
+ /* 6: */ MD(16), MD(13),
+ /* 7: */ MD(17), MD(14),
+ /* 8: */ MD(17), MD(14))
+FF(dp_msg_control,
+ /* 4: */ MD(11), MD( 8),
+ /* 4.5-5: use dp_read_msg_control or dp_write_msg_control */ -1, -1, -1, -1,
+ /* 6: */ MD(12), MD( 8),
+ /* 7: */ MD(13), MD( 8),
+ /* 8: */ MD(13), MD( 8))
+/** @} */
+
+/**
+ * Scratch message bits (Gen7+):
+ *
+ * Every accessor in this group asserts devinfo->gen >= 7.
+ * @{
+ */
+FC(scratch_read_write, MD(17), MD(17), devinfo->gen >= 7) /* 0 = read, 1 = write */
+FC(scratch_type, MD(16), MD(16), devinfo->gen >= 7) /* 0 = OWord, 1 = DWord */
+FC(scratch_invalidate_after_read, MD(15), MD(15), devinfo->gen >= 7)
+FC(scratch_block_size, MD(13), MD(12), devinfo->gen >= 7)
+FC(scratch_addr_offset, MD(11), MD( 0), devinfo->gen >= 7)
+/** @} */
+
+/**
+ * Render Target message function control bits:
+ *
+ * rt_last sits at MD(11) before Gen6 and at MD(12) from Gen6 on; MD(11) is
+ * then used by rt_slot_group, which asserts devinfo->gen >= 6.
+ * @{
+ */
+FF(rt_last,
+ /* 4: */ MD(11), MD(11),
+ /* 4.5: */ MD(11), MD(11),
+ /* 5: */ MD(11), MD(11),
+ /* 6: */ MD(12), MD(12),
+ /* 7: */ MD(12), MD(12),
+ /* 8: */ MD(12), MD(12))
+FC(rt_slot_group, MD(11), MD(11), devinfo->gen >= 6)
+F(rt_message_type, MD(10), MD( 8))
+/** @} */
+
+/**
+ * Thread Spawn message function control bits:
+ *
+ * Defined with F(), so no generation check is asserted.
+ * @{
+ */
+F(ts_resource_select, MD( 4), MD( 4))
+F(ts_request_type, MD( 1), MD( 1))
+F(ts_opcode, MD( 0), MD( 0))
+/** @} */
+
+/**
+ * Pixel Interpolator message function control bits:
+ *
+ * Defined with F(), so no generation check is asserted.
+ * @{
+ */
+F(pi_simd_mode, MD(16), MD(16))
+F(pi_nopersp, MD(14), MD(14))
+F(pi_message_type, MD(13), MD(12))
+F(pi_slot_group, MD(11), MD(11))
+F(pi_message_data, MD(7), MD(0))
+/** @} */
+
+/**
+ * Immediates:
+ * @{
+ */
+/** Return the 32-bit immediate held in bits 127:96, converted to int. */
+static inline int
+brw_inst_imm_d(const struct gen_device_info *devinfo, const brw_inst *insn)
+{
+   const uint64_t raw = brw_inst_bits(insn, 127, 96);
+
+   (void) devinfo;
+   return raw;
+}
+
+/** Return the 32-bit immediate held in bits 127:96 as an unsigned value. */
+static inline unsigned
+brw_inst_imm_ud(const struct gen_device_info *devinfo, const brw_inst *insn)
+{
+   const uint64_t raw = brw_inst_bits(insn, 127, 96);
+
+   (void) devinfo;
+   return raw;
+}
+
+/** Return bits 127:96 reinterpreted, bit for bit, as a float. */
+static inline float
+brw_inst_imm_f(const struct gen_device_info *devinfo, const brw_inst *insn)
+{
+   /* The union reinterprets the stored bit pattern without converting it. */
+   union {
+      uint32_t u;
+      float f;
+   } pun;
+
+   (void) devinfo;
+   pun.u = brw_inst_bits(insn, 127, 96);
+   return pun.f;
+}
+
+/** Return bits 127:64 reinterpreted, bit for bit, as a double. */
+static inline double
+brw_inst_imm_df(const struct gen_device_info *devinfo, const brw_inst *insn)
+{
+   /* The union reinterprets the stored bit pattern without converting it. */
+   union {
+      uint64_t u;
+      double d;
+   } pun;
+
+   (void) devinfo;
+   pun.u = brw_inst_bits(insn, 127, 64);
+   return pun.d;
+}
+
+/**
+ * Store a signed 32-bit immediate in bits 127:96.
+ *
+ * The value is converted to uint32_t first.  Passing the int straight
+ * through would sign-extend a negative value to 64 bits, which
+ * brw_inst_set_bits() rejects because it no longer fits the 32-bit field.
+ * brw_inst_set_uip() and brw_inst_set_jip() cast for the same reason.
+ */
+static inline void
+brw_inst_set_imm_d(const struct gen_device_info *devinfo,
+                   brw_inst *insn, int value)
+{
+   (void) devinfo;
+   /* No "return": ISO C does not allow returning an expression from a void
+    * function.
+    */
+   brw_inst_set_bits(insn, 127, 96, (uint32_t) value);
+}
+
+/** Store an unsigned 32-bit immediate in bits 127:96. */
+static inline void
+brw_inst_set_imm_ud(const struct gen_device_info *devinfo,
+                    brw_inst *insn, unsigned value)
+{
+   (void) devinfo;
+   /* No "return": ISO C does not allow returning an expression from a void
+    * function.
+    */
+   brw_inst_set_bits(insn, 127, 96, value);
+}
+
+/** Store the bit pattern of a float immediate in bits 127:96. */
+static inline void
+brw_inst_set_imm_f(const struct gen_device_info *devinfo,
+                   brw_inst *insn, float value)
+{
+   /* The union exposes the float's bit pattern without converting it. */
+   union {
+      uint32_t u;
+      float f;
+   } pun;
+
+   (void) devinfo;
+   pun.f = value;
+   brw_inst_set_bits(insn, 127, 96, pun.u);
+}
+
+/** Store the bit pattern of a double immediate in bits 127:64. */
+static inline void
+brw_inst_set_imm_df(const struct gen_device_info *devinfo,
+                    brw_inst *insn, double value)
+{
+   /* The union exposes the double's bit pattern without converting it. */
+   union {
+      uint64_t u;
+      double d;
+   } pun;
+
+   (void) devinfo;
+   pun.d = value;
+   brw_inst_set_bits(insn, 127, 64, pun.u);
+}
+
+/** Store an unsigned 64-bit immediate in bits 127:64. */
+static inline void
+brw_inst_set_imm_uq(const struct gen_device_info *devinfo,
+                    brw_inst *insn, uint64_t value)
+{
+   brw_inst_set_bits(insn, 127, 64, value);
+   (void) devinfo;
+}
+
+/** @} */
+
+/* The AddrImm fields are split into two discontiguous sections on Gen8+:
+ * bits [8:0] of the 10-bit value go to g8_high:g8_low and bit 9 goes to the
+ * single bit g8_nine.  Before Gen8 the whole value goes to g4_high:g4_low.
+ * The setter asserts that the value fits in 10 bits.
+ */
+#define BRW_IA1_ADDR_IMM(reg, g4_high, g4_low, g8_nine, g8_high, g8_low) \
+static inline void \
+brw_inst_set_##reg##_ia1_addr_imm(const struct gen_device_info *devinfo, \
+ brw_inst *inst, \
+ unsigned value) \
+{ \
+ assert((value & ~0x3ff) == 0); \
+ if (devinfo->gen >= 8) { \
+ brw_inst_set_bits(inst, g8_high, g8_low, value & 0x1ff); \
+ brw_inst_set_bits(inst, g8_nine, g8_nine, value >> 9); \
+ } else { \
+ brw_inst_set_bits(inst, g4_high, g4_low, value); \
+ } \
+} \
+static inline unsigned \
+brw_inst_##reg##_ia1_addr_imm(const struct gen_device_info *devinfo, \
+ const brw_inst *inst) \
+{ \
+ if (devinfo->gen >= 8) { \
+ return brw_inst_bits(inst, g8_high, g8_low) | \
+ (brw_inst_bits(inst, g8_nine, g8_nine) << 9); \
+ } else { \
+ return brw_inst_bits(inst, g4_high, g4_low); \
+ } \
+}
+
+/* AddrImm[9:0] for Align1 Indirect Addressing */
+/* -Gen 4- ----Gen8---- */
+BRW_IA1_ADDR_IMM(src1, 105, 96, 121, 104, 96)
+BRW_IA1_ADDR_IMM(src0, 73, 64, 95, 72, 64)
+BRW_IA1_ADDR_IMM(dst, 57, 48, 47, 56, 48)
+
+/* Align16 variant of the AddrImm accessors.  "value" is the full
+ * AddrImm[9:0], which the setter asserts fits in 10 bits.
+ *
+ * On Gen8+ only AddrImm[8:4] (g8_high:g8_low) and AddrImm[9] (g8_nine) are
+ * encoded, so the setter requires the low four bits to be zero and the
+ * getter shifts the stored bits back into place.  The previous code wrote
+ * value & 0x1ff into the 5-bit field, which brw_inst_set_bits() rejects for
+ * anything above 31, and read it back without the shift.
+ *
+ * Before Gen8 the value is stored as-is in g4_high:g4_low.  The previous
+ * code stored value >> 9 there, so only bit 9 survived a set/get round trip.
+ */
+#define BRW_IA16_ADDR_IMM(reg, g4_high, g4_low, g8_nine, g8_high, g8_low) \
+static inline void                                                        \
+brw_inst_set_##reg##_ia16_addr_imm(const struct gen_device_info *devinfo, \
+                                   brw_inst *inst, unsigned value)        \
+{                                                                         \
+   assert((value & ~0x3ff) == 0);                                         \
+   if (devinfo->gen >= 8) {                                               \
+      assert((value & 0xf) == 0);                                         \
+      brw_inst_set_bits(inst, g8_high, g8_low, (value >> 4) & 0x1f);      \
+      brw_inst_set_bits(inst, g8_nine, g8_nine, value >> 9);              \
+   } else {                                                               \
+      brw_inst_set_bits(inst, g4_high, g4_low, value);                    \
+   }                                                                      \
+}                                                                         \
+static inline unsigned                                                    \
+brw_inst_##reg##_ia16_addr_imm(const struct gen_device_info *devinfo,     \
+                               const brw_inst *inst)                      \
+{                                                                         \
+   if (devinfo->gen >= 8) {                                               \
+      return (brw_inst_bits(inst, g8_high, g8_low) << 4) |                \
+             (brw_inst_bits(inst, g8_nine, g8_nine) << 9);                \
+   } else {                                                               \
+      return brw_inst_bits(inst, g4_high, g4_low);                        \
+   }                                                                      \
+}
+
+/* AddrImm[9:0] for Align16 Indirect Addressing:
+ * Compared to Align1, these are missing the low 4 bits.
+ *
+ * NOTE(review): the Gen4 dst range (57:52) is 6 bits wide while the src
+ * ranges are 10 bits; confirm against the hardware documentation how the
+ * pre-Gen8 dst value is meant to be encoded.
+ *
+ *                     -Gen 4-  ----Gen8----
+ */
+BRW_IA16_ADDR_IMM(src1, 105, 96, 121, 104, 100)
+BRW_IA16_ADDR_IMM(src0, 73, 64, 95, 72, 68)
+BRW_IA16_ADDR_IMM(dst, 57, 52, 47, 56, 52)
+
+/**
+ * Extract the contiguous field [high:low] from the instruction and return it
+ * right-aligned.
+ *
+ * Bit indices range from 0..127; fields may not cross 64-bit boundaries.
+ */
+static inline uint64_t
+brw_inst_bits(const brw_inst *inst, unsigned high, unsigned low)
+{
+   /* Both ends of the field must lie in the same 64-bit word. */
+   const unsigned word = high / 64;
+   assert(word == low / 64);
+
+   /* Positions relative to the selected word. */
+   const unsigned hi = high % 64;
+   const unsigned lo = low % 64;
+   const unsigned width = hi - lo + 1;
+
+   return (inst->data[word] >> lo) & (~0ull >> (64 - width));
+}
+
+/**
+ * Store "value" in the field [high:low] of the instruction, leaving every
+ * other bit untouched.
+ *
+ * Bit indices range from 0..127; fields may not cross 64-bit boundaries, and
+ * "value" has to fit in the field.  Both conditions are asserted.
+ */
+static inline void
+brw_inst_set_bits(brw_inst *inst, unsigned high, unsigned low, uint64_t value)
+{
+   const unsigned word = high / 64;
+   assert(word == low / 64);
+
+   /* Positions relative to the selected word. */
+   const unsigned hi = high % 64;
+   const unsigned lo = low % 64;
+
+   /* All-ones value of the field's width, right-aligned. */
+   const uint64_t field_max = ~0ull >> (64 - (hi - lo + 1));
+
+   /* Make sure the supplied value actually fits in the given bitfield. */
+   assert((value & field_max) == value);
+
+   uint64_t bits = inst->data[word];
+   bits &= ~(field_max << lo);
+   bits |= value << lo;
+   inst->data[word] = bits;
+}
+
+/* Drop the helper macros used to build the brw_inst accessors so that their
+ * short names do not leak out of this section.  FJ was previously left
+ * defined.
+ */
+#undef BRW_IA16_ADDR_IMM
+#undef BRW_IA1_ADDR_IMM
+#undef MD
+#undef FJ
+#undef F8
+#undef FF
+#undef BOUNDS
+#undef F
+#undef FC
+
+/* A compacted instruction: a single 64-bit word, addressed by the compact
+ * accessors with bit indices 0..63.
+ */
+typedef struct {
+ uint64_t data;
+} brw_compact_inst;
+
+/**
+ * Fetch a set of contiguous bits from the compacted instruction.
+ *
+ * Bit indices range from 0..63, with high >= low.  The result is returned
+ * as unsigned, so a field wider than unsigned is truncated.
+ */
+static inline unsigned
+brw_compact_inst_bits(const brw_compact_inst *inst, unsigned high, unsigned low)
+{
+   /* Same mask construction as brw_inst_bits().  The earlier
+    * (1ull << width) - 1 form shifted by 64, which is undefined, when the
+    * field spanned the whole word.
+    */
+   const uint64_t mask = ~0ull >> (64 - (high - low + 1));
+
+   return (inst->data >> low) & mask;
+}
+
+/**
+ * Set bits in the compacted instruction, leaving all other bits untouched.
+ *
+ * Bit indices range from 0..63, with high >= low.  "value" has to fit in
+ * the field, which is asserted.
+ */
+static inline void
+brw_compact_inst_set_bits(brw_compact_inst *inst, unsigned high, unsigned low,
+                          uint64_t value)
+{
+   /* Same mask construction as brw_inst_set_bits().  The earlier
+    * (1ull << width) - 1 form shifted by 64, which is undefined, when the
+    * field spanned the whole word.
+    */
+   const uint64_t mask = (~0ull >> (64 - (high - low + 1))) << low;
+
+   /* Make sure the supplied value actually fits in the given bitfield. */
+   assert((value & (mask >> low)) == value);
+
+   inst->data = (inst->data & ~mask) | (value << low);
+}
+
+/* Define brw_compact_inst_<name>() and brw_compact_inst_set_<name>() for the
+ * field that occupies bits [high:low] of a compacted instruction.
+ * "assertions" is an expression checked with assert() on every access; it
+ * may refer to devinfo.
+ */
+#define FC(name, high, low, assertions)                            \
+static inline unsigned                                             \
+brw_compact_inst_##name(const struct gen_device_info *devinfo,     \
+                        const brw_compact_inst *inst)              \
+{                                                                  \
+   assert(assertions);                                             \
+   (void) devinfo;                                                 \
+   return brw_compact_inst_bits(inst, high, low);                  \
+}                                                                  \
+static inline void                                                 \
+brw_compact_inst_set_##name(const struct gen_device_info *devinfo, \
+                            brw_compact_inst *inst, unsigned value) \
+{                                                                  \
+   assert(assertions);                                             \
+   (void) devinfo;                                                 \
+   brw_compact_inst_set_bits(inst, high, low, value);              \
+}
+
+/* Shorthand for fields that sit at the same bits on all generations and so
+ * need no generation check.
+ */
+#define F(name, high, low) FC(name, high, low, true)
+
+/* Fields of the compacted instruction.  Each entry defines a
+ * brw_compact_inst_<name>() getter and a brw_compact_inst_set_<name>()
+ * setter for the listed [high, low] bit range; FC entries assert the given
+ * generation condition.
+ */
+F(src1_reg_nr, 63, 56)
+F(src0_reg_nr, 55, 48)
+F(dst_reg_nr, 47, 40)
+F(src1_index, 39, 35)
+F(src0_index, 34, 30)
+F(cmpt_control, 29, 29) /* Same location as brw_inst */
+FC(flag_subreg_nr, 28, 28, devinfo->gen <= 6)
+F(cond_modifier, 27, 24) /* Same location as brw_inst */
+FC(acc_wr_control, 23, 23, devinfo->gen >= 6)
+FC(mask_control_ex, 23, 23, devinfo->is_g4x || devinfo->gen == 5)
+F(subreg_index, 22, 18)
+F(datatype_index, 17, 13)
+F(control_index, 12, 8)
+F(debug_control, 7, 7)
+F(opcode, 6, 0) /* Same location as brw_inst */
+
+/**
+ * (Gen8+) Compacted three-source instructions:
+ *
+ * Every accessor in this group asserts devinfo->gen >= 8.
+ * @{
+ */
+FC(3src_src2_reg_nr, 63, 57, devinfo->gen >= 8)
+FC(3src_src1_reg_nr, 56, 50, devinfo->gen >= 8)
+FC(3src_src0_reg_nr, 49, 43, devinfo->gen >= 8)
+FC(3src_src2_subreg_nr, 42, 40, devinfo->gen >= 8)
+FC(3src_src1_subreg_nr, 39, 37, devinfo->gen >= 8)
+FC(3src_src0_subreg_nr, 36, 34, devinfo->gen >= 8)
+FC(3src_src2_rep_ctrl, 33, 33, devinfo->gen >= 8)
+FC(3src_src1_rep_ctrl, 32, 32, devinfo->gen >= 8)
+FC(3src_saturate, 31, 31, devinfo->gen >= 8)
+FC(3src_debug_control, 30, 30, devinfo->gen >= 8)
+FC(3src_cmpt_control, 29, 29, devinfo->gen >= 8)
+FC(3src_src0_rep_ctrl, 28, 28, devinfo->gen >= 8)
+/* Reserved */
+FC(3src_dst_reg_nr, 18, 12, devinfo->gen >= 8)
+FC(3src_source_index, 11, 10, devinfo->gen >= 8)
+FC(3src_control_index, 9, 8, devinfo->gen >= 8)
+/* Bit 7 is Reserved (for future Opcode expansion) */
+FC(3src_opcode, 6, 0, devinfo->gen >= 8)
+/** @} */
+
+/* Drop both helper macros used for the compact accessors; FC was previously
+ * left defined for every file that includes this header.
+ */
+#undef F
+#undef FC
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/intel/compiler/brw_interpolation_map.c b/src/intel/compiler/brw_interpolation_map.c
new file mode 100644
index 00000000000..7b9f58eb6ee
--- /dev/null
+++ b/src/intel/compiler/brw_interpolation_map.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_compiler.h"
+#include "compiler/nir/nir.h"
+
+/* Map an INTERP_MODE_* value to a short name; any other value gives "???". */
+static char const *get_qual_name(int mode)
+{
+   if (mode == INTERP_MODE_NONE)
+      return "none";
+   if (mode == INTERP_MODE_FLAT)
+      return "flat";
+   if (mode == INTERP_MODE_SMOOTH)
+      return "smooth";
+   if (mode == INTERP_MODE_NOPERSPECTIVE)
+      return "nopersp";
+
+   return "???";
+}
+
+/* Assign "interp" to the VUE slots of the slot_count varyings starting at
+ * "location".  Varyings with no slot in vue_map, and slots whose mode is
+ * already something other than INTERP_MODE_NONE, are left alone.  The
+ * contains_flat_varying / contains_noperspective_varying flags are raised
+ * when a slot ends up flat or noperspective.
+ */
+static void
+gen4_frag_prog_set_interp_modes(struct brw_wm_prog_data *prog_data,
+                                struct brw_vue_map *vue_map,
+                                unsigned location, unsigned slot_count,
+                                enum glsl_interp_mode interp)
+{
+   unsigned idx;
+
+   for (idx = 0; idx < slot_count; idx++) {
+      const unsigned slot = vue_map->varying_to_slot[location + idx];
+
+      /* -1 marks a varying that has no slot in the VUE map. */
+      if (slot == -1)
+         continue;
+      if (prog_data->interp_mode[slot] != INTERP_MODE_NONE)
+         continue;
+
+      prog_data->interp_mode[slot] = interp;
+
+      switch (prog_data->interp_mode[slot]) {
+      case INTERP_MODE_FLAT:
+         prog_data->contains_flat_varying = true;
+         break;
+      case INTERP_MODE_NOPERSPECTIVE:
+         prog_data->contains_noperspective_varying = true;
+         break;
+      default:
+         break;
+      }
+   }
+}
+
+/* Set up interpolation modes for every element in the VUE.
+ *
+ * prog_data->interp_mode is cleared first.  If vue_map is NULL nothing else
+ * happens.  Otherwise the position slot (when present) is forced to
+ * noperspective, and every NIR input variable contributes its interpolation
+ * mode to the slots it occupies; COL0/COL1 inputs also set the matching
+ * BFC0/BFC1 slots.
+ */
+void
+brw_setup_vue_interpolation(struct brw_vue_map *vue_map, nir_shader *nir,
+                            struct brw_wm_prog_data *prog_data,
+                            const struct gen_device_info *devinfo)
+{
+   /* Initialise interp_mode. INTERP_MODE_NONE == 0 */
+   memset(prog_data->interp_mode, 0, sizeof(prog_data->interp_mode));
+
+   if (!vue_map)
+      return;
+
+   /* HPOS always wants noperspective. setting it up here allows
+    * us to not need special handling in the SF program.
+    */
+   unsigned pos_slot = vue_map->varying_to_slot[VARYING_SLOT_POS];
+   if (pos_slot != -1) {
+      prog_data->interp_mode[pos_slot] = INTERP_MODE_NOPERSPECTIVE;
+      prog_data->contains_noperspective_varying = true;
+   }
+
+   foreach_list_typed(nir_variable, var, node, &nir->inputs) {
+      unsigned location = var->data.location;
+      unsigned slot_count = glsl_count_attribute_slots(var->type, false);
+
+      gen4_frag_prog_set_interp_modes(prog_data, vue_map, location, slot_count,
+                                      var->data.interpolation);
+
+      /* Back-face colours take the same mode as their front-face input. */
+      if (location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1) {
+         location = location + VARYING_SLOT_BFC0 - VARYING_SLOT_COL0;
+         gen4_frag_prog_set_interp_modes(prog_data, vue_map, location,
+                                         slot_count, var->data.interpolation);
+      }
+   }
+
+   /* Flip to true locally to dump the resulting VUE map to stderr. */
+   bool debug = false;
+   if (debug) {
+      fprintf(stderr, "VUE map:\n");
+      for (int i = 0; i < vue_map->num_slots; i++) {
+         int varying = vue_map->slot_to_varying[i];
+         if (varying == -1) {
+            fprintf(stderr, "%d: --\n", i);
+            continue;
+         }
+
+         fprintf(stderr, "%d: %d %s ofs %d\n",
+                 i, varying,
+                 get_qual_name(prog_data->interp_mode[i]),
+                 brw_vue_slot_to_offset(i));
+      }
+   }
+}
diff --git a/src/intel/compiler/brw_ir_allocator.h b/src/intel/compiler/brw_ir_allocator.h
new file mode 100644
index 00000000000..b1237ed38e7
--- /dev/null
+++ b/src/intel/compiler/brw_ir_allocator.h
@@ -0,0 +1,87 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2010-2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_IR_ALLOCATOR_H
+#define BRW_IR_ALLOCATOR_H
+
+#include "main/macros.h"
+
+namespace brw {
+   /**
+    * Simple allocator used to keep track of virtual GRFs.
+    *
+    * Each allocate() call appends one entry to the parallel "sizes" and
+    * "offsets" arrays and returns that entry's index.
+    */
+   class simple_allocator {
+   public:
+      simple_allocator() :
+         sizes(NULL), offsets(NULL), count(0), total_size(0), capacity(0)
+      {
+      }
+
+      ~simple_allocator()
+      {
+         free(sizes);
+         free(offsets);
+      }
+
+      /**
+       * Record a new allocation of "size" units placed at the current end of
+       * the space and return its index.
+       */
+      unsigned
+      allocate(unsigned size)
+      {
+         /* Out of room: double the arrays, starting from 16 entries. */
+         if (count >= capacity) {
+            capacity = MAX2(16, capacity * 2);
+            sizes = (unsigned *)realloc(sizes, capacity * sizeof(unsigned));
+            offsets = (unsigned *)realloc(offsets, capacity * sizeof(unsigned));
+         }
+
+         const unsigned index = count++;
+
+         sizes[index] = size;
+         offsets[index] = total_size;
+         total_size += size;
+
+         return index;
+      }
+
+      /**
+       * Array of sizes for each allocation.  The allocation unit is up to the
+       * back-end, but it's expected to be one scalar value in the FS back-end
+       * and one vec4 in the VEC4 back-end.
+       */
+      unsigned *sizes;
+
+      /**
+       * Array of offsets from the start of the VGRF space in allocation
+       * units.
+       */
+      unsigned *offsets;
+
+      /** Total number of VGRFs allocated. */
+      unsigned count;
+
+      /** Cumulative size in allocation units. */
+      unsigned total_size;
+
+   private:
+      /** Number of entries the sizes/offsets arrays currently have room for. */
+      unsigned capacity;
+   };
+}
+
+#endif
diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
new file mode 100644
index 00000000000..cad371248c4
--- /dev/null
+++ b/src/intel/compiler/brw_ir_fs.h
@@ -0,0 +1,451 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2010-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_IR_FS_H
+#define BRW_IR_FS_H
+
+#include "brw_shader.h"
+
+class fs_inst;
+
+class fs_reg : public backend_reg { /* Register operand type: a backend_reg plus a horizontal stride. */
+public:
+ DECLARE_RALLOC_CXX_OPERATORS(fs_reg)
+
+ void init(); /* Declared only; the definition is not part of this header. */
+
+ fs_reg(); /* The constructors below are declarations only as well. */
+ fs_reg(struct ::brw_reg reg);
+ fs_reg(enum brw_reg_file file, int nr);
+ fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type);
+
+ bool equals(const fs_reg &r) const;
+ bool is_contiguous() const;
+
+ /**
+ * Return the size in bytes of a single logical component of the
+ * register assuming the given execution width.
+ */
+ unsigned component_size(unsigned width) const;
+
+ /** Register region horizontal stride */
+ uint8_t stride;
+};
+
+static inline fs_reg
+negate(fs_reg reg)
+{ /* Return a copy of \p reg with its negate flag flipped. */
+ assert(reg.file != IMM); /* Immediates are rejected here. */
+ reg.negate = !reg.negate;
+ return reg;
+}
+
+static inline fs_reg
+retype(fs_reg reg, enum brw_reg_type type)
+{ /* Return a copy of \p reg whose type field is replaced by \p type; nothing else is touched. */
+ reg.type = type;
+ return reg;
+}
+
+static inline fs_reg
+byte_offset(fs_reg reg, unsigned delta)
+{ /* Return a copy of \p reg advanced by \p delta bytes. */
+ switch (reg.file) {
+ case BAD_FILE: /* Returned unchanged. */
+ break;
+ case VGRF:
+ case ATTR:
+ case UNIFORM:
+ reg.offset += delta; /* These files simply accumulate into the byte offset. */
+ break;
+ case MRF: {
+ const unsigned suboffset = reg.offset + delta;
+ reg.nr += suboffset / REG_SIZE; /* Carry whole registers into nr... */
+ reg.offset = suboffset % REG_SIZE; /* ...and keep the remainder as the offset. */
+ break;
+ }
+ case ARF:
+ case FIXED_GRF: { /* Same carry scheme as MRF, but based on subnr. */
+ const unsigned suboffset = reg.subnr + delta;
+ reg.nr += suboffset / REG_SIZE;
+ reg.subnr = suboffset % REG_SIZE;
+ break;
+ }
+ case IMM:
+ default:
+ assert(delta == 0); /* Only a zero offset is accepted for IMM and any other file. */
+ }
+ return reg;
+}
+
+static inline fs_reg
+horiz_offset(const fs_reg &reg, unsigned delta)
+{ /* Return \p reg advanced by \p delta elements, each scaled by the register's stride and type size. */
+ switch (reg.file) {
+ case BAD_FILE:
+ case UNIFORM:
+ case IMM:
+ /* These only have a single component that is implicitly splatted. A
+ * horizontal offset should be a harmless no-op.
+ * XXX - Handle vector immediates correctly.
+ */
+ return reg;
+ case VGRF:
+ case MRF:
+ case ATTR:
+ return byte_offset(reg, delta * reg.stride * type_sz(reg.type)); /* Uses the fs_reg stride field directly. */
+ case ARF:
+ case FIXED_GRF:
+ if (reg.is_null()) { /* The null register is returned unchanged. */
+ return reg;
+ } else {
+ const unsigned stride = reg.hstride ? 1 << (reg.hstride - 1) : 0; /* Decode hstride: 0 stays 0, otherwise 1 << (hstride - 1). */
+ return byte_offset(reg, delta * stride * type_sz(reg.type));
+ }
+ }
+ unreachable("Invalid register file");
+}
+
+static inline fs_reg
+offset(fs_reg reg, unsigned width, unsigned delta)
+{ /* Advance \p reg by \p delta components of reg.component_size(width) bytes each. */
+ switch (reg.file) {
+ case BAD_FILE: /* Returned unchanged. */
+ break;
+ case ARF:
+ case FIXED_GRF:
+ case MRF:
+ case VGRF:
+ case ATTR:
+ case UNIFORM:
+ return byte_offset(reg, delta * reg.component_size(width));
+ case IMM:
+ assert(delta == 0); /* Immediates only accept a zero offset. */
+ }
+ return reg;
+}
+
+/**
+ * Get the scalar channel of \p reg given by \p idx and replicate it to all
+ * channels of the result.
+ */
+static inline fs_reg
+component(fs_reg reg, unsigned idx)
+{
+ reg = horiz_offset(reg, idx); /* Step to element idx first... */
+ reg.stride = 0; /* ...then clear the stride of the returned copy. */
+ return reg;
+}
+
+/**
+ * Return an integer identifying the discrete address space a register is
+ * contained in. A register is by definition fully contained in the single
+ * reg_space it belongs to, so two registers with different reg_space ids are
+ * guaranteed not to overlap. Most register files are a single reg_space of
+ * their own; only the VGRF file is composed of multiple discrete address
+ * spaces, one for each VGRF allocation.
+ */
+static inline uint32_t
+reg_space(const fs_reg &r)
+{ /* Id = file shifted left by 16, OR'ed with nr for VGRF (0 for every other file). */
+ return r.file << 16 | (r.file == VGRF ? r.nr : 0);
+}
+
+/**
+ * Return the base offset in bytes of a register relative to the start of its
+ * reg_space().
+ */
+static inline unsigned
+reg_offset(const fs_reg &r)
+{ /* Sum of the scaled register number, the byte offset and (ARF/FIXED_GRF only) subnr. */
+ return (r.file == VGRF || r.file == IMM ? 0 : r.nr) * /* nr is ignored for VGRF and IMM */
+ (r.file == UNIFORM ? 4 : REG_SIZE) + r.offset + /* nr counts 4-byte units for UNIFORM, REG_SIZE units otherwise */
+ (r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
+}
+
+/**
+ * Return the amount of padding in bytes left unused between individual
+ * components of register \p r due to a (horizontal) stride value greater than
+ * one, or zero if components are tightly packed in the register file.
+ */
+static inline unsigned
+reg_padding(const fs_reg &r)
+{
+ const unsigned stride = ((r.file != ARF && r.file != FIXED_GRF) ? r.stride : /* plain stride field for most files */
+ r.hstride == 0 ? 0 : /* ARF/FIXED_GRF: decode hstride instead */
+ 1 << (r.hstride - 1));
+ return (MAX2(1, stride) - 1) * type_sz(r.type); /* Strides of 0 and 1 both yield zero padding. */
+}
+
+/**
+ * Return whether the register region starting at \p r and spanning \p dr
+ * bytes could potentially overlap the register region starting at \p s and
+ * spanning \p ds bytes.
+ */
+static inline bool
+regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
+{
+ if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) {
+ fs_reg t = r;
+ t.nr &= ~BRW_MRF_COMPR4; /* Work on a copy with the COMPR4 bit cleared. */
+ /* COMPR4 regions are translated by the hardware during decompression
+ * into two separate half-regions 4 MRFs apart from each other.
+ */
+ return regions_overlap(t, dr / 2, s, ds) || /* first half */
+ regions_overlap(byte_offset(t, 4 * REG_SIZE), dr / 2, s, ds); /* second half, 4 registers further */
+
+ } else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) {
+ return regions_overlap(s, ds, r, dr); /* Swap the arguments so the COMPR4 case above handles s. */
+
+ } else {
+ return reg_space(r) == reg_space(s) && /* Different spaces never overlap. */
+ !(reg_offset(r) + dr <= reg_offset(s) || /* r ends at or before s starts */
+ reg_offset(s) + ds <= reg_offset(r)); /* s ends at or before r starts */
+ }
+}
+
+/**
+ * Check that the register region given by r [r.offset, r.offset + dr[
+ * is fully contained inside the register region given by s
+ * [s.offset, s.offset + ds[.
+ */
+static inline bool
+region_contained_in(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
+{ /* Offsets are compared via reg_offset(), i.e. relative to the shared reg_space. */
+ return reg_space(r) == reg_space(s) &&
+ reg_offset(r) >= reg_offset(s) && /* r starts no earlier than s */
+ reg_offset(r) + dr <= reg_offset(s) + ds; /* r ends no later than s */
+}
+
+/**
+ * Return whether the given register region is n-periodic, i.e. whether the
+ * original region remains invariant after shifting it by \p n scalar
+ * channels.
+ */
+static inline bool
+is_periodic(const fs_reg &reg, unsigned n)
+{
+ if (reg.file == BAD_FILE || reg.is_null()) { /* Treated as periodic for every n. */
+ return true;
+
+ } else if (reg.file == IMM) {
+ const unsigned period = (reg.type == BRW_REGISTER_TYPE_UV || /* UV and V immediates: period 8 */
+ reg.type == BRW_REGISTER_TYPE_V ? 8 :
+ reg.type == BRW_REGISTER_TYPE_VF ? 4 : /* VF immediates: period 4 */
+ 1); /* any other immediate type: period 1 */
+ return n % period == 0;
+
+ } else if (reg.file == ARF || reg.file == FIXED_GRF) {
+ const unsigned period = (reg.hstride == 0 && reg.vstride == 0 ? 1 : /* both strides zero: period 1 */
+ reg.vstride == 0 ? 1 << reg.width : /* only vstride zero: period 1 << width */
+ ~0); /* otherwise ~0, so only n == 0 passes the test below */
+ return n % period == 0;
+
+ } else {
+ return reg.stride == 0; /* Remaining files: periodic only with a zero stride, whatever n is. */
+ }
+}
+
+static inline bool
+is_uniform(const fs_reg &reg)
+{ /* A register is considered uniform when it is 1-periodic. */
+ return is_periodic(reg, 1);
+}
+
+/**
+ * Get the specified 8-component quarter of a register.
+ * XXX - Maybe come up with a less misleading name for this (e.g. quarter())?
+ */
+static inline fs_reg
+half(const fs_reg &reg, unsigned idx)
+{
+ assert(idx < 2); /* Only indices 0 and 1 are accepted. */
+ return horiz_offset(reg, 8 * idx); /* Skip 8 elements per index step. */
+}
+
+/**
+ * Reinterpret each channel of register \p reg as a vector of values of the
+ * given smaller type and take the i-th subcomponent from each.
+ */
+static inline fs_reg
+subscript(fs_reg reg, brw_reg_type type, unsigned i)
+{
+ assert((i + 1) * type_sz(type) <= type_sz(reg.type)); /* Subcomponent i must fit inside one element of the original type. */
+
+ if (reg.file == ARF || reg.file == FIXED_GRF) {
+ /* The stride is encoded inconsistently for fixed GRF and ARF registers
+ * as the log2 of the actual vertical and horizontal strides.
+ */
+ const int delta = _mesa_logbase2(type_sz(reg.type)) - /* log2 of the size ratio between the two types */
+ _mesa_logbase2(type_sz(type));
+ reg.hstride += (reg.hstride ? delta : 0); /* Zero-valued stride fields are left at zero. */
+ reg.vstride += (reg.vstride ? delta : 0);
+
+ } else if (reg.file == IMM) {
+ assert(reg.type == type); /* Immediates can only be subscripted with their own type. */
+
+ } else {
+ reg.stride *= type_sz(reg.type) / type_sz(type); /* Scale the stride by the size ratio of the two types. */
+ }
+
+ return byte_offset(retype(reg, type), i * type_sz(type)); /* Retype, then step to the i-th subcomponent. */
+}
+
+static const fs_reg reg_undef; /* Default-constructed fs_reg constant; static, so each including file gets its own copy. */
+
+class fs_inst : public backend_instruction { /* Instruction type with one fs_reg destination and an array of fs_reg sources. */
+ fs_inst &operator=(const fs_inst &); /* Declared in the private section (class default access). */
+
+ void init(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
+ const fs_reg *src, unsigned sources); /* Private helper; declaration only. */
+
+public:
+ DECLARE_RALLOC_CXX_OPERATORS(fs_inst)
+
+ fs_inst(); /* Constructor overloads take zero to three sources, or an explicit source array. */
+ fs_inst(enum opcode opcode, uint8_t exec_size);
+ fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst);
+ fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+ const fs_reg &src0);
+ fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+ const fs_reg &src0, const fs_reg &src1);
+ fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+ const fs_reg &src0, const fs_reg &src1, const fs_reg &src2);
+ fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+ const fs_reg src[], unsigned sources);
+ fs_inst(const fs_inst &that);
+ ~fs_inst();
+
+ void resize_sources(uint8_t num_sources);
+
+ bool equals(fs_inst *inst) const; /* The query methods below are declarations only. */
+ bool is_send_from_grf() const;
+ bool is_partial_write() const;
+ bool is_copy_payload(const brw::simple_allocator &grf_alloc) const;
+ unsigned components_read(unsigned i) const;
+ unsigned size_read(int arg) const;
+ bool can_do_source_mods(const struct gen_device_info *devinfo);
+ bool can_change_types() const;
+ bool has_side_effects() const;
+ bool has_source_and_destination_hazard() const;
+
+ /**
+ * Return the subset of flag registers read by the instruction as a bitset
+ * with byte granularity.
+ */
+ unsigned flags_read(const gen_device_info *devinfo) const;
+
+ /**
+ * Return the subset of flag registers updated by the instruction (either
+ * partially or fully) as a bitset with byte granularity.
+ */
+ unsigned flags_written() const;
+
+ fs_reg dst; /* Destination register. */
+ fs_reg *src; /* Source array; its length is tracked by the sources field below. */
+
+ uint8_t sources; /**< Number of fs_reg sources. */
+
+ bool eot:1; /* One-bit flag. */
+ bool pi_noperspective:1; /**< Pixel interpolator noperspective flag */
+};
+
+/**
+ * Make the execution of \p inst dependent on the evaluation of a possibly
+ * inverted predicate.
+ */
+static inline fs_inst *
+set_predicate_inv(enum brw_predicate pred, bool inverse,
+ fs_inst *inst)
+{ /* Modifies \p inst in place and returns the same pointer. */
+ inst->predicate = pred;
+ inst->predicate_inverse = inverse;
+ return inst;
+}
+
+/**
+ * Make the execution of \p inst dependent on the evaluation of a predicate.
+ */
+static inline fs_inst *
+set_predicate(enum brw_predicate pred, fs_inst *inst)
+{ /* Shorthand for set_predicate_inv() with inverse = false. */
+ return set_predicate_inv(pred, false, inst);
+}
+
+/**
+ * Write the result of evaluating the condition given by \p mod to a flag
+ * register.
+ */
+static inline fs_inst *
+set_condmod(enum brw_conditional_mod mod, fs_inst *inst)
+{ /* Stores \p mod in inst->conditional_mod and returns the same pointer. */
+ inst->conditional_mod = mod;
+ return inst;
+}
+
+/**
+ * Clamp the result of \p inst to the saturation range of its destination
+ * datatype.
+ */
+static inline fs_inst *
+set_saturate(bool saturate, fs_inst *inst)
+{ /* Stores \p saturate in inst->saturate (false clears it) and returns the same pointer. */
+ inst->saturate = saturate;
+ return inst;
+}
+
+/**
+ * Return the number of dataflow registers written by the instruction (either
+ * fully or partially) counted from 'floor(reg_offset(inst->dst) /
+ * register_size)'. The somewhat arbitrary register size unit is 4B for the
+ * UNIFORM and IMM files and 32B for all other files.
+ */
+inline unsigned
+regs_written(const fs_inst *inst)
+{
+ assert(inst->dst.file != UNIFORM && inst->dst.file != IMM); /* Destinations in these files are rejected, so REG_SIZE is the only unit used. */
+ return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE + /* start position within the first register */
+ inst->size_written -
+ MIN2(inst->size_written, reg_padding(inst->dst)), /* drop one padding amount, clamped so the subtraction cannot underflow */
+ REG_SIZE);
+}
+
+/**
+ * Return the number of dataflow registers read by the instruction (either
+ * fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
+ * register_size)'. The somewhat arbitrary register size unit is 4B for the
+ * UNIFORM and IMM files and 32B for all other files.
+ */
+inline unsigned
+regs_read(const fs_inst *inst, unsigned i)
+{
+ const unsigned reg_size = /* 4 bytes for UNIFORM/IMM sources, REG_SIZE otherwise */
+ inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 4 : REG_SIZE;
+ return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + /* start position within the first unit */
+ inst->size_read(i) -
+ MIN2(inst->size_read(i), reg_padding(inst->src[i])), /* drop one padding amount, clamped so the subtraction cannot underflow */
+ reg_size);
+}
+
+#endif
diff --git a/src/intel/compiler/brw_ir_vec4.h b/src/intel/compiler/brw_ir_vec4.h
new file mode 100644
index 00000000000..bd026eb2aeb
--- /dev/null
+++ b/src/intel/compiler/brw_ir_vec4.h
@@ -0,0 +1,409 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2011-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_IR_VEC4_H
+#define BRW_IR_VEC4_H
+
+#include "brw_shader.h"
+
+namespace brw {
+
+class dst_reg;
+
+class src_reg : public backend_reg /* Source operand type: a backend_reg plus a reladdr pointer. */
+{
+public:
+ DECLARE_RALLOC_CXX_OPERATORS(src_reg)
+
+ void init(); /* Declared only; the definition is not part of this header. */
+
+ src_reg(enum brw_reg_file file, int nr, const glsl_type *type);
+ src_reg();
+ src_reg(struct ::brw_reg reg);
+
+ bool equals(const src_reg &r) const;
+
+ src_reg(class vec4_visitor *v, const struct glsl_type *type);
+ src_reg(class vec4_visitor *v, const struct glsl_type *type, int size);
+
+ explicit src_reg(const dst_reg &reg); /* Conversion from a dst_reg must be spelled out explicitly. */
+
+ src_reg *reladdr; /* Pointer to another src_reg; is_uniform() below treats NULL as "no relative address". */
+};
+
+static inline src_reg
+retype(src_reg reg, enum brw_reg_type type)
+{ /* Return a copy of \p reg whose type field is replaced by \p type. */
+ reg.type = type;
+ return reg;
+}
+
+namespace detail {
+
+static inline void
+add_byte_offset(backend_reg *reg, unsigned bytes)
+{ /* Advance \p reg in place by \p bytes; shared by the src_reg and dst_reg byte_offset() helpers. */
+ switch (reg->file) {
+ case BAD_FILE: /* Left untouched. */
+ break;
+ case VGRF:
+ case ATTR:
+ case UNIFORM:
+ reg->offset += bytes;
+ assert(reg->offset % 16 == 0); /* The resulting offset must be a multiple of 16 bytes. */
+ break;
+ case MRF: {
+ const unsigned suboffset = reg->offset + bytes;
+ reg->nr += suboffset / REG_SIZE; /* Carry whole registers into nr... */
+ reg->offset = suboffset % REG_SIZE; /* ...and keep the remainder as the offset. */
+ assert(reg->offset % 16 == 0);
+ break;
+ }
+ case ARF:
+ case FIXED_GRF: { /* Same carry scheme as MRF, but based on subnr. */
+ const unsigned suboffset = reg->subnr + bytes;
+ reg->nr += suboffset / REG_SIZE;
+ reg->subnr = suboffset % REG_SIZE;
+ assert(reg->subnr % 16 == 0);
+ break;
+ }
+ default:
+ assert(bytes == 0); /* Any other file only accepts a zero offset. */
+ }
+}
+
+} /* namespace detail */
+
+static inline src_reg
+byte_offset(src_reg reg, unsigned bytes)
+{ /* Return a copy of \p reg advanced by \p bytes; the argument is taken by value, so the caller's register is not modified. */
+ detail::add_byte_offset(&reg, bytes);
+ return reg;
+}
+
+static inline src_reg
+offset(src_reg reg, unsigned width, unsigned delta)
+{ /* Advance \p reg by \p delta steps of num_components elements each. */
+ const unsigned stride = (reg.file == UNIFORM ? 0 : 4); /* 0 for UNIFORM, 4 for every other file */
+ const unsigned num_components = MAX2(width / 4 * stride, 4); /* Never less than 4, so UNIFORM always steps by 4 elements. */
+ return byte_offset(reg, num_components * type_sz(reg.type) * delta);
+}
+
+static inline src_reg
+horiz_offset(src_reg reg, unsigned delta)
+{ /* Advance \p reg by \p delta elements of its own type size. */
+ return byte_offset(reg, delta * type_sz(reg.type));
+}
+
+/**
+ * Reswizzle a given source register.
+ * \sa brw_swizzle().
+ */
+static inline src_reg
+swizzle(src_reg reg, unsigned swizzle)
+{
+ if (reg.file == IMM)
+ reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swizzle); /* Immediates: the stored value itself is rewritten. */
+ else
+ reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle); /* Otherwise combine with the register's existing swizzle. */
+
+ return reg;
+}
+
+static inline src_reg
+negate(src_reg reg)
+{ /* Return a copy of \p reg with its negate flag flipped. */
+ assert(reg.file != IMM); /* Immediates are rejected here. */
+ reg.negate = !reg.negate;
+ return reg;
+}
+
+static inline bool
+is_uniform(const src_reg &reg)
+{ /* True for IMM, UNIFORM or null registers, provided any reladdr is itself uniform. */
+ return (reg.file == IMM || reg.file == UNIFORM || reg.is_null()) &&
+ (!reg.reladdr || is_uniform(*reg.reladdr)); /* Recurses through the reladdr chain. */
+}
+
+class dst_reg : public backend_reg /* Destination operand type: a backend_reg plus a reladdr pointer. */
+{
+public:
+ DECLARE_RALLOC_CXX_OPERATORS(dst_reg)
+
+ void init(); /* Declared only; the definition is not part of this header. */
+
+ dst_reg();
+ dst_reg(enum brw_reg_file file, int nr);
+ dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
+ unsigned writemask);
+ dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
+ unsigned writemask);
+ dst_reg(struct ::brw_reg reg);
+ dst_reg(class vec4_visitor *v, const struct glsl_type *type);
+
+ explicit dst_reg(const src_reg &reg); /* Conversion from a src_reg must be spelled out explicitly. */
+
+ bool equals(const dst_reg &r) const;
+
+ src_reg *reladdr; /* Note the type: a pointer to a src_reg, not to a dst_reg. */
+};
+
+static inline dst_reg
+retype(dst_reg reg, enum brw_reg_type type)
+{ /* Return a copy of \p reg whose type field is replaced by \p type. */
+ reg.type = type;
+ return reg;
+}
+
+static inline dst_reg
+byte_offset(dst_reg reg, unsigned bytes)
+{ /* Return a copy of \p reg advanced by \p bytes via detail::add_byte_offset(). */
+ detail::add_byte_offset(&reg, bytes);
+ return reg;
+}
+
+static inline dst_reg
+offset(dst_reg reg, unsigned width, unsigned delta)
+{ /* Same arithmetic as the src_reg overload of offset(). */
+ const unsigned stride = (reg.file == UNIFORM ? 0 : 4); /* 0 for UNIFORM, 4 for every other file */
+ const unsigned num_components = MAX2(width / 4 * stride, 4); /* Never less than 4 elements per step. */
+ return byte_offset(reg, num_components * type_sz(reg.type) * delta);
+}
+
+static inline dst_reg
+horiz_offset(dst_reg reg, unsigned delta)
+{ /* Advance \p reg by \p delta elements of its own type size. */
+ return byte_offset(reg, delta * type_sz(reg.type));
+}
+
+static inline dst_reg
+writemask(dst_reg reg, unsigned mask)
+{ /* Return a copy of \p reg whose writemask is restricted to the bits also set in \p mask. */
+ assert(reg.file != IMM);
+ assert((reg.writemask & mask) != 0); /* The restricted mask must keep at least one bit. */
+ reg.writemask &= mask;
+ return reg;
+}
+
+/**
+ * Return an integer identifying the discrete address space a register is
+ * contained in. A register is by definition fully contained in the single
+ * reg_space it belongs to, so two registers with different reg_space ids are
+ * guaranteed not to overlap. Most register files are a single reg_space of
+ * their own; only the VGRF file is composed of multiple discrete address
+ * spaces, one for each VGRF allocation.
+ */
+static inline uint32_t
+reg_space(const backend_reg &r)
+{ /* Id = file shifted left by 16, OR'ed with nr for VGRF (0 for every other file). */
+ return r.file << 16 | (r.file == VGRF ? r.nr : 0);
+}
+
+/**
+ * Return the base offset in bytes of a register relative to the start of its
+ * reg_space().
+ */
+static inline unsigned
+reg_offset(const backend_reg &r)
+{ /* Sum of the scaled register number, the byte offset and (ARF/FIXED_GRF only) subnr. */
+ return (r.file == VGRF || r.file == IMM ? 0 : r.nr) * /* nr is ignored for VGRF and IMM */
+ (r.file == UNIFORM ? 16 : REG_SIZE) + r.offset + /* nr counts 16-byte units for UNIFORM, REG_SIZE units otherwise */
+ (r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
+}
+
+/**
+ * Return whether the register region starting at \p r and spanning \p dr
+ * bytes could potentially overlap the register region starting at \p s and
+ * spanning \p ds bytes.
+ */
+static inline bool
+regions_overlap(const backend_reg &r, unsigned dr,
+ const backend_reg &s, unsigned ds)
+{
+ if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) {
+ /* COMPR4 regions are translated by the hardware during decompression
+ * into two separate half-regions 4 MRFs apart from each other.
+ */
+ backend_reg t0 = r;
+ t0.nr &= ~BRW_MRF_COMPR4; /* First half: a copy of r with the COMPR4 bit cleared. */
+ backend_reg t1 = t0;
+ t1.offset += 4 * REG_SIZE; /* Second half: 4 registers further, expressed through the byte offset. */
+ return regions_overlap(t0, dr / 2, s, ds) ||
+ regions_overlap(t1, dr / 2, s, ds);
+
+ } else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) {
+ return regions_overlap(s, ds, r, dr); /* Swap the arguments so the COMPR4 case above handles s. */
+
+ } else {
+ return reg_space(r) == reg_space(s) && /* Different spaces never overlap. */
+ !(reg_offset(r) + dr <= reg_offset(s) || /* r ends at or before s starts */
+ reg_offset(s) + ds <= reg_offset(r)); /* s ends at or before r starts */
+ }
+}
+
+class vec4_instruction : public backend_instruction { /* Instruction type with one dst_reg and exactly three src_reg slots. */
+public:
+ DECLARE_RALLOC_CXX_OPERATORS(vec4_instruction)
+
+ vec4_instruction(enum opcode opcode, /* Operands that are omitted default to default-constructed registers. */
+ const dst_reg &dst = dst_reg(),
+ const src_reg &src0 = src_reg(),
+ const src_reg &src1 = src_reg(),
+ const src_reg &src2 = src_reg());
+
+ dst_reg dst;
+ src_reg src[3];
+
+ enum brw_urb_write_flags urb_write_flags;
+
+ unsigned sol_binding; /**< gen6: SOL binding table index */
+ bool sol_final_write; /**< gen6: send commit message */
+ unsigned sol_vertex; /**< gen6: used for setting dst index in SVB header */
+
+ bool is_send_from_grf(); /* The methods down to has_source_and_destination_hazard() are declarations only. */
+ unsigned size_read(unsigned arg) const;
+ bool can_reswizzle(const struct gen_device_info *devinfo, int dst_writemask,
+ int swizzle, int swizzle_mask);
+ void reswizzle(int dst_writemask, int swizzle);
+ bool can_do_source_mods(const struct gen_device_info *devinfo);
+ bool can_do_writemask(const struct gen_device_info *devinfo);
+ bool can_change_types() const;
+ bool has_source_and_destination_hazard() const;
+
+ bool is_align1_partial_write() /* True only for the two SET_{LOW,HIGH}_32BIT opcodes. */
+ {
+ return opcode == VEC4_OPCODE_SET_LOW_32BIT ||
+ opcode == VEC4_OPCODE_SET_HIGH_32BIT;
+ }
+
+ bool reads_flag() /* True when a predicate is set or the opcode is VS_OPCODE_UNPACK_FLAGS_SIMD4X2. */
+ {
+ return predicate || opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2;
+ }
+
+ bool reads_flag(unsigned c) /* Per-channel variant: is flag channel \p c read? */
+ {
+ if (opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2)
+ return true;
+
+ switch (predicate) {
+ case BRW_PREDICATE_NONE:
+ return false;
+ case BRW_PREDICATE_ALIGN16_REPLICATE_X: /* Each REPLICATE_* predicate reads a single channel. */
+ return c == 0;
+ case BRW_PREDICATE_ALIGN16_REPLICATE_Y:
+ return c == 1;
+ case BRW_PREDICATE_ALIGN16_REPLICATE_Z:
+ return c == 2;
+ case BRW_PREDICATE_ALIGN16_REPLICATE_W:
+ return c == 3;
+ default:
+ return true; /* Any other predicate is reported as reading every channel. */
+ }
+ }
+
+ bool writes_flag() /* True when a conditional modifier is set, except on SEL, IF and WHILE. */
+ {
+ return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
+ opcode != BRW_OPCODE_IF &&
+ opcode != BRW_OPCODE_WHILE));
+ }
+};
+
+/**
+ * Make the execution of \p inst dependent on the evaluation of a possibly
+ * inverted predicate.
+ */
+inline vec4_instruction *
+set_predicate_inv(enum brw_predicate pred, bool inverse,
+ vec4_instruction *inst)
+{ /* Modifies \p inst in place and returns the same pointer. */
+ inst->predicate = pred;
+ inst->predicate_inverse = inverse;
+ return inst;
+}
+
+/**
+ * Make the execution of \p inst dependent on the evaluation of a predicate.
+ */
+inline vec4_instruction *
+set_predicate(enum brw_predicate pred, vec4_instruction *inst)
+{ /* Shorthand for set_predicate_inv() with inverse = false. */
+ return set_predicate_inv(pred, false, inst);
+}
+
+/**
+ * Write the result of evaluating the condition given by \p mod to a flag
+ * register.
+ */
+inline vec4_instruction *
+set_condmod(enum brw_conditional_mod mod, vec4_instruction *inst)
+{ /* Stores \p mod in inst->conditional_mod and returns the same pointer. */
+ inst->conditional_mod = mod;
+ return inst;
+}
+
+/**
+ * Clamp the result of \p inst to the saturation range of its destination
+ * datatype.
+ */
+inline vec4_instruction *
+set_saturate(bool saturate, vec4_instruction *inst)
+{ /* Stores \p saturate in inst->saturate (false clears it) and returns the same pointer. */
+ inst->saturate = saturate;
+ return inst;
+}
+
+/**
+ * Return the number of dataflow registers written by the instruction (either
+ * fully or partially) counted from 'floor(reg_offset(inst->dst) /
+ * register_size)'. The somewhat arbitrary register size unit is 16B for the
+ * UNIFORM and IMM files and 32B for all other files.
+ */
+inline unsigned
+regs_written(const vec4_instruction *inst)
+{
+ assert(inst->dst.file != UNIFORM && inst->dst.file != IMM); /* Destinations in these files are rejected, so REG_SIZE is the only unit used. */
+ return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE + inst->size_written, /* start position within the first register plus bytes written */
+ REG_SIZE);
+}
+
+/**
+ * Return the number of dataflow registers read by the instruction (either
+ * fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
+ * register_size)'. The somewhat arbitrary register size unit is 16B for the
+ * UNIFORM and IMM files and 32B for all other files.
+ */
+inline unsigned
+regs_read(const vec4_instruction *inst, unsigned i)
+{
+ const unsigned reg_size = /* 16 bytes for UNIFORM/IMM sources, REG_SIZE otherwise */
+ inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 16 : REG_SIZE;
+ return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + inst->size_read(i), /* start position within the first unit plus bytes read */
+ reg_size);
+}
+
+} /* namespace brw */
+
+#endif
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
new file mode 100644
index 00000000000..f86308521e9
--- /dev/null
+++ b/src/intel/compiler/brw_nir.c
@@ -0,0 +1,764 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir.h"
+#include "brw_shader.h"
+#include "common/gen_debug.h"
+#include "compiler/glsl_types.h"
+#include "compiler/nir/nir_builder.h"
+
+static bool
+is_input(nir_intrinsic_instr *intrin)
+{ /* True when intrin is one of the three input-load intrinsics listed below. */
+ return intrin->intrinsic == nir_intrinsic_load_input ||
+ intrin->intrinsic == nir_intrinsic_load_per_vertex_input ||
+ intrin->intrinsic == nir_intrinsic_load_interpolated_input;
+}
+
+static bool
+is_output(nir_intrinsic_instr *intrin)
+{ /* True for output loads as well as output stores, plain or per-vertex. */
+ return intrin->intrinsic == nir_intrinsic_load_output ||
+ intrin->intrinsic == nir_intrinsic_load_per_vertex_output ||
+ intrin->intrinsic == nir_intrinsic_store_output ||
+ intrin->intrinsic == nir_intrinsic_store_per_vertex_output;
+}
+
+/**
+ * In many cases, we just add the base and offset together, so there's no
+ * reason to keep them separate. Sometimes, combining them is essential:
+ * if a shader only accesses part of a compound variable (such as a matrix
+ * or array), the variable's base may not actually exist in the VUE map.
+ *
+ * This pass adds constant offsets to instr->const_index[0], and resets
+ * the offset source to 0. Non-constant offsets remain unchanged - since
+ * we don't know what part of a compound variable is accessed, we allocate
+ * storage for the entire thing.
+ */
+
+static bool
+add_const_offset_to_base_block(nir_block *block, nir_builder *b,
+ nir_variable_mode mode)
+{ /* Apply the folding to every matching I/O intrinsic of one block. */
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+ if ((mode == nir_var_shader_in && is_input(intrin)) || /* Only intrinsics matching the requested mode are touched. */
+ (mode == nir_var_shader_out && is_output(intrin))) {
+ nir_src *offset = nir_get_io_offset_src(intrin);
+ nir_const_value *const_offset = nir_src_as_const_value(*offset);
+
+ if (const_offset) { /* Non-constant offsets are left as they are. */
+ intrin->const_index[0] += const_offset->u32[0]; /* Fold the constant into the base... */
+ b->cursor = nir_before_instr(&intrin->instr);
+ nir_instr_rewrite_src(&intrin->instr, offset, /* ...and replace the offset source with an immediate 0. */
+ nir_src_for_ssa(nir_imm_int(b, 0)));
+ }
+ }
+ }
+ return true; /* Unconditionally true. */
+}
+
+static void
+add_const_offset_to_base(nir_shader *nir, nir_variable_mode mode)
+{ /* Run add_const_offset_to_base_block() over every block of every function that has an implementation. */
+ nir_foreach_function(f, nir) {
+ if (f->impl) {
+ nir_builder b; /* One builder per function implementation. */
+ nir_builder_init(&b, f->impl);
+ nir_foreach_block(block, f->impl) {
+ add_const_offset_to_base_block(block, &b, mode); /* The return value is ignored. */
+ }
+ }
+ }
+}
+
+static bool
+remap_vs_attrs(nir_block *block, shader_info *nir_info)
+{ /* Rewrite the base of every load_input in the block from an attribute index to 4 * slot. */
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+ if (intrin->intrinsic == nir_intrinsic_load_input) {
+ /* Attributes come in a contiguous block, ordered by their
+ * gl_vert_attrib value. That means we can compute the slot
+ * number for an attribute by masking out the enabled attributes
+ * before it and counting the bits.
+ */
+ int attr = intrin->const_index[0];
+ int slot = _mesa_bitcount_64(nir_info->inputs_read & /* Count the bits of inputs_read selected by BITFIELD64_MASK(attr). */
+ BITFIELD64_MASK(attr));
+ intrin->const_index[0] = 4 * slot; /* The new base is four units per slot. */
+ }
+ }
+ return true; /* Unconditionally true. */
+}
+
+static bool
+remap_inputs_with_vue_map(nir_block *block, const struct brw_vue_map *vue_map)
+{ /* Replace the base of plain and per-vertex input loads with the slot given by vue_map. */
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+ if (intrin->intrinsic == nir_intrinsic_load_input || /* load_interpolated_input is not remapped here. */
+ intrin->intrinsic == nir_intrinsic_load_per_vertex_input) {
+ int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]]; /* The old base is used as the index into varying_to_slot. */
+ assert(vue_slot != -1); /* A -1 entry trips this assertion. */
+ intrin->const_index[0] = vue_slot;
+ }
+ }
+ return true; /* Unconditionally true. */
+}
+
+static bool
+remap_tess_levels(nir_builder *b, nir_intrinsic_instr *intr,
+ GLenum primitive_mode)
+{ /* Remap a tess-level access; returns false when intr addresses neither tess-level slot, true otherwise. */
+ const int location = nir_intrinsic_base(intr);
+ const unsigned component = nir_intrinsic_component(intr); /* Captured before any set_component() call below. */
+ bool out_of_bounds;
+
+ if (location == VARYING_SLOT_TESS_LEVEL_INNER) {
+ switch (primitive_mode) {
+ case GL_QUADS:
+ /* gl_TessLevelInner[0..1] lives at DWords 3-2 (reversed). */
+ nir_intrinsic_set_base(intr, 0);
+ nir_intrinsic_set_component(intr, 3 - component);
+ out_of_bounds = false;
+ break;
+ case GL_TRIANGLES:
+ /* gl_TessLevelInner[0] lives at DWord 4. */
+ nir_intrinsic_set_base(intr, 1);
+ out_of_bounds = component > 0;
+ break;
+ case GL_ISOLINES:
+ out_of_bounds = true; /* Every inner-level access is treated as out of bounds for isolines. */
+ break;
+ default:
+ unreachable("Bogus tessellation domain");
+ }
+ } else if (location == VARYING_SLOT_TESS_LEVEL_OUTER) {
+ if (primitive_mode == GL_ISOLINES) {
+ /* gl_TessLevelOuter[0..1] lives at DWords 6-7 (in order). */
+ nir_intrinsic_set_base(intr, 1);
+ nir_intrinsic_set_component(intr, 2 + nir_intrinsic_component(intr));
+ out_of_bounds = component > 1;
+ } else {
+ /* Triangles use DWords 7-5 (reversed); Quads use 7-4 (reversed) */
+ nir_intrinsic_set_base(intr, 1);
+ nir_intrinsic_set_component(intr, 3 - nir_intrinsic_component(intr));
+ out_of_bounds = component == 3 && primitive_mode == GL_TRIANGLES;
+ }
+ } else {
+ return false; /* Not a tess-level slot: the instruction is left untouched. */
+ }
+
+ if (out_of_bounds) {
+ if (nir_intrinsic_infos[intr->intrinsic].has_dest) { /* For intrinsics with a destination, redirect its uses to an undef value. */
+ b->cursor = nir_before_instr(&intr->instr);
+ nir_ssa_def *undef = nir_ssa_undef(b, 1, 32);
+ nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(undef));
+ }
+ nir_instr_remove(&intr->instr); /* The out-of-bounds access itself is removed. */
+ }
+
+ return true;
+}
+
+static bool
+remap_patch_urb_offsets(nir_block *block, nir_builder *b,
+ const struct brw_vue_map *vue_map,
+ GLenum tes_primitive_mode)
+{ /* Remap TCS outputs / TES inputs of one block to VUE slots and fold the vertex index into the access. */
+ const bool is_passthrough_tcs = b->shader->info->name && /* Shader whose name is exactly "passthrough". */
+ strcmp(b->shader->info->name, "passthrough") == 0;
+
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+ gl_shader_stage stage = b->shader->stage;
+
+ if ((stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) ||
+ (stage == MESA_SHADER_TESS_EVAL && is_input(intrin))) {
+
+ if (!is_passthrough_tcs && /* Tess-level remapping is skipped for the passthrough shader. */
+ remap_tess_levels(b, intrin, tes_primitive_mode))
+ continue; /* Handled (and possibly removed) by remap_tess_levels(). */
+
+ int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]];
+ assert(vue_slot != -1);
+ intrin->const_index[0] = vue_slot;
+
+ nir_src *vertex = nir_get_io_vertex_index_src(intrin);
+ if (vertex) {
+ nir_const_value *const_vertex = nir_src_as_const_value(*vertex);
+ if (const_vertex) { /* Constant vertex index: fold it straight into the base. */
+ intrin->const_index[0] += const_vertex->u32[0] *
+ vue_map->num_per_vertex_slots;
+ } else { /* Otherwise emit the multiply/add in front of the intrinsic. */
+ b->cursor = nir_before_instr(&intrin->instr);
+
+ /* Multiply by the number of per-vertex slots. */
+ nir_ssa_def *vertex_offset =
+ nir_imul(b,
+ nir_ssa_for_src(b, *vertex, 1),
+ nir_imm_int(b,
+ vue_map->num_per_vertex_slots));
+
+ /* Add it to the existing offset */
+ nir_src *offset = nir_get_io_offset_src(intrin);
+ nir_ssa_def *total_offset =
+ nir_iadd(b, vertex_offset,
+ nir_ssa_for_src(b, *offset, 1));
+
+ nir_instr_rewrite_src(&intrin->instr, offset,
+ nir_src_for_ssa(total_offset));
+ }
+ }
+ }
+ }
+ return true; /* Unconditionally true. */
+}
+
+void
+brw_nir_lower_vs_inputs(nir_shader *nir,
+ bool is_scalar,
+ bool use_legacy_snorm_formula,
+ const uint8_t *vs_attrib_wa_flags)
+{ /* Lower the inputs of \p nir: I/O lowering, offset folding, attribute workarounds and (scalar only) slot remapping. */
+ /* Start with the location of the variable's base. */
+ foreach_list_typed(nir_variable, var, node, &nir->inputs) {
+ var->data.driver_location = var->data.location;
+ }
+
+ /* Now use nir_lower_io to walk dereference chains. Attribute arrays are
+ * loaded as one vec4 or dvec4 per element (or matrix column), depending on
+ * whether it is a double-precision type or not.
+ */
+ nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0);
+
+ /* This pass needs actual constants */
+ nir_opt_constant_folding(nir);
+
+ add_const_offset_to_base(nir, nir_var_shader_in);
+
+ brw_nir_apply_attribute_workarounds(nir, use_legacy_snorm_formula,
+ vs_attrib_wa_flags);
+
+ if (is_scalar) { /* The remapping below is skipped when is_scalar is false. */
+ /* Finally, translate VERT_ATTRIB_* values into the actual registers. */
+
+ nir_foreach_function(function, nir) {
+ if (function->impl) {
+ nir_foreach_block(block, function->impl) {
+ remap_vs_attrs(block, nir->info);
+ }
+ }
+ }
+ }
+}
+
+/* Lowers VUE-fed shader inputs to load intrinsics and, except for a geometry
+ * shader on the non-scalar backend, remaps them through vue_map.
+ */
+void
+brw_nir_lower_vue_inputs(nir_shader *nir, bool is_scalar,
+                         const struct brw_vue_map *vue_map)
+{
+   nir_foreach_variable(var, &nir->inputs) {
+      var->data.driver_location = var->data.location;
+   }
+
+   /* Inputs live in vec4 slots, hence type_size_vec4(). */
+   nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0);
+
+   /* A geometry shader on the non-scalar backend is done at this point. */
+   if (!is_scalar && nir->stage == MESA_SHADER_GEOMETRY)
+      return;
+
+   /* The pass below needs actual constants, so fold them first. */
+   nir_opt_constant_folding(nir);
+   add_const_offset_to_base(nir, nir_var_shader_in);
+
+   nir_foreach_function(function, nir) {
+      if (!function->impl)
+         continue;
+
+      nir_foreach_block(block, function->impl) {
+         remap_inputs_with_vue_map(block, vue_map);
+      }
+   }
+}
+
+/* Lowers tessellation evaluation shader inputs to load intrinsics and remaps
+ * them onto the patch URB layout, using the shader's own tessellation
+ * primitive mode.
+ */
+void
+brw_nir_lower_tes_inputs(nir_shader *nir, const struct brw_vue_map *vue_map)
+{
+   nir_foreach_variable(var, &nir->inputs) {
+      var->data.driver_location = var->data.location;
+   }
+
+   nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0);
+
+   /* The pass below needs actual constants, so fold them first. */
+   nir_opt_constant_folding(nir);
+   add_const_offset_to_base(nir, nir_var_shader_in);
+
+   nir_foreach_function(function, nir) {
+      if (!function->impl)
+         continue;
+
+      nir_builder builder;
+      nir_builder_init(&builder, function->impl);
+
+      nir_foreach_block(block, function->impl) {
+         remap_patch_urb_offsets(block, &builder, vue_map,
+                                 nir->info->tess.primitive_mode);
+      }
+   }
+}
+
+/* Fixes up the interpolation qualifiers of fragment shader inputs according
+ * to the program key and hardware generation, then lowers the inputs to
+ * load intrinsics.
+ */
+void
+brw_nir_lower_fs_inputs(nir_shader *nir,
+                        const struct gen_device_info *devinfo,
+                        const struct brw_wm_prog_key *key)
+{
+   nir_foreach_variable(var, &nir->inputs) {
+      var->data.driver_location = var->data.location;
+
+      /* Pick the default interpolation mode where none was requested.
+       *
+       * The default is smooth, except that the legacy GL color built-ins
+       * become flat when the API state asks for flat shading.
+       */
+      if (var->data.interpolation == INTERP_MODE_NONE) {
+         if (key->flat_shade &&
+             (var->data.location == VARYING_SLOT_COL0 ||
+              var->data.location == VARYING_SLOT_COL1)) {
+            var->data.interpolation = INTERP_MODE_FLAT;
+         } else {
+            var->data.interpolation = INTERP_MODE_SMOOTH;
+         }
+      }
+
+      /* Ironlake and older have a single interpolation mode.  Centroid
+       * interpolation is meaningless there because the hardware has no
+       * multisampling.
+       */
+      if (devinfo->gen < 6) {
+         var->data.centroid = false;
+         var->data.sample = false;
+      }
+   }
+
+   const nir_lower_io_options lower_io_options =
+      key->persample_interp ? nir_lower_io_force_sample_interpolation : 0;
+
+   nir_lower_io(nir, nir_var_shader_in, type_size_vec4, lower_io_options);
+
+   /* The pass below needs actual constants, so fold them first. */
+   nir_opt_constant_folding(nir);
+   add_const_offset_to_base(nir, nir_var_shader_in);
+}
+
+/* Lowers VUE output variables to store intrinsics, one vec4 slot per
+ * location.  is_scalar is accepted but not consulted here.
+ */
+void
+brw_nir_lower_vue_outputs(nir_shader *nir,
+                          bool is_scalar)
+{
+   /* Outputs keep their variable location as driver location. */
+   nir_foreach_variable(var, &nir->outputs) {
+      var->data.driver_location = var->data.location;
+   }
+
+   nir_lower_io(nir, nir_var_shader_out, type_size_vec4, 0);
+}
+
+/* Lowers tessellation control shader outputs to store/load intrinsics and
+ * remaps them onto the patch URB layout for the given TES primitive mode.
+ */
+void
+brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue_map,
+                          GLenum tes_primitive_mode)
+{
+   nir_foreach_variable(var, &nir->outputs) {
+      var->data.driver_location = var->data.location;
+   }
+
+   nir_lower_io(nir, nir_var_shader_out, type_size_vec4, 0);
+
+   /* The pass below needs actual constants, so fold them first. */
+   nir_opt_constant_folding(nir);
+   add_const_offset_to_base(nir, nir_var_shader_out);
+
+   nir_foreach_function(function, nir) {
+      if (!function->impl)
+         continue;
+
+      nir_builder builder;
+      nir_builder_init(&builder, function->impl);
+
+      nir_foreach_block(block, function->impl) {
+         remap_patch_urb_offsets(block, &builder, vue_map,
+                                 tes_primitive_mode);
+      }
+   }
+}
+
+/* Lowers fragment shader outputs to store intrinsics.  The driver location
+ * of each output packs both its index and its location, using the
+ * BRW_NIR_FRAG_OUTPUT_INDEX and BRW_NIR_FRAG_OUTPUT_LOCATION fields.
+ */
+void
+brw_nir_lower_fs_outputs(nir_shader *nir)
+{
+   nir_foreach_variable(var, &nir->outputs) {
+      var->data.driver_location =
+         SET_FIELD(var->data.location, BRW_NIR_FRAG_OUTPUT_LOCATION) |
+         SET_FIELD(var->data.index, BRW_NIR_FRAG_OUTPUT_INDEX);
+   }
+
+   nir_lower_io(nir, nir_var_shader_out, type_size_dvec4, 0);
+}
+/*
+ * Assigns locations to the shader's shared variables and then lowers their
+ * accesses with nir_lower_io. Both steps use type_size_scalar_bytes as the
+ * size callback.
+ */
+void
+brw_nir_lower_cs_shared(nir_shader *nir)
+{
+ nir_assign_var_locations(&nir->shared, &nir->num_shared,
+ type_size_scalar_bytes);
+ nir_lower_io(nir, nir_var_shared, type_size_scalar_bytes, 0);
+}
+/*
+ * OPT(pass, ...) runs a NIR pass on the local `nir` through NIR_PASS, sets
+ * the local `progress` to true if that pass reported progress, and evaluates
+ * to the pass's own progress. Both `nir` and `progress` must be in scope
+ * where the macro is used. It is written as a ({ ... }) statement
+ * expression.
+ */
+#define OPT(pass, ...) ({ \
+ bool this_progress = false; \
+ NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
+ if (this_progress) \
+ progress = true; \
+ this_progress; \
+})
+/* OPT_V(pass, ...) runs a pass on `nir` without recording any progress. */
+#define OPT_V(pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
+/*
+ * Runs the core NIR optimization loop until an iteration makes no progress.
+ *
+ * When is_scalar is set, ALU instructions and phis are additionally
+ * scalarized on every iteration. Loop unrolling only runs when
+ * nir->options->max_unroll_iterations is non-zero; it is handed a mask of
+ * the variable modes for which the stage's glsl_compiler_options set
+ * EmitNoIndirectInput / EmitNoIndirectOutput / EmitNoIndirectTemp.
+ *
+ * Passes invoked through OPT_V do not feed the progress flag. Returns nir.
+ */
+static nir_shader *
+nir_optimize(nir_shader *nir, const struct brw_compiler *compiler,
+ bool is_scalar)
+{
+ nir_variable_mode indirect_mask = 0;
+ if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectInput)
+ indirect_mask |= nir_var_shader_in;
+ if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectOutput)
+ indirect_mask |= nir_var_shader_out;
+ if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectTemp)
+ indirect_mask |= nir_var_local;
+
+ bool progress;
+ do {
+ progress = false;
+ OPT_V(nir_lower_vars_to_ssa);
+ OPT(nir_opt_copy_prop_vars);
+
+ if (is_scalar) {
+ OPT(nir_lower_alu_to_scalar);
+ }
+
+ OPT(nir_copy_prop);
+
+ if (is_scalar) {
+ OPT(nir_lower_phis_to_scalar);
+ }
+
+ OPT(nir_copy_prop);
+ OPT(nir_opt_dce);
+ OPT(nir_opt_cse);
+ OPT(nir_opt_peephole_select, 0);
+ OPT(nir_opt_algebraic);
+ OPT(nir_opt_constant_folding);
+ OPT(nir_opt_dead_cf);
+ if (OPT(nir_opt_trivial_continues)) {
+ /* If nir_opt_trivial_continues makes progress, then we need to clean
+ * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
+ * to make progress.
+ */
+ OPT(nir_copy_prop);
+ OPT(nir_opt_dce);
+ }
+ OPT(nir_opt_if);
+ if (nir->options->max_unroll_iterations != 0) {
+ OPT(nir_opt_loop_unroll, indirect_mask);
+ }
+ OPT(nir_opt_remove_phis);
+ OPT(nir_opt_undef);
+ OPT_V(nir_lower_doubles, nir_lower_drcp |
+ nir_lower_dsqrt |
+ nir_lower_drsq |
+ nir_lower_dtrunc |
+ nir_lower_dfloor |
+ nir_lower_dceil |
+ nir_lower_dfract |
+ nir_lower_dround_even |
+ nir_lower_dmod);
+ OPT_V(nir_lower_64bit_pack);
+ } while (progress);
+
+ return nir;
+}
+
+/* Does some simple lowering and runs the standard suite of optimizations
+ *
+ * This is intended to be called more-or-less directly after you get the
+ * shader out of GLSL or some other source. While it is geared towards i965,
+ * it is not at all generator-specific except for the is_scalar flag. Even
+ * there, it is safe to call with is_scalar = false for a shader that is
+ * intended for the FS backend as long as nir_optimize is called again with
+ * is_scalar = true to scalarize everything prior to code gen.
+ *
+ * Here is_scalar is not a parameter: it is read from
+ * compiler->scalar_stage[nir->stage]. The shader is optimized twice, once
+ * before and once after the variable-copy, indirect-deref and 64-bit
+ * integer lowering. Returns the shader pointer produced by the last
+ * nir_optimize() call.
+ */
+nir_shader *
+brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir)
+{
+ const struct gen_device_info *devinfo = compiler->devinfo;
+ bool progress; /* Written by OPT; OPT_V does not touch it */
+ (void)progress;
+
+ const bool is_scalar = compiler->scalar_stage[nir->stage];
+
+ if (nir->stage == MESA_SHADER_GEOMETRY)
+ OPT(nir_lower_gs_intrinsics);
+
+ /* See also brw_nir_trig_workarounds.py */
+ if (compiler->precise_trig &&
+ !(devinfo->gen >= 10 || devinfo->is_kabylake))
+ OPT(brw_nir_apply_trig_workarounds);
+
+ static const nir_lower_tex_options tex_options = {
+ .lower_txp = ~0,
+ .lower_txf_offset = true,
+ .lower_rect_offset = true,
+ .lower_txd_cube_map = true,
+ };
+
+ OPT(nir_lower_tex, &tex_options);
+ OPT(nir_normalize_cubemap_coords);
+
+ OPT(nir_lower_global_vars_to_local);
+
+ OPT(nir_split_var_copies);
+
+ nir = nir_optimize(nir, compiler, is_scalar);
+
+ if (is_scalar) {
+ OPT_V(nir_lower_load_const_to_scalar);
+ }
+
+ /* Lower a bunch of stuff */
+ OPT_V(nir_lower_var_copies);
+
+ OPT_V(nir_lower_clip_cull_distance_arrays);
+
+ nir_variable_mode indirect_mask = 0;
+ if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectInput)
+ indirect_mask |= nir_var_shader_in;
+ if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectOutput)
+ indirect_mask |= nir_var_shader_out;
+ if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectTemp)
+ indirect_mask |= nir_var_local;
+
+ nir_lower_indirect_derefs(nir, indirect_mask);
+
+ nir_lower_int64(nir, nir_lower_imul64 |
+ nir_lower_isign64 |
+ nir_lower_divmod64);
+
+ /* Get rid of split copies */
+ nir = nir_optimize(nir, compiler, is_scalar);
+
+ OPT(nir_remove_dead_variables, nir_var_local);
+
+ return nir;
+}
+
+/* Prepare the given shader for codegen
+ *
+ * This function is intended to be called right before going into the actual
+ * backend and is highly backend-specific. Also, once this function has been
+ * called on a shader, it will no longer be in SSA form so most optimizations
+ * will not work.
+ *
+ * When the INTEL_DEBUG flag for the shader's stage is set, the shader is
+ * printed to stderr twice: once while still in SSA form and once in its
+ * final form. On Gen <= 5 the boolean resolve analysis runs as the very
+ * last pass. Returns the shader pointer produced by nir_optimize().
+ */
+nir_shader *
+brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
+ bool is_scalar)
+{
+ const struct gen_device_info *devinfo = compiler->devinfo;
+ bool debug_enabled =
+ (INTEL_DEBUG & intel_debug_flag_for_shader_stage(nir->stage));
+
+ bool progress; /* Written by OPT; OPT_V does not touch it */
+ (void)progress;
+
+ nir = nir_optimize(nir, compiler, is_scalar);
+
+ if (devinfo->gen >= 6) {
+ /* Try and fuse multiply-adds */
+ OPT(brw_nir_opt_peephole_ffma);
+ }
+
+ OPT(nir_opt_algebraic_late);
+
+ OPT_V(nir_lower_to_source_mods);
+ OPT(nir_copy_prop);
+ OPT(nir_opt_dce);
+ OPT(nir_opt_move_comparisons);
+
+ OPT(nir_lower_locals_to_regs);
+
+ if (unlikely(debug_enabled)) {
+ /* Re-index SSA defs so we print more sensible numbers. */
+ nir_foreach_function(function, nir) {
+ if (function->impl)
+ nir_index_ssa_defs(function->impl);
+ }
+
+ fprintf(stderr, "NIR (SSA form) for %s shader:\n",
+ _mesa_shader_stage_to_string(nir->stage));
+ nir_print_shader(nir, stderr);
+ }
+
+ OPT_V(nir_convert_from_ssa, true);
+
+ if (!is_scalar) {
+ OPT_V(nir_move_vec_src_uses_to_dest);
+ OPT(nir_lower_vec_to_movs);
+ }
+
+ /* This is the last pass we run before we start emitting stuff. It
+ * determines when we need to insert boolean resolves on Gen <= 5. We
+ * run it last because it stashes data in instr->pass_flags and we don't
+ * want that to be squashed by other NIR passes.
+ */
+ if (devinfo->gen <= 5)
+ brw_nir_analyze_boolean_resolves(nir);
+
+ nir_sweep(nir);
+
+ if (unlikely(debug_enabled)) {
+ fprintf(stderr, "NIR (final form) for %s shader:\n",
+ _mesa_shader_stage_to_string(nir->stage));
+ nir_print_shader(nir, stderr);
+ }
+
+ return nir;
+}
+
+/* Applies the sampler part of a program key to the shader.
+ *
+ * Builds a nir_lower_tex_options from the hardware generation and from
+ * key_tex (GL_CLAMP emulation masks, per-sampler swizzles, external image
+ * masks) and runs nir_lower_tex with it.  If that pass reports progress the
+ * shader is validated and re-optimized.  Returns the resulting shader.
+ */
+nir_shader *
+brw_nir_apply_sampler_key(nir_shader *nir,
+                          const struct brw_compiler *compiler,
+                          const struct brw_sampler_prog_key_data *key_tex,
+                          bool is_scalar)
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+   nir_lower_tex_options tex_options = { 0 };
+
+   /* Iron Lake and prior require lowering of all rectangle textures */
+   if (devinfo->gen < 6)
+      tex_options.lower_rect = true;
+
+   /* Prior to Broadwell, our hardware can't actually do GL_CLAMP */
+   if (devinfo->gen < 8) {
+      tex_options.saturate_s = key_tex->gl_clamp_mask[0];
+      tex_options.saturate_t = key_tex->gl_clamp_mask[1];
+      tex_options.saturate_r = key_tex->gl_clamp_mask[2];
+   }
+
+   /* Prior to Haswell, we have to fake texture swizzle.  The loop itself is
+    * not gated on the generation: it acts on every sampler whose key
+    * swizzle is not SWIZZLE_NOOP.
+    */
+   for (unsigned s = 0; s < MAX_SAMPLERS; s++) {
+      if (key_tex->swizzles[s] == SWIZZLE_NOOP)
+         continue;
+
+      /* Shift an unsigned one: with a signed 1, s == 31 would shift into
+       * the sign bit, which is undefined behaviour in C.
+       */
+      tex_options.swizzle_result |= (1u << s);
+      for (unsigned c = 0; c < 4; c++)
+         tex_options.swizzles[s][c] = GET_SWZ(key_tex->swizzles[s], c);
+   }
+
+   /* Prior to Haswell, we have to lower gradients on shadow samplers */
+   tex_options.lower_txd_shadow = devinfo->gen < 8 && !devinfo->is_haswell;
+
+   tex_options.lower_y_uv_external = key_tex->y_uv_image_mask;
+   tex_options.lower_y_u_v_external = key_tex->y_u_v_image_mask;
+   tex_options.lower_yx_xuxv_external = key_tex->yx_xuxv_image_mask;
+
+   if (nir_lower_tex(nir, &tex_options)) {
+      nir_validate_shader(nir);
+      nir = nir_optimize(nir, compiler, is_scalar);
+   }
+
+   return nir;
+}
+
+/* Maps a NIR ALU type onto the BRW register type used to hold it.  64-bit
+ * integer types depend on the hardware generation: before Gen8 they are
+ * given the DF register type.
+ */
+enum brw_reg_type
+brw_type_for_nir_type(const struct gen_device_info *devinfo, nir_alu_type type)
+{
+   switch (type) {
+   case nir_type_float:
+   case nir_type_float32:
+      return BRW_REGISTER_TYPE_F;
+
+   case nir_type_float64:
+      return BRW_REGISTER_TYPE_DF;
+
+   case nir_type_int:
+   case nir_type_int32:
+   case nir_type_bool:
+   case nir_type_bool32:
+      return BRW_REGISTER_TYPE_D;
+
+   case nir_type_uint:
+   case nir_type_uint32:
+      return BRW_REGISTER_TYPE_UD;
+
+   case nir_type_int64:
+      return devinfo->gen >= 8 ? BRW_REGISTER_TYPE_Q : BRW_REGISTER_TYPE_DF;
+
+   case nir_type_uint64:
+      return devinfo->gen >= 8 ? BRW_REGISTER_TYPE_UQ : BRW_REGISTER_TYPE_DF;
+
+   default:
+      unreachable("unknown type");
+   }
+
+   return BRW_REGISTER_TYPE_F;
+}
+
+/* Translates a nir_alu_type into the matching glsl_base_type.
+ * Shared by brw_vec4_nir and brw_fs_nir.  Only the float, double, int and
+ * uint types are handled; anything else is treated as unreachable.
+ */
+enum glsl_base_type
+brw_glsl_base_type_for_nir_type(nir_alu_type type)
+{
+   switch (type) {
+   case nir_type_int:
+   case nir_type_int32:
+      return GLSL_TYPE_INT;
+
+   case nir_type_uint:
+   case nir_type_uint32:
+      return GLSL_TYPE_UINT;
+
+   case nir_type_float:
+   case nir_type_float32:
+      return GLSL_TYPE_FLOAT;
+
+   case nir_type_float64:
+      return GLSL_TYPE_DOUBLE;
+
+   default:
+      unreachable("bad type");
+   }
+}
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
new file mode 100644
index 00000000000..76d7ec89f9b
--- /dev/null
+++ b/src/intel/compiler/brw_nir.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "brw_reg.h"
+#include "compiler/nir/nir.h"
+#include "brw_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Type-size callbacks, passed e.g. to nir_lower_io() as the size function.
+ * NOTE(review): presumably they count scalar, vec4 and dvec4 slots
+ * respectively -- confirm against the definitions.
+ */
+int type_size_scalar(const struct glsl_type *type);
+int type_size_vec4(const struct glsl_type *type);
+int type_size_dvec4(const struct glsl_type *type);
+
+/* type_size_scalar() scaled by 4, i.e. four bytes per counted scalar. */
+static inline int
+type_size_scalar_bytes(const struct glsl_type *type)
+{
+   const int scalar_count = type_size_scalar(type);
+
+   return 4 * scalar_count;
+}
+
+/* type_size_vec4() scaled by 16, i.e. sixteen bytes per counted vec4. */
+static inline int
+type_size_vec4_bytes(const struct glsl_type *type)
+{
+   const int vec4_count = type_size_vec4(type);
+
+   return 16 * vec4_count;
+}
+
+/* Flags set in the instr->pass_flags field by i965 analysis passes.
+ *
+ * The boolean resolve status occupies the low two bits
+ * (BRW_NIR_BOOLEAN_MASK) and takes exactly one of the four values below.
+ */
+enum {
+ BRW_NIR_NON_BOOLEAN = 0x0,
+
+ /* Indicates that the given instruction's destination is a boolean
+ * value but that it needs to be resolved before it can be used.
+ * On Gen <= 5, CMP instructions return a 32-bit value where the bottom
+ * bit represents the actual true/false value of the compare and the top
+ * 31 bits are undefined. In order to use this value, we have to do a
+ * "resolve" operation by replacing the value of the CMP with -(x & 1)
+ * to sign-extend the bottom bit to 0/~0.
+ */
+ BRW_NIR_BOOLEAN_NEEDS_RESOLVE = 0x1,
+
+ /* Indicates that the given instruction's destination is a boolean
+ * value that has intentionally been left unresolved. Not all boolean
+ * values need to be resolved immediately. For instance, if we have
+ *
+ * CMP r1 r2 r3
+ * CMP r4 r5 r6
+ * AND r7 r1 r4
+ *
+ * We don't have to resolve the result of the two CMP instructions
+ * immediately because the AND still does an AND of the bottom bits.
+ * Instead, we can save ourselves instructions by delaying the resolve
+ * until after the AND. The result of the two CMP instructions is left
+ * as BRW_NIR_BOOLEAN_UNRESOLVED.
+ */
+ BRW_NIR_BOOLEAN_UNRESOLVED = 0x2,
+
+ /* Indicates that the given instruction's destination is a boolean
+ * value that does not need a resolve. For instance, if you AND two
+ * values that are BRW_NIR_BOOLEAN_NEEDS_RESOLVE then we know that both
+ * values will be 0/~0 before we get them and the result of the AND is
+ * also guaranteed to be 0/~0 and does not need a resolve.
+ */
+ BRW_NIR_BOOLEAN_NO_RESOLVE = 0x3,
+
+ /* A mask to mask the boolean status values off of instr->pass_flags */
+ BRW_NIR_BOOLEAN_MASK = 0x3,
+};
+
+/* Analysis pass that tags instr->pass_flags with a BRW_NIR_BOOLEAN_*
+ * resolve status; used on Gen <= 5.
+ */
+void brw_nir_analyze_boolean_resolves(nir_shader *shader);
+
+/* Simple lowering plus the standard optimization suite; meant to run right
+ * after the shader has been produced.
+ */
+nir_shader *brw_preprocess_nir(const struct brw_compiler *compiler,
+                               nir_shader *nir);
+
+/* Per-stage I/O lowering entry points. */
+bool brw_nir_lower_intrinsics(nir_shader *nir,
+                              struct brw_stage_prog_data *prog_data);
+void brw_nir_lower_vs_inputs(nir_shader *nir,
+                             bool is_scalar,
+                             bool use_legacy_snorm_formula,
+                             const uint8_t *vs_attrib_wa_flags);
+void brw_nir_lower_vue_inputs(nir_shader *nir, bool is_scalar,
+                              const struct brw_vue_map *vue_map);
+void brw_nir_lower_tes_inputs(nir_shader *nir,
+                              const struct brw_vue_map *vue_map);
+void brw_nir_lower_fs_inputs(nir_shader *nir,
+                             const struct gen_device_info *devinfo,
+                             const struct brw_wm_prog_key *key);
+void brw_nir_lower_vue_outputs(nir_shader *nir, bool is_scalar);
+void brw_nir_lower_tcs_outputs(nir_shader *nir,
+                               const struct brw_vue_map *vue_map,
+                               GLenum tes_primitive_mode);
+void brw_nir_lower_fs_outputs(nir_shader *nir);
+void brw_nir_lower_cs_shared(nir_shader *nir);
+
+/* Final, backend-specific preparation; the shader leaves SSA form here. */
+nir_shader *brw_postprocess_nir(nir_shader *nir,
+                                const struct brw_compiler *compiler,
+                                bool is_scalar);
+
+/* Returns true if any load_input was rewritten. */
+bool brw_nir_apply_attribute_workarounds(nir_shader *shader,
+                                         bool use_legacy_snorm_formula,
+                                         const uint8_t *attrib_wa_flags);
+
+bool brw_nir_apply_trig_workarounds(nir_shader *nir);
+
+void brw_nir_apply_tcs_quads_workaround(nir_shader *nir);
+
+/* Runs nir_lower_tex with options derived from the sampler key. */
+nir_shader *brw_nir_apply_sampler_key(nir_shader *nir,
+                                      const struct brw_compiler *compiler,
+                                      const struct brw_sampler_prog_key_data *key_tex,
+                                      bool is_scalar);
+
+enum brw_reg_type brw_type_for_nir_type(const struct gen_device_info *devinfo,
+                                        nir_alu_type type);
+
+enum glsl_base_type brw_glsl_base_type_for_nir_type(nir_alu_type type);
+
+void brw_nir_setup_glsl_uniforms(nir_shader *shader,
+                                 const struct gl_program *prog,
+                                 struct brw_stage_prog_data *stage_prog_data,
+                                 bool is_scalar);
+
+void brw_nir_setup_arb_uniforms(nir_shader *shader, struct gl_program *prog,
+                                struct brw_stage_prog_data *stage_prog_data);
+
+bool brw_nir_opt_peephole_ffma(nir_shader *shader);
+/* Field layout of var->data.driver_location for fragment shader outputs:
+ * the index sits at shift 0 and the location at shift 1.
+ * NOTE(review): INTEL_MASK presumably takes (high, low) bit positions,
+ * giving bit 0 and bits 31:1 -- confirm against its definition.
+ */
+#define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0
+#define BRW_NIR_FRAG_OUTPUT_INDEX_MASK INTEL_MASK(0, 0)
+#define BRW_NIR_FRAG_OUTPUT_LOCATION_SHIFT 1
+#define BRW_NIR_FRAG_OUTPUT_LOCATION_MASK INTEL_MASK(31, 1)
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/intel/compiler/brw_nir_analyze_boolean_resolves.c b/src/intel/compiler/brw_nir_analyze_boolean_resolves.c
new file mode 100644
index 00000000000..4ad26e21103
--- /dev/null
+++ b/src/intel/compiler/brw_nir_analyze_boolean_resolves.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Jason Ekstrand <[email protected]>
+ */
+
+#include "brw_nir.h"
+
+/*
+ * This file implements an analysis pass that determines when we have to do
+ * a boolean resolve on Gen <= 5. Instructions that need a boolean resolve
+ * will have the booleans portion of the instr->pass_flags field set to
+ * BRW_NIR_BOOLEAN_NEEDS_RESOLVE.
+ */
+
+
+/** Looks up the resolve status of a source
+ *
+ * For an SSA source the status is read from the pass_flags of the
+ * instruction that produced it, with NEEDS_RESOLVE reported as NO_RESOLVE.
+ * A non-SSA source has no single producer to consult, so it is reported as
+ * NON_BOOLEAN.
+ */
+static uint8_t
+get_resolve_status_for_src(nir_src *src)
+{
+   if (!src->is_ssa)
+      return BRW_NIR_NON_BOOLEAN;
+
+   const nir_instr *producer = src->ssa->parent_instr;
+   const uint8_t status = producer->pass_flags & BRW_NIR_BOOLEAN_MASK;
+
+   /* A producer that needs a resolve is, from the point of view of whoever
+    * reads its value, a true boolean.
+    */
+   return status == BRW_NIR_BOOLEAN_NEEDS_RESOLVE ?
+          BRW_NIR_BOOLEAN_NO_RESOLVE : status;
+}
+
+/** Marks the given source as needing a resolve
+ *
+ * If the given source corresponds to an unresolved boolean it marks it as
+ * needing a resolve. Otherwise, we leave it alone.
+ */
+static bool
+src_mark_needs_resolve(nir_src *src, void *void_state)
+{
+ if (src->is_ssa) {
+ nir_instr *src_instr = src->ssa->parent_instr;
+ uint8_t resolve_status = src_instr->pass_flags & BRW_NIR_BOOLEAN_MASK;
+
+ /* If the source instruction is unresolved, then mark it as needing
+ * to be resolved.
+ */
+ if (resolve_status == BRW_NIR_BOOLEAN_UNRESOLVED) {
+ src_instr->pass_flags &= ~BRW_NIR_BOOLEAN_MASK;
+ src_instr->pass_flags |= BRW_NIR_BOOLEAN_NEEDS_RESOLVE;
+ }
+
+ }
+
+ return true;
+}
+/*
+ * Computes the boolean resolve status of every instruction in a block and
+ * stores it in the BRW_NIR_BOOLEAN_MASK bits of instr->pass_flags.
+ *
+ * Along the way, sources feeding a consumer that needs fully resolved
+ * values are upgraded from UNRESOLVED to NEEDS_RESOLVE; this includes the
+ * condition of an if that directly follows the block. Always returns true.
+ */
+static bool
+analyze_boolean_resolves_block(nir_block *block)
+{
+ nir_foreach_instr(instr, block) {
+ switch (instr->type) {
+ case nir_instr_type_alu: {
+ /* For ALU instructions, the resolve status is handled in a
+ * three-step process.
+ *
+ * 1) Look at the instruction type and sources and determine if it
+ * can be left unresolved.
+ *
+ * 2) Look at the destination and see if we have to resolve
+ * anyway. (This is the case if this instruction is not the
+ * only instruction writing to a given register.)
+ *
+ * 3) If the instruction has a resolve status other than
+ * BRW_NIR_BOOLEAN_UNRESOLVED or BRW_NIR_BOOLEAN_NEEDS_RESOLVE then
+ * we walk through the sources and ensure that they are also
+ * resolved. This ensures that we don't end up with any stray
+ * unresolved booleans going into ADDs or something like that.
+ */
+
+ uint8_t resolve_status;
+ nir_alu_instr *alu = nir_instr_as_alu(instr);
+ switch (alu->op) {
+ case nir_op_ball_fequal2:
+ case nir_op_ball_iequal2:
+ case nir_op_ball_fequal3:
+ case nir_op_ball_iequal3:
+ case nir_op_ball_fequal4:
+ case nir_op_ball_iequal4:
+ case nir_op_bany_fnequal2:
+ case nir_op_bany_inequal2:
+ case nir_op_bany_fnequal3:
+ case nir_op_bany_inequal3:
+ case nir_op_bany_fnequal4:
+ case nir_op_bany_inequal4:
+ /* These are only implemented by the vec4 backend and its
+ * implementation emits resolved booleans. At some point in the
+ * future, this may change and we'll have to remove some of the
+ * above cases.
+ */
+ resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
+ break;
+
+ case nir_op_imov:
+ case nir_op_inot:
+ /* This is a single-source instruction. Just copy the resolve
+ * status from the source.
+ */
+ resolve_status = get_resolve_status_for_src(&alu->src[0].src);
+ break;
+
+ case nir_op_iand:
+ case nir_op_ior:
+ case nir_op_ixor: {
+ uint8_t src0_status = get_resolve_status_for_src(&alu->src[0].src);
+ uint8_t src1_status = get_resolve_status_for_src(&alu->src[1].src);
+
+ if (src0_status == src1_status) {
+ resolve_status = src0_status;
+ } else if (src0_status == BRW_NIR_NON_BOOLEAN ||
+ src1_status == BRW_NIR_NON_BOOLEAN) {
+ /* If one of the sources is a non-boolean then the whole
+ * thing is a non-boolean.
+ */
+ resolve_status = BRW_NIR_NON_BOOLEAN;
+ } else {
+ /* At this point one of them is a true boolean and one is a
+ * boolean that needs a resolve. We could either resolve the
+ * unresolved source or we could resolve here. If we resolve
+ * the unresolved source then we get two resolves for the price
+ * of one. Just set this one to BOOLEAN_NO_RESOLVE and we'll
+ * let the code below force a resolve on the unresolved source.
+ */
+ resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
+ }
+ break;
+ }
+
+ default:
+ if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_bool) {
+ /* This instruction will turn into a CMP when we actually emit
+ * it so the result will have to be resolved before it can be
+ * used.
+ */
+ resolve_status = BRW_NIR_BOOLEAN_UNRESOLVED;
+
+ /* Even though the destination is allowed to be left
+ * unresolved, the sources are treated as regular integers or
+ * floats so they need to be resolved.
+ */
+ nir_foreach_src(instr, src_mark_needs_resolve, NULL);
+ } else {
+ resolve_status = BRW_NIR_NON_BOOLEAN;
+ }
+ }
+
+ /* If the destination is SSA, go ahead and allow unresolved booleans.
+ * If the destination register doesn't have a well-defined parent_instr
+ * we need to resolve immediately.
+ */
+ if (!alu->dest.dest.is_ssa &&
+ resolve_status == BRW_NIR_BOOLEAN_UNRESOLVED) {
+ resolve_status = BRW_NIR_BOOLEAN_NEEDS_RESOLVE;
+ }
+
+ instr->pass_flags = (instr->pass_flags & ~BRW_NIR_BOOLEAN_MASK) |
+ resolve_status;
+
+ /* Finally, resolve sources if it's needed */
+ switch (resolve_status) {
+ case BRW_NIR_BOOLEAN_NEEDS_RESOLVE:
+ case BRW_NIR_BOOLEAN_UNRESOLVED:
+ /* This instruction is either unresolved or we're doing the
+ * resolve here; leave the sources alone.
+ */
+ break;
+
+ case BRW_NIR_BOOLEAN_NO_RESOLVE:
+ case BRW_NIR_NON_BOOLEAN:
+ nir_foreach_src(instr, src_mark_needs_resolve, NULL);
+ break;
+
+ default:
+ unreachable("Invalid boolean flag");
+ }
+
+ break;
+ }
+
+ case nir_instr_type_load_const: {
+ nir_load_const_instr *load = nir_instr_as_load_const(instr);
+
+ /* For load_const instructions, it's a boolean exactly when it holds
+ * one of the values NIR_TRUE or NIR_FALSE. Only the first component
+ * (u32[0]) is inspected.
+ *
+ * Since load_const instructions don't have any sources, we don't
+ * have to worry about resolving them.
+ */
+ instr->pass_flags &= ~BRW_NIR_BOOLEAN_MASK;
+ if (load->value.u32[0] == NIR_TRUE || load->value.u32[0] == NIR_FALSE) {
+ instr->pass_flags |= BRW_NIR_BOOLEAN_NO_RESOLVE;
+ } else {
+ instr->pass_flags |= BRW_NIR_NON_BOOLEAN;
+ }
+ continue;
+ }
+
+ default:
+ /* Everything else is an unknown non-boolean value and needs to
+ * have all sources resolved.
+ */
+ instr->pass_flags = (instr->pass_flags & ~BRW_NIR_BOOLEAN_MASK) |
+ BRW_NIR_NON_BOOLEAN;
+ nir_foreach_src(instr, src_mark_needs_resolve, NULL);
+ continue;
+ }
+ }
+
+ nir_if *following_if = nir_block_get_following_if(block);
+ if (following_if)
+ src_mark_needs_resolve(&following_if->condition, NULL);
+
+ return true;
+}
+
+/* Runs the per-block analysis over every block of one function body. */
+static void
+analyze_boolean_resolves_impl(nir_function_impl *impl)
+{
+   nir_foreach_block(current, impl) {
+      analyze_boolean_resolves_block(current);
+   }
+}
+
+/* Entry point of the pass: analyzes every function of the shader that has
+ * an implementation.
+ */
+void
+brw_nir_analyze_boolean_resolves(nir_shader *shader)
+{
+   nir_foreach_function(function, shader) {
+      if (!function->impl)
+         continue;
+
+      analyze_boolean_resolves_impl(function->impl);
+   }
+}
diff --git a/src/intel/compiler/brw_nir_attribute_workarounds.c b/src/intel/compiler/brw_nir_attribute_workarounds.c
new file mode 100644
index 00000000000..d695771f04a
--- /dev/null
+++ b/src/intel/compiler/brw_nir_attribute_workarounds.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/nir/nir_builder.h"
+#include "brw_nir.h"
+
+/**
+ * Prior to Haswell, the hardware can't natively support GL_FIXED or
+ * 2_10_10_10_REV vertex formats. This pass inserts extra shader code
+ * to produce the correct values.
+ */
+
+struct attr_wa_state {
+ nir_builder builder; /* re-initialized for every function impl */
+ bool impl_progress; /* set once a load_input in the current impl was rewritten */
+ bool use_legacy_snorm_formula; /* when set, the ES 3.0 signed-normalize path is not taken */
+ const uint8_t *wa_flags; /* BRW_ATTRIB_WA_* bits, indexed by a load_input's const_index[0] */
+};
+/*
+ * Applies the vertex attribute workarounds to every load_input in a block.
+ *
+ * For each load_input whose entry in state->wa_flags is non-zero, fix-up
+ * code is emitted right after the load and the uses of the loaded value
+ * that come after that code are redirected to its result. The steps run in
+ * this order, each only if its flag is set: GL_FIXED rescale, sign
+ * recovery, BGRA swizzle, normalization, integer-to-float scaling. Sets
+ * state->impl_progress when a load was rewritten. Always returns true.
+ */
+static bool
+apply_attr_wa_block(nir_block *block, struct attr_wa_state *state)
+{
+ nir_builder *b = &state->builder;
+
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ if (intrin->intrinsic != nir_intrinsic_load_input)
+ continue;
+
+ uint8_t wa_flags = state->wa_flags[intrin->const_index[0]];
+ if (wa_flags == 0)
+ continue;
+
+ b->cursor = nir_after_instr(instr);
+
+ nir_ssa_def *val = &intrin->dest.ssa;
+
+ /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
+ * come in as floating point conversions of the integer values.
+ * The component-mask bits hold the number of leading components to
+ * rescale; the remaining components are passed through.
+ */
+ if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
+ nir_ssa_def *scaled =
+ nir_fmul(b, val, nir_imm_float(b, 1.0f / 65536.0f));
+ nir_ssa_def *comps[4];
+ for (int i = 0; i < val->num_components; i++) {
+ bool rescale = i < (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK);
+ comps[i] = nir_channel(b, rescale ? scaled : val, i);
+ }
+ val = nir_vec(b, comps, val->num_components);
+ }
+
+ /* Do sign recovery for 2101010 formats if required. */
+ if (wa_flags & BRW_ATTRIB_WA_SIGN) {
+ /* sign recovery shift: <22, 22, 22, 30> */
+ nir_ssa_def *shift = nir_imm_ivec4(b, 22, 22, 22, 30);
+ val = nir_ishr(b, nir_ishl(b, val, shift), shift);
+ }
+
+ /* Apply BGRA swizzle if required. */
+ if (wa_flags & BRW_ATTRIB_WA_BGRA) {
+ val = nir_swizzle(b, val, (unsigned[4]){2,1,0,3}, 4, true);
+ }
+
+ if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
+ /* ES 3.0 has different rules for converting signed normalized
+ * fixed-point numbers than desktop GL.
+ */
+ if ((wa_flags & BRW_ATTRIB_WA_SIGN) &&
+ !state->use_legacy_snorm_formula) {
+ /* According to equation 2.2 of the ES 3.0 specification,
+ * signed normalization conversion is done by:
+ *
+ * f = c / (2^(b-1)-1)
+ *
+ * with b = <10, 10, 10, 2>; the result is clamped to -1.0 from
+ * below by the fmax.
+ */
+ nir_ssa_def *es3_normalize_factor =
+ nir_imm_vec4(b, 1.0f / ((1 << 9) - 1), 1.0f / ((1 << 9) - 1),
+ 1.0f / ((1 << 9) - 1), 1.0f / ((1 << 1) - 1));
+ val = nir_fmax(b,
+ nir_fmul(b, nir_i2f(b, val), es3_normalize_factor),
+ nir_imm_float(b, -1.0f));
+ } else {
+ /* The following equations are from the OpenGL 3.2 specification:
+ *
+ * 2.1 unsigned normalization
+ * f = c/(2^b-1)
+ *
+ * 2.2 signed normalization
+ * f = (2c+1)/(2^b-1)
+ *
+ * Both of these share a common divisor, which we handle by
+ * multiplying by 1 / (2^b - 1) for b = <10, 10, 10, 2>.
+ */
+ nir_ssa_def *normalize_factor =
+ nir_imm_vec4(b, 1.0f / ((1 << 10) - 1), 1.0f / ((1 << 10) - 1),
+ 1.0f / ((1 << 10) - 1), 1.0f / ((1 << 2) - 1));
+
+ if (wa_flags & BRW_ATTRIB_WA_SIGN) {
+ /* For signed normalization, the numerator is 2c+1. */
+ nir_ssa_def *two = nir_imm_float(b, 2.0f);
+ nir_ssa_def *one = nir_imm_float(b, 1.0f);
+ val = nir_fadd(b, nir_fmul(b, nir_i2f(b, val), two), one);
+ } else {
+ /* For unsigned normalization, the numerator is just c. */
+ val = nir_u2f(b, val);
+ }
+ val = nir_fmul(b, val, normalize_factor);
+ }
+ }
+
+ if (wa_flags & BRW_ATTRIB_WA_SCALE) {
+ val = (wa_flags & BRW_ATTRIB_WA_SIGN) ? nir_i2f(b, val)
+ : nir_u2f(b, val);
+ }
+
+ nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, nir_src_for_ssa(val),
+ val->parent_instr);
+ state->impl_progress = true;
+ }
+
+ return true;
+}
+
+/* Runs the attribute workaround rewrite over every function of the shader.
+ *
+ * attrib_wa_flags holds the BRW_ATTRIB_WA_* bits per attribute.  For each
+ * function body that was changed, block-index and dominance metadata are
+ * declared preserved.  Returns true if anything was rewritten.
+ */
+bool
+brw_nir_apply_attribute_workarounds(nir_shader *shader,
+                                    bool use_legacy_snorm_formula,
+                                    const uint8_t *attrib_wa_flags)
+{
+   struct attr_wa_state state = {
+      .use_legacy_snorm_formula = use_legacy_snorm_formula,
+      .wa_flags = attrib_wa_flags,
+   };
+   bool any_progress = false;
+
+   nir_foreach_function(func, shader) {
+      if (!func->impl)
+         continue;
+
+      nir_builder_init(&state.builder, func->impl);
+      state.impl_progress = false;
+
+      nir_foreach_block(block, func->impl) {
+         apply_attr_wa_block(block, &state);
+      }
+
+      if (!state.impl_progress)
+         continue;
+
+      nir_metadata_preserve(func->impl, nir_metadata_block_index |
+                                        nir_metadata_dominance);
+      any_progress = true;
+   }
+
+   return any_progress;
+}
diff --git a/src/intel/compiler/brw_nir_intrinsics.c b/src/intel/compiler/brw_nir_intrinsics.c
new file mode 100644
index 00000000000..901a1fb0ab9
--- /dev/null
+++ b/src/intel/compiler/brw_nir_intrinsics.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+/* Bookkeeping shared by the compute-shader intrinsic lowering helpers in
+ * this file.
+ */
+struct lower_intrinsics_state {
+ /* Shader being lowered. */
+ nir_shader *nir;
+ /* Two views of the same pointer: brw_nir_lower_intrinsics() stores the
+ * generic prog_data, the compute helpers read it back as cs_prog_data.
+ */
+ union {
+ struct brw_stage_prog_data *prog_data;
+ struct brw_cs_prog_data *cs_prog_data;
+ };
+ /* Function implementation currently being processed. */
+ nir_function_impl *impl;
+ /* Set whenever an intrinsic has been replaced during the current pass. */
+ bool progress;
+ /* Builder used to emit the replacement instructions. */
+ nir_builder builder;
+ /* Set once a load of the thread-local-ID uniform has been emitted. */
+ bool cs_thread_id_used;
+};
+
+/**
+ * Emits (at the builder's cursor) a value holding the compute thread's local
+ * ID and returns its SSA definition.
+ *
+ * Work groups of at most 8 invocations get a constant 0.  Otherwise a
+ * one-component 32-bit load_uniform is emitted at the uniform slot recorded
+ * in cs_prog_data->thread_local_id_index, and cs_thread_id_used is set.
+ */
+static nir_ssa_def *
+read_thread_local_id(struct lower_intrinsics_state *state)
+{
+   nir_builder *build = &state->builder;
+   const unsigned *local_size = state->nir->info->cs.local_size;
+
+   /* With local_size dimensions this small the thread local ID is always 0,
+    * so no uniform needs to be read.
+    */
+   if (local_size[0] * local_size[1] * local_size[2] <= 8)
+      return nir_imm_int(build, 0);
+
+   assert(state->cs_prog_data->thread_local_id_index >= 0);
+   state->cs_thread_id_used = true;
+   const int uniform_index = state->cs_prog_data->thread_local_id_index;
+
+   nir_intrinsic_instr *load_id =
+      nir_intrinsic_instr_create(state->nir, nir_intrinsic_load_uniform);
+
+   /* The uniform index is in 32-bit units; base and range are in bytes. */
+   nir_intrinsic_set_base(load_id, uniform_index * sizeof(uint32_t));
+   nir_intrinsic_set_range(load_id, sizeof(uint32_t));
+
+   load_id->num_components = 1;
+   load_id->src[0] = nir_src_for_ssa(nir_imm_int(build, 0));
+   nir_ssa_dest_init(&load_id->instr, &load_id->dest, 1, 32, NULL);
+
+   nir_builder_instr_insert(build, &load_id->instr);
+
+   return &load_id->dest.ssa;
+}
+
+/**
+ * Lowers the compute-shader system-value intrinsics found in one block.
+ *
+ * load_local_invocation_index becomes "thread local ID + channel number" and
+ * load_local_invocation_id is rebuilt from load_local_invocation_index and
+ * the work-group size.  Every lowered intrinsic has its uses rewritten to the
+ * new value and is then removed.
+ *
+ * The load_local_invocation_index emitted while lowering
+ * load_local_invocation_id is inserted after the instruction currently being
+ * visited; state->progress lets the caller run another pass to lower it.
+ *
+ * \return true if at least one intrinsic in this block was replaced.  The
+ *         same fact is recorded in state->progress.
+ */
+static bool
+lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
+                                  nir_block *block)
+{
+   bool progress = false;
+   nir_builder *b = &state->builder;
+   nir_shader *nir = state->nir;
+
+   nir_foreach_instr_safe(instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
+
+      /* Replacement code goes right after the intrinsic being lowered. */
+      b->cursor = nir_after_instr(&intrinsic->instr);
+
+      nir_ssa_def *sysval;
+      switch (intrinsic->intrinsic) {
+      case nir_intrinsic_load_local_invocation_index: {
+         assert(nir->stage == MESA_SHADER_COMPUTE);
+         /* We construct the local invocation index from:
+          *
+          *    gl_LocalInvocationIndex =
+          *       cs_thread_local_id + channel_num;
+          */
+         nir_ssa_def *thread_local_id = read_thread_local_id(state);
+         nir_ssa_def *channel = nir_load_channel_num(b);
+         sysval = nir_iadd(b, channel, thread_local_id);
+         break;
+      }
+
+      case nir_intrinsic_load_local_invocation_id: {
+         assert(nir->stage == MESA_SHADER_COMPUTE);
+         /* We lower gl_LocalInvocationID from gl_LocalInvocationIndex based
+          * on this formula:
+          *
+          *    gl_LocalInvocationID.x =
+          *       gl_LocalInvocationIndex % gl_WorkGroupSize.x;
+          *    gl_LocalInvocationID.y =
+          *       (gl_LocalInvocationIndex / gl_WorkGroupSize.x) %
+          *       gl_WorkGroupSize.y;
+          *    gl_LocalInvocationID.z =
+          *       (gl_LocalInvocationIndex /
+          *        (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
+          *       gl_WorkGroupSize.z;
+          */
+         unsigned *size = nir->info->cs.local_size;
+
+         nir_ssa_def *local_index = nir_load_local_invocation_index(b);
+
+         /* Per-component divisors <1, x, x*y> followed by the per-component
+          * moduli <x, y, z>; the same nir_const_value is reused for both.
+          */
+         nir_const_value uvec3;
+         uvec3.u32[0] = 1;
+         uvec3.u32[1] = size[0];
+         uvec3.u32[2] = size[0] * size[1];
+         nir_ssa_def *div_val = nir_build_imm(b, 3, 32, uvec3);
+         uvec3.u32[0] = size[0];
+         uvec3.u32[1] = size[1];
+         uvec3.u32[2] = size[2];
+         nir_ssa_def *mod_val = nir_build_imm(b, 3, 32, uvec3);
+
+         sysval = nir_umod(b, nir_udiv(b, local_index, div_val), mod_val);
+         break;
+      }
+
+      default:
+         continue;
+      }
+
+      nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa, nir_src_for_ssa(sysval));
+      nir_instr_remove(&intrinsic->instr);
+
+      /* Report the change both through the shared state, which drives the
+       * caller's fixed-point loop, and through the return value.
+       */
+      state->progress = true;
+      progress = true;
+   }
+
+   return progress;
+}
+
+/* Runs the block-level lowering over every block of state->impl with a
+ * builder freshly initialised for that implementation, then declares block
+ * indices and dominance information as preserved.
+ */
+static void
+lower_cs_intrinsics_convert_impl(struct lower_intrinsics_state *state)
+{
+   nir_function_impl *impl = state->impl;
+
+   nir_builder_init(&state->builder, impl);
+
+   nir_foreach_block(blk, impl) {
+      lower_cs_intrinsics_convert_block(state, blk);
+   }
+
+   nir_metadata_preserve(impl,
+                         nir_metadata_dominance | nir_metadata_block_index);
+}
+
+/**
+ * Lowers Intel-specific system-value intrinsics in a shader.
+ *
+ * Only compute shaders are handled; any other stage returns false untouched.
+ * The lowering is repeated over all function implementations until a full
+ * pass makes no change.  If no load of the thread-local-ID uniform was
+ * emitted, thread_local_id_index in the compute prog_data is set to -1.
+ *
+ * \return true if any pass made progress.
+ */
+bool
+brw_nir_lower_intrinsics(nir_shader *nir, struct brw_stage_prog_data *prog_data)
+{
+   /* Currently we only lower intrinsics for compute shaders */
+   if (nir->stage != MESA_SHADER_COMPUTE)
+      return false;
+
+   struct lower_intrinsics_state state;
+   memset(&state, 0, sizeof(state));
+   state.prog_data = prog_data;
+   state.nir = nir;
+
+   bool any_progress = false;
+
+   /* Iterate to a fixed point: stop after the first pass that changes
+    * nothing.
+    */
+   for (;;) {
+      state.progress = false;
+
+      nir_foreach_function(function, nir) {
+         if (!function->impl)
+            continue;
+
+         state.impl = function->impl;
+         lower_cs_intrinsics_convert_impl(&state);
+      }
+
+      if (!state.progress)
+         break;
+
+      any_progress = true;
+   }
+
+   if (nir->stage == MESA_SHADER_COMPUTE && !state.cs_thread_id_used)
+      state.cs_prog_data->thread_local_id_index = -1;
+
+   return any_progress;
+}
diff --git a/src/intel/compiler/brw_nir_opt_peephole_ffma.c b/src/intel/compiler/brw_nir_opt_peephole_ffma.c
new file mode 100644
index 00000000000..cc225e1847b
--- /dev/null
+++ b/src/intel/compiler/brw_nir_opt_peephole_ffma.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Jason Ekstrand ([email protected])
+ *
+ */
+
+#include "brw_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+/*
+ * Implements a small peephole optimization that looks for a multiply that
+ * is only ever used in an add and replaces both with an fma.
+ */
+
+/* Returns true when every use of def is an fadd, looking through any chain
+ * of imov/fmov/fneg/fabs instructions in between.  A use as an if condition,
+ * a use by a non-ALU instruction or a use by any other ALU opcode makes the
+ * answer false.
+ */
+static inline bool
+are_all_uses_fadd(nir_ssa_def *def)
+{
+   if (!list_empty(&def->if_uses))
+      return false;
+
+   nir_foreach_use(src, def) {
+      nir_instr *user = src->parent_instr;
+      if (user->type != nir_instr_type_alu)
+         return false;
+
+      nir_alu_instr *alu = nir_instr_as_alu(user);
+      const nir_op op = alu->op;
+
+      /* A direct fadd use is what we are looking for. */
+      if (op == nir_op_fadd)
+         continue;
+
+      /* Moves and source-modifier-like ops are acceptable only if all of
+       * their own uses are acceptable too.
+       */
+      if (op != nir_op_imov && op != nir_op_fmov &&
+          op != nir_op_fneg && op != nir_op_fabs)
+         return false;
+
+      assert(alu->dest.dest.is_ssa);
+      if (!are_all_uses_fadd(&alu->dest.dest.ssa))
+         return false;
+   }
+
+   return true;
+}
+
+/**
+ * Walks backwards from an ALU source through imov/fmov/fneg/fabs
+ * instructions looking for the fmul that produces it.
+ *
+ * \param src             source to start from; must be SSA with no abs or
+ *                        negate modifier (asserted)
+ * \param num_components  number of swizzle entries to compose
+ * \param swizzle         in/out: on success, composed with the swizzle of
+ *                        every source that was looked through
+ * \param negate          in/out: toggled for each fneg, cleared by an fabs
+ * \param abs             in/out: set when an fabs was looked through
+ *
+ * \return the fmul instruction, or NULL when the producer is not an ALU
+ *         instruction, is marked exact, is an unhandled opcode, or is an
+ *         fmul with uses other than fadd.  Note that *negate and *abs may
+ *         already have been modified when NULL is returned.
+ */
+static nir_alu_instr *
+get_mul_for_src(nir_alu_src *src, int num_components,
+ uint8_t swizzle[4], bool *negate, bool *abs)
+{
+ uint8_t swizzle_tmp[4];
+ assert(src->src.is_ssa && !src->abs && !src->negate);
+
+ nir_instr *instr = src->src.ssa->parent_instr;
+ if (instr->type != nir_instr_type_alu)
+ return NULL;
+
+ nir_alu_instr *alu = nir_instr_as_alu(instr);
+
+ /* We want to bail if any of the other ALU operations involved is labeled
+ * exact. One reason for this is that, while the value that is changing is
+ * actually the result of the add and not the multiply, the intention of
+ * the user when they specify an exact multiply is that they want *that*
+ * value and what they don't care about is the add. Another reason is that
+ * SPIR-V explicitly requires this behaviour.
+ */
+ if (alu->exact)
+ return NULL;
+
+ switch (alu->op) {
+ case nir_op_imov:
+ case nir_op_fmov:
+ /* Plain moves are transparent: keep searching through their source. */
+ alu = get_mul_for_src(&alu->src[0], num_components, swizzle, negate, abs);
+ break;
+
+ case nir_op_fneg:
+ /* The negation is applied on top of whatever the inner chain produced. */
+ alu = get_mul_for_src(&alu->src[0], num_components, swizzle, negate, abs);
+ *negate = !*negate;
+ break;
+
+ case nir_op_fabs:
+ /* An absolute value discards any negation found further down. */
+ alu = get_mul_for_src(&alu->src[0], num_components, swizzle, negate, abs);
+ *negate = false;
+ *abs = true;
+ break;
+
+ case nir_op_fmul:
+ /* Only absorb a fmul into a ffma if the fmul is only used in fadd
+ * operations. This prevents us from being too aggressive with our
+ * fusing which can actually lead to more instructions.
+ */
+ if (!are_all_uses_fadd(&alu->dest.dest.ssa))
+ return NULL;
+ break;
+
+ default:
+ return NULL;
+ }
+
+ /* The recursive search above may have come back empty-handed. */
+ if (!alu)
+ return NULL;
+
+ /* Copy swizzle data before overwriting it to avoid setting a wrong swizzle.
+ *
+ * Example:
+ * Former swizzle[] = xyzw
+ * src->swizzle[] = zyxx
+ *
+ * Expected output swizzle = zyxx
+ * If we reuse swizzle in the loop, then output swizzle would be zyzz.
+ */
+ memcpy(swizzle_tmp, swizzle, 4*sizeof(uint8_t));
+ for (int i = 0; i < num_components; i++)
+ swizzle[i] = swizzle_tmp[src->swizzle[i]];
+
+ return alu;
+}
+
+/**
+ * Given an array of (at least two) nir_alu_src's, reports whether either of
+ * the first two is produced by a load_const whose value has exactly one use
+ * and is not used as an if condition.
+ */
+static bool
+any_alu_src_is_a_constant(nir_alu_src srcs[])
+{
+   for (unsigned idx = 0; idx < 2; idx++) {
+      nir_instr *parent = srcs[idx].src.ssa->parent_instr;
+      if (parent->type != nir_instr_type_load_const)
+         continue;
+
+      nir_load_const_instr *konst = nir_instr_as_load_const(parent);
+
+      if (list_empty(&konst->def.if_uses) &&
+          list_is_singular(&konst->def.uses))
+         return true;
+   }
+
+   return false;
+}
+
+/**
+ * Fuses fmul + fadd pairs into ffma within one block.
+ *
+ * For every non-exact fadd whose two sources differ, each source is searched
+ * (through moves, fneg and fabs) for a producing fmul.  When one is found,
+ * an ffma computing mul.src0 * mul.src1 + <other add source> is inserted
+ * before the fadd, the fadd's uses are redirected to it and the fadd is
+ * removed.
+ *
+ * \return true if at least one fadd was replaced.
+ */
+static bool
+brw_nir_opt_peephole_ffma_block(nir_builder *b, nir_block *block)
+{
+ bool progress = false;
+
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_alu)
+ continue;
+
+ nir_alu_instr *add = nir_instr_as_alu(instr);
+ if (add->op != nir_op_fadd)
+ continue;
+
+ assert(add->dest.dest.is_ssa);
+ if (add->exact)
+ continue;
+
+ assert(add->src[0].src.is_ssa && add->src[1].src.is_ssa);
+
+ /* This is the case a + a. We would rather handle this with an
+ * algebraic reduction than fuse it. Also, we want to only fuse
+ * things where the multiply is used only once and, in this case,
+ * it would be used twice by the same instruction.
+ */
+ if (add->src[0].src.ssa == add->src[1].src.ssa)
+ continue;
+
+ /* Try each fadd source in turn; the swizzle and modifier accumulators
+ * are reset before every attempt because a failed search may have
+ * modified them.
+ */
+ nir_alu_instr *mul;
+ uint8_t add_mul_src, swizzle[4];
+ bool negate, abs;
+ for (add_mul_src = 0; add_mul_src < 2; add_mul_src++) {
+ for (unsigned i = 0; i < 4; i++)
+ swizzle[i] = i;
+
+ negate = false;
+ abs = false;
+
+ mul = get_mul_for_src(&add->src[add_mul_src],
+ add->dest.dest.ssa.num_components,
+ swizzle, &negate, &abs);
+
+ if (mul != NULL)
+ break;
+ }
+
+ if (mul == NULL)
+ continue;
+
+ unsigned bit_size = add->dest.dest.ssa.bit_size;
+
+ nir_ssa_def *mul_src[2];
+ mul_src[0] = mul->src[0].src.ssa;
+ mul_src[1] = mul->src[1].src.ssa;
+
+ /* If any of the operands of the fmul and any of the fadd is a constant,
+ * we bypass because it will be more efficient as the constants will be
+ * propagated as operands, potentially saving two load_const instructions.
+ */
+ if (any_alu_src_is_a_constant(mul->src) &&
+ any_alu_src_is_a_constant(add->src)) {
+ continue;
+ }
+
+ b->cursor = nir_before_instr(&add->instr);
+
+ /* An absolute value of the product is applied to both factors. */
+ if (abs) {
+ for (unsigned i = 0; i < 2; i++)
+ mul_src[i] = nir_fabs(b, mul_src[i]);
+ }
+
+ /* A negated product only needs one of its factors negated. */
+ if (negate)
+ mul_src[0] = nir_fneg(b, mul_src[0]);
+
+ nir_alu_instr *ffma = nir_alu_instr_create(b->shader, nir_op_ffma);
+ ffma->dest.saturate = add->dest.saturate;
+ ffma->dest.write_mask = add->dest.write_mask;
+
+ /* Sources 0 and 1 are the multiply operands; their swizzles are the
+ * fmul's own swizzles indexed through the swizzle accumulated on the
+ * path from the fadd to the fmul.
+ */
+ for (unsigned i = 0; i < 2; i++) {
+ ffma->src[i].src = nir_src_for_ssa(mul_src[i]);
+ for (unsigned j = 0; j < add->dest.dest.ssa.num_components; j++)
+ ffma->src[i].swizzle[j] = mul->src[i].swizzle[swizzle[j]];
+ }
+ /* Source 2 is the fadd operand that did not lead to the fmul. */
+ nir_alu_src_copy(&ffma->src[2], &add->src[1 - add_mul_src], ffma);
+
+ assert(add->dest.dest.is_ssa);
+
+ nir_ssa_dest_init(&ffma->instr, &ffma->dest.dest,
+ add->dest.dest.ssa.num_components,
+ bit_size,
+ add->dest.dest.ssa.name);
+ nir_ssa_def_rewrite_uses(&add->dest.dest.ssa,
+ nir_src_for_ssa(&ffma->dest.dest.ssa));
+
+ nir_builder_instr_insert(b, &ffma->instr);
+ assert(list_empty(&add->dest.dest.ssa.uses));
+ nir_instr_remove(&add->instr);
+
+ progress = true;
+ }
+
+ return progress;
+}
+
+/* Runs the ffma peephole over every block of one function implementation.
+ * Block indices and dominance are declared preserved only when something
+ * changed.  Returns true if any block reported progress.
+ */
+static bool
+brw_nir_opt_peephole_ffma_impl(nir_function_impl *impl)
+{
+   nir_builder b;
+   nir_builder_init(&b, impl);
+
+   bool changed = false;
+   nir_foreach_block(block, impl) {
+      if (brw_nir_opt_peephole_ffma_block(&b, block))
+         changed = true;
+   }
+
+   if (!changed)
+      return false;
+
+   nir_metadata_preserve(impl, nir_metadata_block_index |
+                               nir_metadata_dominance);
+   return true;
+}
+
+/**
+ * Shader-level entry point of the fmul + fadd -> ffma peephole: processes
+ * every function that has an implementation.
+ *
+ * \return true if any implementation was changed.
+ */
+bool
+brw_nir_opt_peephole_ffma(nir_shader *shader)
+{
+   bool changed = false;
+
+   nir_foreach_function(func, shader) {
+      if (func->impl == NULL)
+         continue;
+
+      if (brw_nir_opt_peephole_ffma_impl(func->impl))
+         changed = true;
+   }
+
+   return changed;
+}
diff --git a/src/intel/compiler/brw_nir_tcs_workarounds.c b/src/intel/compiler/brw_nir_tcs_workarounds.c
new file mode 100644
index 00000000000..a85f493c704
--- /dev/null
+++ b/src/intel/compiler/brw_nir_tcs_workarounds.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/nir/nir_builder.h"
+#include "brw_nir.h"
+
+/**
+ * Implements the WaPreventHSTessLevelsInterference workaround (for Gen7-8).
+ *
+ * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU), Page 494 (below the
+ * definition of the patch header layouts):
+ *
+ * "HW Bug: The Tessellation stage will incorrectly add domain points
+ * along patch edges under the following conditions, which may result
+ * in conformance failures and/or cracking artifacts:
+ *
+ * * QUAD domain
+ * * INTEGER partitioning
+ * * All three TessFactors in a given U or V direction (e.g., V
+ * direction: UEQ0, InsideV, UEQ1) are all exactly 1.0
+ * * All three TessFactors in the other direction are > 1.0 and all
+ * round up to the same integer value (e.g, U direction:
+ * VEQ0 = 3.1, InsideU = 3.7, VEQ1 = 3.4)
+ *
+ * The suggested workaround (to be implemented as part of the postamble
+ * to the HS shader in the HS kernel) is:
+ *
+ * if (
+ * (TF[UEQ0] > 1.0) ||
+ * (TF[VEQ0] > 1.0) ||
+ * (TF[UEQ1] > 1.0) ||
+ * (TF[VEQ1] > 1.0) ||
+ * (TF[INSIDE_U] > 1.0) ||
+ * (TF[INSIDE_V] > 1.0) )
+ * {
+ * TF[INSIDE_U] = (TF[INSIDE_U] == 1.0) ? 2.0 : TF[INSIDE_U];
+ * TF[INSIDE_V] = (TF[INSIDE_V] == 1.0) ? 2.0 : TF[INSIDE_V];
+ * }"
+ *
+ * There's a subtlety here. Intel internal HSD-ES bug 1208668495 notes
+ * that the above workaround fails to fix certain GL/ES CTS tests which
+ * have inside tessellation factors of -1.0. This can be explained by
+ * a quote from the ARB_tessellation_shader specification:
+ *
+ * "If "equal_spacing" is used, the floating-point tessellation level is
+ * first clamped to the range [1,<max>], where <max> is implementation-
+ * dependent maximum tessellation level (MAX_TESS_GEN_LEVEL)."
+ *
+ * In other words, the actual inner tessellation factor used is
+ * clamp(TF[INSIDE_*], 1.0, 64.0). So we want to compare the clamped
+ * value against 1.0. To accomplish this, we change the comparison from
+ * (TF[INSIDE_*] == 1.0) to (TF[INSIDE_*] <= 1.0).
+ */
+
+/* Emits, at the builder's cursor, a load_output intrinsic producing
+ * num_components 32-bit components.  The intrinsic's base is set to offset,
+ * its component index to component, and its offset source is a constant 0.
+ * Returns the SSA definition of the loaded value.
+ */
+static inline nir_ssa_def *
+load_output(nir_builder *b, int num_components, int offset, int component)
+{
+   nir_intrinsic_instr *ld =
+      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_output);
+
+   ld->num_components = num_components;
+   nir_intrinsic_set_base(ld, offset);
+   nir_intrinsic_set_component(ld, component);
+
+   nir_ssa_def *zero_offset = nir_imm_int(b, 0);
+   ld->src[0] = nir_src_for_ssa(zero_offset);
+
+   nir_ssa_dest_init(&ld->instr, &ld->dest, num_components, 32, NULL);
+   nir_builder_instr_insert(b, &ld->instr);
+
+   return &ld->dest.ssa;
+}
+
+/* Appends the quads tessellation-level workaround to the end of a block
+ * (before any jump): if any outer or inner tessellation level is greater
+ * than 1.0, every inner level that is <= 1.0 is overwritten with 2.0.
+ */
+static void
+emit_quads_workaround(nir_builder *b, nir_block *block)
+{
+ b->cursor = nir_after_block_before_jump(block);
+
+ /* Inner levels: two components read at base 0, component 2.  Outer levels:
+ * four components read at base 1, component 0.
+ * NOTE(review): presumably this mirrors the hardware patch header layout
+ * -- confirm against the PRM.
+ */
+ nir_ssa_def *inner = load_output(b, 2, 0, 2);
+ nir_ssa_def *outer = load_output(b, 4, 1, 0);
+
+ /* True when 1.0 < level holds for any outer or any inner level. */
+ nir_ssa_def *any_greater_than_1 =
+ nir_ior(b, nir_bany(b, nir_flt(b, nir_imm_float(b, 1.0f), outer)),
+ nir_bany(b, nir_flt(b, nir_imm_float(b, 1.0f), inner)));
+
+ nir_if *if_stmt = nir_if_create(b->shader);
+ if_stmt->condition = nir_src_for_ssa(any_greater_than_1);
+ nir_builder_cf_insert(b, &if_stmt->cf_node);
+
+ /* Fill out the new then-block */
+ b->cursor = nir_after_cf_list(&if_stmt->then_list);
+
+ /* 1.0 >= inner, i.e. inner <= 1.0: this is the clamped comparison described
+ * in the file header, used instead of an exact == 1.0 test.
+ */
+ inner = nir_bcsel(b, nir_fge(b, nir_imm_float(b, 1.0f), inner),
+ nir_imm_float(b, 2.0f), inner);
+
+ /* Write both inner levels back to component 2, where they were read from;
+ * the offset source is a constant 0.
+ */
+ nir_intrinsic_instr *store =
+ nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
+ store->num_components = 2;
+ nir_intrinsic_set_write_mask(store, WRITEMASK_XY);
+ nir_intrinsic_set_component(store, 2);
+ store->src[0] = nir_src_for_ssa(inner);
+ store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
+ nir_builder_instr_insert(b, &store->instr);
+}
+
+/**
+ * Appends the quads tessellation-level workaround to every block that leads
+ * to the end block of the tessellation control shader's entry point.
+ *
+ * No metadata is preserved, since new control flow is inserted.
+ */
+void
+brw_nir_apply_tcs_quads_workaround(nir_shader *nir)
+{
+   assert(nir->stage == MESA_SHADER_TESS_CTRL);
+
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+
+   nir_builder b;
+   nir_builder_init(&b, impl);
+
+   /* emit_quads_workaround() inserts an if statement into each block it is
+    * given, which splits that block in two and thereby changes the end
+    * block's predecessor set.  Snapshot the original predecessors into an
+    * array before modifying anything.
+    */
+   const unsigned pred_count = impl->end_block->predecessors->entries;
+   nir_block *saved_preds[pred_count];
+
+   unsigned filled = 0;
+   struct set_entry *entry;
+   set_foreach(impl->end_block->predecessors, entry) {
+      saved_preds[filled] = (nir_block *) entry->key;
+      filled++;
+   }
+
+   for (unsigned p = 0; p < pred_count; p++)
+      emit_quads_workaround(&b, saved_preds[p]);
+
+   nir_metadata_preserve(impl, 0);
+}
diff --git a/src/intel/compiler/brw_nir_trig_workarounds.py b/src/intel/compiler/brw_nir_trig_workarounds.py
new file mode 100644
index 00000000000..6a77d64dbd4
--- /dev/null
+++ b/src/intel/compiler/brw_nir_trig_workarounds.py
@@ -0,0 +1,43 @@
+#
+# Copyright (C) 2016 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+import nir_algebraic
+
+# Prior to Kaby Lake, the SIN and COS instructions on Intel hardware can
+# produce values slightly outside of the [-1.0, 1.0] range for a small set of
+# values. Obviously, this can break everyone's expectations about trig
+# functions. This appears to be fixed in Kaby Lake.
+#
+# According to an internal presentation, the COS instruction can produce
+# a value up to 1.000027 for inputs in the range (0.08296, 0.09888). One
+# suggested workaround is to multiply by 0.99997, scaling down the
+# amplitude slightly. Apparently this also minimizes the error function,
+# reducing the maximum error from 0.00006 to about 0.00003.
+
+trig_workarounds = [
+ (('fsin', 'x'), ('fmul', ('fsin', 'x'), 0.99997)),
+ (('fcos', 'x'), ('fmul', ('fcos', 'x'), 0.99997)),
+]
+
+print '#include "brw_nir.h"'
+print nir_algebraic.AlgebraicPass("brw_nir_apply_trig_workarounds",
+ trig_workarounds).render()
diff --git a/src/intel/compiler/brw_packed_float.c b/src/intel/compiler/brw_packed_float.c
new file mode 100644
index 00000000000..9b7687a756f
--- /dev/null
+++ b/src/intel/compiler/brw_packed_float.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include "brw_reg.h"
+
+/* Overlays a float with its raw 32-bit pattern and with mantissa/exponent/
+ * sign bit-fields of 23, 8 and 1 bits.
+ * NOTE(review): the bit-field view assumes the compiler allocates bit-fields
+ * starting at the least significant bit, so that they line up with the
+ * IEEE-754 single-precision fields -- confirm for every supported ABI.
+ */
+union fu {
+ /* Value as a float. */
+ float f;
+ /* Same storage as a raw unsigned integer. */
+ unsigned u;
+ /* Same storage split into fields. */
+ struct {
+ unsigned mantissa:23;
+ unsigned exponent:8;
+ unsigned sign:1;
+ } s;
+};
+
+/**
+ * Converts a float to the 8-bit restricted "VF" encoding: one sign bit,
+ * a 3-bit exponent and a 4-bit mantissa.
+ *
+ * \return the encoded value (0..255), or -1 if f cannot be represented
+ *         exactly.
+ */
+int
+brw_float_to_vf(float f)
+{
+   union fu bits = { .f = f };
+   const unsigned sign_bit = bits.s.sign << 7;
+
+   /* ±0.0f is special cased: only the sign bit is set in the encoding. */
+   if (f == 0.0f)
+      return sign_bit;
+
+   /* Re-bias the exponent from 127 to 3 and keep the top four mantissa
+    * bits.  An exponent below the representable range wraps around to a
+    * huge unsigned value, which the range check below rejects.
+    */
+   const unsigned vf_exponent = bits.s.exponent - (127 - 3);
+   const unsigned vf_mantissa = bits.s.mantissa >> (23 - 4);
+   const unsigned vf = sign_bit | (vf_exponent << 4) | vf_mantissa;
+
+   /* Reject, in this order of checks:
+    *  - 0.125, which would have had the same representation as 0.0;
+    *  - values whose mantissa does not fit in 4 bits;
+    *  - values whose exponent does not fit in 3 bits.
+    */
+   if ((vf & 0x7f) == 0 || (bits.u & 0x7ffff) != 0 || vf_exponent > 7)
+      return -1;
+
+   return vf;
+}
+
+/**
+ * Expands an 8-bit restricted "VF" value (one sign bit, 3-bit exponent,
+ * 4-bit mantissa) into the float it encodes.
+ */
+float
+brw_vf_to_float(unsigned char vf)
+{
+   union fu fu;
+
+   /* ±0.0f is special cased. */
+   if (vf == 0x00 || vf == 0x80) {
+      /* Shift as unsigned: vf promotes to (signed) int, and moving 0x80 into
+       * bit 31 of a signed int is undefined behaviour in C.
+       */
+      fu.u = (unsigned)vf << 24;
+      return fu.f;
+   }
+
+   /* The three fields together cover all 32 bits of the union, so no part of
+    * it is left uninitialised.  The exponent is re-biased from 3 to 127.
+    */
+   fu.s.sign = vf >> 7;
+   fu.s.exponent = ((vf & 0x70) >> 4) + (127 - 3);
+   fu.s.mantissa = (vf & 0xf) << (23 - 4);
+
+   return fu.f;
+}
diff --git a/src/intel/compiler/brw_predicated_break.cpp b/src/intel/compiler/brw_predicated_break.cpp
new file mode 100644
index 00000000000..607715dace4
--- /dev/null
+++ b/src/intel/compiler/brw_predicated_break.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_cfg.h"
+
+using namespace brw;
+
+/** @file brw_predicated_break.cpp
+ *
+ * Loops are often structured as
+ *
+ * loop:
+ * CMP.f0
+ * (+f0) IF
+ * BREAK
+ * ENDIF
+ * ...
+ * WHILE loop
+ *
+ * This peephole pass removes the IF and ENDIF instructions and predicates the
+ * BREAK, dropping two instructions from the loop body.
+ *
+ * If the loop was a DO { ... } WHILE loop, it looks like
+ *
+ * loop:
+ * ...
+ * CMP.f0
+ * (+f0) IF
+ * BREAK
+ * ENDIF
+ * WHILE loop
+ *
+ * and we can remove the BREAK instruction and predicate the WHILE.
+ */
+
+/**
+ * Folds "IF; BREAK/CONTINUE; ENDIF" sequences into a single predicated jump
+ * and, when the resulting predicated BREAK is directly followed by an
+ * unpredicated WHILE, folds the BREAK into the WHILE with the inverted
+ * predicate.
+ *
+ * Returns true if any block was rewritten; live intervals are invalidated
+ * in that case.
+ */
+bool
+opt_predicated_break(backend_shader *s)
+{
+ bool progress = false;
+
+ foreach_block (block, s->cfg) {
+ /* Only blocks whose first and last instruction coincide, i.e. blocks
+ * holding a single instruction, are candidates.
+ */
+ if (block->start_ip != block->end_ip)
+ continue;
+
+ /* BREAK and CONTINUE instructions, by definition, can only be found at
+ * the ends of basic blocks.
+ */
+ backend_instruction *jump_inst = block->end();
+ if (jump_inst->opcode != BRW_OPCODE_BREAK &&
+ jump_inst->opcode != BRW_OPCODE_CONTINUE)
+ continue;
+
+ /* The jump must be bracketed by an IF ending the previous block and an
+ * ENDIF starting the next one.
+ */
+ backend_instruction *if_inst = block->prev()->end();
+ if (if_inst->opcode != BRW_OPCODE_IF)
+ continue;
+
+ backend_instruction *endif_inst = block->next()->start();
+ if (endif_inst->opcode != BRW_OPCODE_ENDIF)
+ continue;
+
+ bblock_t *jump_block = block;
+ bblock_t *if_block = jump_block->prev();
+ bblock_t *endif_block = jump_block->next();
+
+ /* The jump takes over the IF's predicate. */
+ jump_inst->predicate = if_inst->predicate;
+ jump_inst->predicate_inverse = if_inst->predicate_inverse;
+
+ /* If the IF is the only instruction in its block, work with the block
+ * before it instead.
+ * NOTE(review): presumably remove() discards a block that becomes
+ * empty -- confirm in brw_cfg / backend_instruction::remove().
+ */
+ bblock_t *earlier_block = if_block;
+ if (if_block->start_ip == if_block->end_ip) {
+ earlier_block = if_block->prev();
+ }
+
+ if_inst->remove(if_block);
+
+ /* Same reasoning for a block that holds nothing but the ENDIF. */
+ bblock_t *later_block = endif_block;
+ if (endif_block->start_ip == endif_block->end_ip) {
+ later_block = endif_block->next();
+ }
+ endif_inst->remove(endif_block);
+
+ /* Re-link the CFG edges around the removed IF and ENDIF. */
+ if (!earlier_block->ends_with_control_flow()) {
+ earlier_block->children.make_empty();
+ earlier_block->add_successor(s->cfg->mem_ctx, jump_block);
+ }
+
+ if (!later_block->starts_with_control_flow()) {
+ later_block->parents.make_empty();
+ }
+ jump_block->add_successor(s->cfg->mem_ctx, later_block);
+
+ if (earlier_block->can_combine_with(jump_block)) {
+ earlier_block->combine_with(jump_block);
+
+ /* Continue the iteration from the merged block. */
+ block = earlier_block;
+ }
+
+ /* Now look at the first instruction of the block following the BREAK. If
+ * it's a WHILE, we can delete the break, predicate the WHILE, and join
+ * the two basic blocks.
+ */
+ bblock_t *while_block = earlier_block->next();
+ backend_instruction *while_inst = while_block->start();
+
+ if (jump_inst->opcode == BRW_OPCODE_BREAK &&
+ while_inst->opcode == BRW_OPCODE_WHILE &&
+ while_inst->predicate == BRW_PREDICATE_NONE) {
+ jump_inst->remove(earlier_block);
+ /* jump_inst's fields are still read after it was unlinked above.  The
+ * WHILE gets the inverted sense of the BREAK's predicate.
+ */
+ while_inst->predicate = jump_inst->predicate;
+ while_inst->predicate_inverse = !jump_inst->predicate_inverse;
+
+ earlier_block->children.make_empty();
+ earlier_block->add_successor(s->cfg->mem_ctx, while_block);
+
+ assert(earlier_block->can_combine_with(while_block));
+ earlier_block->combine_with(while_block);
+
+ earlier_block->next()->parents.make_empty();
+ earlier_block->add_successor(s->cfg->mem_ctx, earlier_block->next());
+ }
+
+ progress = true;
+ }
+
+ if (progress)
+ s->invalidate_live_intervals();
+
+ return progress;
+}
diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h
new file mode 100644
index 00000000000..f8c3340e452
--- /dev/null
+++ b/src/intel/compiler/brw_reg.h
@@ -0,0 +1,1135 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <[email protected]>
+ */
+
+/** @file brw_reg.h
+ *
+ * This file defines struct brw_reg, which is our representation for EU
+ * registers. They're not a hardware specific format, just an abstraction
+ * that intends to capture the full flexibility of the hardware registers.
+ *
+ * The brw_eu_emit.c layer's brw_set_dest/brw_set_src[01] functions encode
+ * the abstract brw_reg type into the actual hardware instruction encoding.
+ */
+
+#ifndef BRW_REG_H
+#define BRW_REG_H
+
+#include <stdbool.h>
+#include "main/compiler.h"
+#include "main/macros.h"
+#include "program/prog_instruction.h"
+#include "brw_eu_defines.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct gen_device_info;
+
+/** Number of general purpose registers (VS, WM, etc) */
+#define BRW_MAX_GRF 128
+
+/**
+ * First GRF used for the MRF hack.
+ *
+ * On gen7, MRFs are no longer used, and contiguous GRFs are used instead. We
+ * haven't converted our compiler to be aware of this, so it asks for MRFs and
+ * brw_eu_emit.c quietly converts them to be accesses of the top GRFs. The
+ * register allocators have to be careful of this to avoid corrupting the "MRF"s
+ * with actual GRF allocations.
+ */
+#define GEN7_MRF_HACK_START 112
+
+/**
+ * Number of message register file registers for hardware generation \p gen:
+ * 24 on gen6, 16 otherwise.  The argument is parenthesized so that any
+ * expression may be passed without precedence surprises.
+ */
+#define BRW_MAX_MRF(gen) ((gen) == 6 ? 24 : 16)
+
+/* Pack four 2-bit channel selectors into one swizzle value; the selector for
+ * channel i is stored in bits [2i+1:2i].
+ */
+#define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6))
+/* Extract the 2-bit selector of channel \c idx from swizzle \c swz. */
+#define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3)
+
+/* Named swizzles.  Components are numbered X=0, Y=1, Z=2, W=3; NOOP and XYZW
+ * are the same identity swizzle.
+ */
+#define BRW_SWIZZLE_NOOP BRW_SWIZZLE4(0,1,2,3)
+#define BRW_SWIZZLE_XYZW BRW_SWIZZLE4(0,1,2,3)
+#define BRW_SWIZZLE_XXXX BRW_SWIZZLE4(0,0,0,0)
+#define BRW_SWIZZLE_YYYY BRW_SWIZZLE4(1,1,1,1)
+#define BRW_SWIZZLE_ZZZZ BRW_SWIZZLE4(2,2,2,2)
+#define BRW_SWIZZLE_WWWW BRW_SWIZZLE4(3,3,3,3)
+#define BRW_SWIZZLE_XYXY BRW_SWIZZLE4(0,1,0,1)
+#define BRW_SWIZZLE_YXYX BRW_SWIZZLE4(1,0,1,0)
+#define BRW_SWIZZLE_XZXZ BRW_SWIZZLE4(0,2,0,2)
+#define BRW_SWIZZLE_YZXW BRW_SWIZZLE4(1,2,0,3)
+#define BRW_SWIZZLE_YWYW BRW_SWIZZLE4(1,3,1,3)
+#define BRW_SWIZZLE_ZXYW BRW_SWIZZLE4(2,0,1,3)
+#define BRW_SWIZZLE_ZWZW BRW_SWIZZLE4(2,3,2,3)
+#define BRW_SWIZZLE_WZWZ BRW_SWIZZLE4(3,2,3,2)
+#define BRW_SWIZZLE_WZYX BRW_SWIZZLE4(3,2,1,0)
+#define BRW_SWIZZLE_XXZZ BRW_SWIZZLE4(0,0,2,2)
+#define BRW_SWIZZLE_YYWW BRW_SWIZZLE4(1,1,3,3)
+#define BRW_SWIZZLE_YXWZ BRW_SWIZZLE4(1,0,3,2)
+
+/* Identity swizzle shifted down (INPUT) or up (OUTPUT) by \c comp channels.
+ * The result is not masked back to 8 bits.
+ */
+#define BRW_SWZ_COMP_INPUT(comp) (BRW_SWIZZLE_XYZW >> ((comp)*2))
+#define BRW_SWZ_COMP_OUTPUT(comp) (BRW_SWIZZLE_XYZW << ((comp)*2))
+
+/**
+ * Tell whether \p swiz is one of the four broadcast swizzles (XXXX, YYYY,
+ * ZZZZ or WWWW), i.e. whether every channel selects the same component.
+ */
+static inline bool
+brw_is_single_value_swizzle(unsigned swiz)
+{
+   switch (swiz) {
+   case BRW_SWIZZLE_XXXX:
+   case BRW_SWIZZLE_YYYY:
+   case BRW_SWIZZLE_ZZZZ:
+   case BRW_SWIZZLE_WWWW:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/**
+ * Compose two swizzles: channel i of the result selects component
+ * swz1[swz0[i]], i.e. \p swz0 is applied on the result of \p swz1.  The
+ * argument ordering matches function composition.
+ */
+static inline unsigned
+brw_compose_swizzle(unsigned swz0, unsigned swz1)
+{
+   unsigned composed = 0;
+
+   for (unsigned chan = 0; chan < 4; chan++) {
+      const unsigned sel = BRW_GET_SWZ(swz1, BRW_GET_SWZ(swz0, chan));
+
+      /* Same packing as BRW_SWIZZLE4(): two bits per channel. */
+      composed |= sel << (2 * chan);
+   }
+
+   return composed;
+}
+
+/**
+ * Shuffle the bits of \p mask through swizzle \p swz (the image of the
+ * mask): bit i of the result is set when the component selected by channel
+ * i of \p swz is set in \p mask.
+ */
+static inline unsigned
+brw_apply_swizzle_to_mask(unsigned swz, unsigned mask)
+{
+   unsigned image = 0;
+
+   for (unsigned chan = 0; chan < 4; chan++) {
+      const unsigned src = BRW_GET_SWZ(swz, chan);
+
+      image |= ((mask >> src) & 1u) << chan;
+   }
+
+   return image;
+}
+
+/**
+ * Shuffle the bits of \p mask through the inverse of swizzle \p swz (the
+ * preimage of the mask): for every channel i set in \p mask, the component
+ * selected by channel i of \p swz is set in the result.  Useful to find out
+ * which components are read from a swizzled source given the instruction
+ * writemask.
+ */
+static inline unsigned
+brw_apply_inv_swizzle_to_mask(unsigned swz, unsigned mask)
+{
+   unsigned preimage = 0;
+
+   for (unsigned chan = 0; chan < 4; chan++) {
+      const unsigned enabled = (mask >> chan) & 1u;
+
+      preimage |= enabled << BRW_GET_SWZ(swz, chan);
+   }
+
+   return preimage;
+}
+
+/**
+ * Construct an identity swizzle for the set of enabled channels given by \p
+ * mask.  Enabled channels select themselves; a disabled channel repeats the
+ * closest enabled channel below it, or the lowest enabled channel when there
+ * is none below.  The result therefore only references channels enabled in
+ * \p mask (assuming \p mask is non-zero), so that for any instruction OP:
+ *
+ * brw_OP(p, brw_writemask(dst, mask),
+ * brw_swizzle(src, brw_swizzle_for_mask(mask)));
+ *
+ * is equivalent to the same instruction without swizzle:
+ *
+ * brw_OP(p, brw_writemask(dst, mask), src);
+ */
+static inline unsigned
+brw_swizzle_for_mask(unsigned mask)
+{
+   /* Selector used until the first enabled channel is reached. */
+   unsigned sel = (mask ? ffs(mask) - 1 : 0);
+   unsigned packed = 0;
+
+   for (unsigned chan = 0; chan < 4; chan++) {
+      if (mask & (1 << chan))
+         sel = chan;
+
+      /* Same packing as BRW_SWIZZLE4(): two bits per channel. */
+      packed |= sel << (2 * chan);
+   }
+
+   return packed;
+}
+
+/**
+ * Construct an identity swizzle for the first \p n components of a vector.
+ * When only a subset of channels of a vec4 are used we don't want to
+ * reference the other channels, as that will tell optimization passes that
+ * those other channels are used.
+ */
+static inline unsigned
+brw_swizzle_for_size(unsigned n)
+{
+   /* Mask with the low n bits set. */
+   const unsigned enabled = (1 << n) - 1;
+
+   return brw_swizzle_for_mask(enabled);
+}
+
+/**
+ * Converse of brw_swizzle_for_mask().  Returns the mask of components
+ * accessed by the specified swizzle \p swz, computed as the preimage of a
+ * mask with every channel enabled.
+ */
+static inline unsigned
+brw_mask_for_swizzle(unsigned swz)
+{
+   const unsigned all_channels = ~0;
+
+   return brw_apply_inv_swizzle_to_mask(swz, all_channels);
+}
+
+/**
+ * Abstract register data types.  Values are assigned sequentially starting
+ * from BRW_REGISTER_TYPE_UD = 0.  struct brw_reg stores this enum in a 4-bit
+ * field, so the enumerators must stay within 0..15.
+ */
+enum PACKED brw_reg_type {
+ BRW_REGISTER_TYPE_UD = 0,
+ BRW_REGISTER_TYPE_D,
+ BRW_REGISTER_TYPE_UW,
+ BRW_REGISTER_TYPE_W,
+ BRW_REGISTER_TYPE_F,
+
+ /** Non-immediates only: @{ */
+ BRW_REGISTER_TYPE_UB,
+ BRW_REGISTER_TYPE_B,
+ /** @} */
+
+ /** Immediates only: @{ */
+ BRW_REGISTER_TYPE_UV, /* Gen6+ */
+ BRW_REGISTER_TYPE_V,
+ BRW_REGISTER_TYPE_VF,
+ /** @} */
+
+ BRW_REGISTER_TYPE_DF, /* Gen7+ (no immediates until Gen8+) */
+
+ /* Gen8+ */
+ BRW_REGISTER_TYPE_HF,
+ BRW_REGISTER_TYPE_UQ,
+ BRW_REGISTER_TYPE_Q,
+};
+
+unsigned brw_reg_type_to_hw_type(const struct gen_device_info *devinfo,
+ enum brw_reg_type type, enum brw_reg_file file);
+
+/* Expands to a brw_hw_reg_type_to_size() call for one operand of \c inst.
+ * \c operand is token-pasted into the accessor names
+ * brw_inst_<operand>_reg_type and brw_inst_<operand>_reg_file, so it must be
+ * a bare identifier for which both accessors exist.
+ */
+#define brw_element_size(devinfo, inst, operand) \
+ brw_hw_reg_type_to_size(devinfo, \
+ brw_inst_ ## operand ## _reg_type(devinfo, inst), \
+ brw_inst_ ## operand ## _reg_file(devinfo, inst))
+unsigned brw_hw_reg_type_to_size(const struct gen_device_info *devinfo,
+ unsigned type, enum brw_reg_file file);
+
+const char *brw_reg_type_letters(unsigned brw_reg_type);
+uint32_t brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz);
+
+#define REG_SIZE (8*4)
+
+/* These aren't hardware structs, just something useful for us to pass around:
+ *
+ * Align1 operation has a lot of control over input ranges. Used in
+ * WM programs to implement shaders decomposed into "channel serial"
+ * or "structure of array" form:
+ */
+struct brw_reg {
+ /* Register identity and source modifiers.  The declared bitfield widths
+ * sum to 32 and share storage with \c bits, which lets the whole group be
+ * compared at once (see brw_regs_equal()).
+ */
+ union {
+ struct {
+ enum brw_reg_type type:4;
+ enum brw_reg_file file:3; /* :2 hardware format */
+ unsigned negate:1; /* source only */
+ unsigned abs:1; /* source only */
+ unsigned address_mode:1; /* relative addressing, hopefully! */
+ unsigned pad0:1;
+ unsigned subnr:5; /* :1 in align16 */
+ unsigned nr:16;
+ };
+ uint32_t bits;
+ };
+
+ /* Region/swizzle description (declared widths again sum to 32), sharing
+ * storage with the immediate value members below.
+ */
+ union {
+ struct {
+ unsigned swizzle:8; /* src only, align16 only */
+ unsigned writemask:4; /* dest only, align16 only */
+ int indirect_offset:10; /* relative addressing offset */
+ unsigned vstride:4; /* source only */
+ unsigned width:3; /* src only, align1 only */
+ unsigned hstride:2; /* align1 only */
+ unsigned pad1:1;
+ };
+
+ /* Immediate payloads; the 64-bit members are wider than the bitfields. */
+ double df;
+ uint64_t u64;
+ int64_t d64;
+ float f;
+ int d;
+ unsigned ud;
+ };
+};
+
+/**
+ * Compare two registers for equality.
+ *
+ * The first word (\c bits) is always compared.  For the second word, 64-bit
+ * immediates (DF, Q and UQ) are compared over all 64 bits; for everything
+ * else only the low 32 bits (\c ud) carry data, so only those are compared.
+ */
+static inline bool
+brw_regs_equal(const struct brw_reg *a, const struct brw_reg *b)
+{
+   /* a->bits == b->bits below guarantees both registers share type and
+    * file, so inspecting \p a alone is sufficient.
+    */
+   const bool imm64 = a->file == IMM &&
+                      (a->type == BRW_REGISTER_TYPE_DF ||
+                       a->type == BRW_REGISTER_TYPE_Q ||
+                       a->type == BRW_REGISTER_TYPE_UQ);
+
+   return a->bits == b->bits && (imm64 ? a->u64 == b->u64 : a->ud == b->ud);
+}
+
+/* Handle for register-indirect addressing: an address sub-register number
+ * plus a signed 10-bit offset.  Consumed by the deref_*() helpers and
+ * get_addr_reg().  The declared field widths sum to 32.
+ */
+struct brw_indirect {
+ unsigned addr_subnr:4;
+ int addr_offset:10;
+ unsigned pad:18;
+};
+
+
+/**
+ * Size in bytes of one element of register type \p type (one of the
+ * BRW_REGISTER_TYPE_x values).  Any other value is treated as unreachable.
+ */
+static inline unsigned
+type_sz(unsigned type)
+{
+   switch (type) {
+   case BRW_REGISTER_TYPE_B:
+   case BRW_REGISTER_TYPE_UB:
+      return 1;
+
+   case BRW_REGISTER_TYPE_HF:
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_UW:
+   case BRW_REGISTER_TYPE_V:
+   case BRW_REGISTER_TYPE_UV:
+      return 2;
+
+   case BRW_REGISTER_TYPE_F:
+   case BRW_REGISTER_TYPE_VF:
+   case BRW_REGISTER_TYPE_D:
+   case BRW_REGISTER_TYPE_UD:
+      return 4;
+
+   case BRW_REGISTER_TYPE_DF:
+   case BRW_REGISTER_TYPE_Q:
+   case BRW_REGISTER_TYPE_UQ:
+      return 8;
+
+   default:
+      unreachable("not reached");
+   }
+}
+
+/**
+ * Return the integer register type that is \p sz bytes wide (1, 2, 4 or 8)
+ * with the requested signedness.  Any other size is treated as unreachable.
+ */
+static inline enum brw_reg_type
+brw_int_type(unsigned sz, bool is_signed)
+{
+   if (is_signed) {
+      switch (sz) {
+      case 1: return BRW_REGISTER_TYPE_B;
+      case 2: return BRW_REGISTER_TYPE_W;
+      case 4: return BRW_REGISTER_TYPE_D;
+      case 8: return BRW_REGISTER_TYPE_Q;
+      }
+   } else {
+      switch (sz) {
+      case 1: return BRW_REGISTER_TYPE_UB;
+      case 2: return BRW_REGISTER_TYPE_UW;
+      case 4: return BRW_REGISTER_TYPE_UD;
+      case 8: return BRW_REGISTER_TYPE_UQ;
+      }
+   }
+
+   unreachable("Not reached.");
+}
+
+/**
+ * Construct a brw_reg.
+ * \param file one of the BRW_x_REGISTER_FILE values
+ * \param nr register number/index
+ * \param subnr register sub number, in units of elements of \p type (it is
+ *        multiplied by type_sz(type) and stored in bytes)
+ * \param negate register negate modifier
+ * \param abs register abs modifier
+ * \param type one of BRW_REGISTER_TYPE_x
+ * \param vstride one of BRW_VERTICAL_STRIDE_x
+ * \param width one of BRW_WIDTH_x
+ * \param hstride one of BRW_HORIZONTAL_STRIDE_x
+ * \param swizzle one of BRW_SWIZZLE_x
+ * \param writemask WRITEMASK_X/Y/Z/W bitfield
+ *
+ * The result always uses direct addressing with a zero indirect offset, and
+ * both padding fields are cleared.
+ */
+static inline struct brw_reg
+brw_reg(enum brw_reg_file file,
+ unsigned nr,
+ unsigned subnr,
+ unsigned negate,
+ unsigned abs,
+ enum brw_reg_type type,
+ unsigned vstride,
+ unsigned width,
+ unsigned hstride,
+ unsigned swizzle,
+ unsigned writemask)
+{
+ struct brw_reg reg;
+ /* Range-check the register number for the files whose limits are known
+ * here.
+ */
+ if (file == BRW_GENERAL_REGISTER_FILE)
+ assert(nr < BRW_MAX_GRF);
+ else if (file == BRW_ARCHITECTURE_REGISTER_FILE)
+ assert(nr <= BRW_ARF_TIMESTAMP);
+ /* Asserting on the MRF register number requires to know the hardware gen
+ * (gen6 has 24 MRF registers), which we don't know here, so we assert
+ * for that in the generators and in brw_eu_emit.c
+ */
+
+ reg.type = type;
+ reg.file = file;
+ reg.negate = negate;
+ reg.abs = abs;
+ reg.address_mode = BRW_ADDRESS_DIRECT;
+ reg.pad0 = 0;
+ /* subnr is given in elements but stored in bytes. */
+ reg.subnr = subnr * type_sz(type);
+ reg.nr = nr;
+
+ /* Could do better: If the reg is r5.3<0;1,0>, we probably want to
+ * set swizzle and writemask to W, as the lower bits of subnr will
+ * be lost when converted to align16. This is probably too much to
+ * keep track of as you'd want it adjusted by suboffset(), etc.
+ * Perhaps fix up when converting to align16?
+ */
+ reg.swizzle = swizzle;
+ reg.writemask = writemask;
+ reg.indirect_offset = 0;
+ reg.vstride = vstride;
+ reg.width = width;
+ reg.hstride = hstride;
+ reg.pad1 = 0;
+ return reg;
+}
+
+/** Construct float[16] register: region <16;16,1>, all four channels. */
+static inline struct brw_reg
+brw_vec16_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   return brw_reg(file, nr, subnr,
+                  0 /* negate */, 0 /* abs */,
+                  BRW_REGISTER_TYPE_F,
+                  BRW_VERTICAL_STRIDE_16, BRW_WIDTH_16, BRW_HORIZONTAL_STRIDE_1,
+                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+}
+
+/** Construct float[8] register: region <8;8,1>, all four channels. */
+static inline struct brw_reg
+brw_vec8_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   return brw_reg(file, nr, subnr,
+                  0 /* negate */, 0 /* abs */,
+                  BRW_REGISTER_TYPE_F,
+                  BRW_VERTICAL_STRIDE_8, BRW_WIDTH_8, BRW_HORIZONTAL_STRIDE_1,
+                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+}
+
+/** Construct float[4] register: region <4;4,1>, all four channels. */
+static inline struct brw_reg
+brw_vec4_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   return brw_reg(file, nr, subnr,
+                  0 /* negate */, 0 /* abs */,
+                  BRW_REGISTER_TYPE_F,
+                  BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4, BRW_HORIZONTAL_STRIDE_1,
+                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+}
+
+/** Construct float[2] register: region <2;2,1>, XYXY swizzle, XY writemask. */
+static inline struct brw_reg
+brw_vec2_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   return brw_reg(file, nr, subnr,
+                  0 /* negate */, 0 /* abs */,
+                  BRW_REGISTER_TYPE_F,
+                  BRW_VERTICAL_STRIDE_2, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_1,
+                  BRW_SWIZZLE_XYXY, WRITEMASK_XY);
+}
+
+/** Construct float[1] register: region <0;1,0>, XXXX swizzle, X writemask. */
+static inline struct brw_reg
+brw_vec1_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   return brw_reg(file, nr, subnr,
+                  0 /* negate */, 0 /* abs */,
+                  BRW_REGISTER_TYPE_F,
+                  BRW_VERTICAL_STRIDE_0, BRW_WIDTH_1, BRW_HORIZONTAL_STRIDE_0,
+                  BRW_SWIZZLE_XXXX, WRITEMASK_X);
+}
+
+/**
+ * Construct a float[\p width] register by dispatching to the matching
+ * brw_vecN_reg() constructor.  \p width must be 1, 2, 4, 8 or 16; any other
+ * value is treated as unreachable.
+ */
+static inline struct brw_reg
+brw_vecn_reg(unsigned width, enum brw_reg_file file,
+             unsigned nr, unsigned subnr)
+{
+   if (width == 1)
+      return brw_vec1_reg(file, nr, subnr);
+   if (width == 2)
+      return brw_vec2_reg(file, nr, subnr);
+   if (width == 4)
+      return brw_vec4_reg(file, nr, subnr);
+   if (width == 8)
+      return brw_vec8_reg(file, nr, subnr);
+   if (width == 16)
+      return brw_vec16_reg(file, nr, subnr);
+
+   unreachable("Invalid register width");
+}
+
+/** Return a copy of \p reg with its type replaced; nothing else changes. */
+static inline struct brw_reg
+retype(struct brw_reg reg, enum brw_reg_type type)
+{
+ reg.type = type;
+ return reg;
+}
+
+/** Return \p reg unchanged (counterpart of sechalf()). */
+static inline struct brw_reg
+firsthalf(struct brw_reg reg)
+{
+ return reg;
+}
+
+/**
+ * Return \p reg advanced by one register number, unless its vstride field
+ * is zero, in which case it is returned unchanged.
+ */
+static inline struct brw_reg
+sechalf(struct brw_reg reg)
+{
+ if (reg.vstride)
+ reg.nr++;
+ return reg;
+}
+
+/** Return \p reg with its register number advanced by \p delta. */
+static inline struct brw_reg
+offset(struct brw_reg reg, unsigned delta)
+{
+ reg.nr += delta;
+ return reg;
+}
+
+
+/**
+ * Return \p reg moved forward by \p bytes bytes.  The register number and
+ * sub-register byte offset are flattened into one linear byte address,
+ * advanced, and split again on REG_SIZE boundaries.
+ */
+static inline struct brw_reg
+byte_offset(struct brw_reg reg, unsigned bytes)
+{
+   const unsigned linear = reg.nr * REG_SIZE + reg.subnr + bytes;
+
+   reg.nr = linear / REG_SIZE;
+   reg.subnr = linear % REG_SIZE;
+   return reg;
+}
+
+/**
+ * Return \p reg moved forward by \p delta elements of its current type.
+ */
+static inline struct brw_reg
+suboffset(struct brw_reg reg, unsigned delta)
+{
+   const unsigned bytes = delta * type_sz(reg.type);
+
+   return byte_offset(reg, bytes);
+}
+
+/** Construct unsigned word[16] register; \p subnr counts UW elements. */
+static inline struct brw_reg
+brw_uw16_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   /* Build at sub-register 0, retype, then step in UW-sized elements. */
+   const struct brw_reg base =
+      retype(brw_vec16_reg(file, nr, 0), BRW_REGISTER_TYPE_UW);
+
+   return suboffset(base, subnr);
+}
+
+/** Construct unsigned word[8] register; \p subnr counts UW elements. */
+static inline struct brw_reg
+brw_uw8_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   const struct brw_reg base =
+      retype(brw_vec8_reg(file, nr, 0), BRW_REGISTER_TYPE_UW);
+
+   return suboffset(base, subnr);
+}
+
+/** Construct unsigned word[1] register; \p subnr counts UW elements. */
+static inline struct brw_reg
+brw_uw1_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   const struct brw_reg base =
+      retype(brw_vec1_reg(file, nr, 0), BRW_REGISTER_TYPE_UW);
+
+   return suboffset(base, subnr);
+}
+
+/**
+ * Construct unsigned dword[1] register.  \p subnr is applied by the float
+ * constructor before retyping, so it counts 4-byte elements.
+ */
+static inline struct brw_reg
+brw_ud1_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   const struct brw_reg base = brw_vec1_reg(file, nr, subnr);
+
+   return retype(base, BRW_REGISTER_TYPE_UD);
+}
+
+/**
+ * Construct an immediate register of the given \p type with a scalar
+ * <0;1,0> region.  The immediate value itself is left for the caller to
+ * fill in.
+ */
+static inline struct brw_reg
+brw_imm_reg(enum brw_reg_type type)
+{
+   return brw_reg(BRW_IMMEDIATE_VALUE,
+                  0 /* nr */, 0 /* subnr */,
+                  0 /* negate */, 0 /* abs */,
+                  type,
+                  BRW_VERTICAL_STRIDE_0, BRW_WIDTH_1, BRW_HORIZONTAL_STRIDE_0,
+                  0 /* swizzle */, 0 /* writemask */);
+}
+
+/** Construct double-precision float (DF) immediate register */
+static inline struct brw_reg
+brw_imm_df(double df)
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_DF);
+ imm.df = df;
+ return imm;
+}
+
+/** Construct single-precision float immediate register */
+static inline struct brw_reg
+brw_imm_f(float f)
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_F);
+ imm.f = f;
+ return imm;
+}
+
+/** Construct integer immediate register */
+static inline struct brw_reg
+brw_imm_d(int d)
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_D);
+ imm.d = d;
+ return imm;
+}
+
+/** Construct uint immediate register */
+static inline struct brw_reg
+brw_imm_ud(unsigned ud)
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UD);
+ imm.ud = ud;
+ return imm;
+}
+
+/**
+ * Construct ushort immediate register.  The 16-bit value is replicated into
+ * both halves of the 32-bit immediate.
+ */
+static inline struct brw_reg
+brw_imm_uw(uint16_t uw)
+{
+   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW);
+
+   /* Widen to uint32_t before shifting: uw would otherwise promote to a
+    * signed int, and shifting a value >= 0x8000 left by 16 overflows it.
+    */
+   imm.ud = (uint32_t)uw | ((uint32_t)uw << 16);
+   return imm;
+}
+
+/**
+ * Construct short immediate register.  The 16-bit two's complement pattern
+ * of \p w is replicated into both halves of the 32-bit immediate.
+ */
+static inline struct brw_reg
+brw_imm_w(int16_t w)
+{
+   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W);
+
+   /* Drop the sign extension first.  OR-ing the promoted (sign-extended)
+    * value directly would force the high half to 0xffff for every negative
+    * w, and left-shifting a negative int is undefined.
+    */
+   const uint32_t half = (uint16_t)w;
+
+   imm.ud = half | (half << 16);
+   return imm;
+}
+
+/* brw_imm_b and brw_imm_ub aren't supported by hardware - the type
+ * numbers alias with _V and _VF below:
+ */
+
+/** Construct vector of eight signed half-byte values */
+static inline struct brw_reg
+brw_imm_v(unsigned v)
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_V);
+ imm.ud = v;
+ return imm;
+}
+
+/** Construct vector of eight unsigned half-byte values */
+static inline struct brw_reg
+brw_imm_uv(unsigned uv)
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UV);
+ imm.ud = uv;
+ return imm;
+}
+
+/** Construct vector of four 8-bit float values */
+static inline struct brw_reg
+brw_imm_vf(unsigned v)
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF);
+ imm.ud = v;
+ return imm;
+}
+
+/**
+ * Construct a VF immediate from four separately supplied 8-bit values, with
+ * \p v0 in the lowest byte and \p v3 in the highest.  Unlike brw_imm_vf(),
+ * the region is set to <0;4,1>.  The arguments are not masked, so each one
+ * must fit in 8 bits.
+ */
+static inline struct brw_reg
+brw_imm_vf4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF);
+ imm.vstride = BRW_VERTICAL_STRIDE_0;
+ imm.width = BRW_WIDTH_4;
+ imm.hstride = BRW_HORIZONTAL_STRIDE_1;
+ imm.ud = ((v0 << 0) | (v1 << 8) | (v2 << 16) | (v3 << 24));
+ return imm;
+}
+
+
+/**
+ * Construct a UW immediate holding the byte address of \p reg within its
+ * register file (nr * REG_SIZE + subnr).  The value is passed through
+ * brw_imm_uw(), so it is truncated to 16 bits.
+ */
+static inline struct brw_reg
+brw_address(struct brw_reg reg)
+{
+ return brw_imm_uw(reg.nr * REG_SIZE + reg.subnr);
+}
+
+/** Construct float[1] general-purpose register */
+static inline struct brw_reg
+brw_vec1_grf(unsigned nr, unsigned subnr)
+{
+ return brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
+/** Construct float[2] general-purpose register */
+static inline struct brw_reg
+brw_vec2_grf(unsigned nr, unsigned subnr)
+{
+ return brw_vec2_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
+/** Construct float[4] general-purpose register */
+static inline struct brw_reg
+brw_vec4_grf(unsigned nr, unsigned subnr)
+{
+ return brw_vec4_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
+/** Construct float[8] general-purpose register */
+static inline struct brw_reg
+brw_vec8_grf(unsigned nr, unsigned subnr)
+{
+ return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
+/** Construct float[16] general-purpose register */
+static inline struct brw_reg
+brw_vec16_grf(unsigned nr, unsigned subnr)
+{
+ return brw_vec16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
+/** Construct float[width] general-purpose register (see brw_vecn_reg()) */
+static inline struct brw_reg
+brw_vecn_grf(unsigned width, unsigned nr, unsigned subnr)
+{
+ return brw_vecn_reg(width, BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
+
+/** Construct unsigned word[8] general-purpose register */
+static inline struct brw_reg
+brw_uw8_grf(unsigned nr, unsigned subnr)
+{
+ return brw_uw8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
+/** Construct unsigned word[16] general-purpose register */
+static inline struct brw_reg
+brw_uw16_grf(unsigned nr, unsigned subnr)
+{
+ return brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
+
+/** Construct null register (usually used for setting condition codes) */
+static inline struct brw_reg
+brw_null_reg(void)
+{
+ return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_NULL, 0);
+}
+
+/** Construct a null register with a float[width] region (see brw_vecn_reg()) */
+static inline struct brw_reg
+brw_null_vec(unsigned width)
+{
+ return brw_vecn_reg(width, BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_NULL, 0);
+}
+
+/** Construct a UW scalar for sub-register \p subnr of the address register */
+static inline struct brw_reg
+brw_address_reg(unsigned subnr)
+{
+ return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_ADDRESS, subnr);
+}
+
+/* If/else instructions break in align16 mode if writemask & swizzle
+ * aren't xyzw. This goes against the convention for other scalar
+ * regs:
+ */
+static inline struct brw_reg
+brw_ip_reg(void)
+{
+   return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_IP,
+                  0 /* subnr */, 0 /* negate */, 0 /* abs */,
+                  BRW_REGISTER_TYPE_UD,
+                  BRW_VERTICAL_STRIDE_4, /* ? */
+                  BRW_WIDTH_1, BRW_HORIZONTAL_STRIDE_0,
+                  BRW_SWIZZLE_XYZW, /* NOTE! */
+                  WRITEMASK_XYZW);  /* NOTE! */
+}
+
+/** Construct a UD scalar for the notification count architecture register. */
+static inline struct brw_reg
+brw_notification_reg(void)
+{
+   return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_NOTIFICATION_COUNT,
+                  0 /* subnr */, 0 /* negate */, 0 /* abs */,
+                  BRW_REGISTER_TYPE_UD,
+                  BRW_VERTICAL_STRIDE_0, BRW_WIDTH_1, BRW_HORIZONTAL_STRIDE_0,
+                  BRW_SWIZZLE_XXXX, WRITEMASK_X);
+}
+
+/** Construct a UD scalar for sub-register \p subnr of the state register */
+static inline struct brw_reg
+brw_sr0_reg(unsigned subnr)
+{
+ return brw_ud1_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_STATE, subnr);
+}
+
+/** Construct a float[width] view of the accumulator (see brw_vecn_reg()) */
+static inline struct brw_reg
+brw_acc_reg(unsigned width)
+{
+ return brw_vecn_reg(width, BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_ACCUMULATOR, 0);
+}
+
+/**
+ * Construct a UW scalar for flag register \p reg, sub-register \p subreg.
+ * \p reg is added to BRW_ARF_FLAG to form the register number.
+ */
+static inline struct brw_reg
+brw_flag_reg(int reg, int subreg)
+{
+ return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_FLAG + reg, subreg);
+}
+
+/**
+ * Return the mask register present in Gen4-5, or the related register present
+ * in Gen7.5 and later hardware referred to as "channel enable" register in
+ * the documentation.
+ */
+static inline struct brw_reg
+brw_mask_reg(unsigned subnr)
+{
+ return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_MASK, subnr);
+}
+
+/**
+ * Construct the vmask register: state register sub-register 3 as a UD
+ * scalar (see brw_sr0_reg()).
+ */
+static inline struct brw_reg
+brw_vmask_reg(void)
+{
+   return brw_sr0_reg(3);
+}
+
+/**
+ * Construct the dmask register: state register sub-register 2 as a UD
+ * scalar (see brw_sr0_reg()).
+ */
+static inline struct brw_reg
+brw_dmask_reg(void)
+{
+   return brw_sr0_reg(2);
+}
+
+/** Construct float[8] message register \p nr at sub-register 0 */
+static inline struct brw_reg
+brw_message_reg(unsigned nr)
+{
+ return brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, nr, 0);
+}
+
+/**
+ * Construct a UD message register with a \p width-wide region.  \p subnr is
+ * applied by the float constructor before retyping, so it counts 4-byte
+ * elements.
+ */
+static inline struct brw_reg
+brw_uvec_mrf(unsigned width, unsigned nr, unsigned subnr)
+{
+ return retype(brw_vecn_reg(width, BRW_MESSAGE_REGISTER_FILE, nr, subnr),
+ BRW_REGISTER_TYPE_UD);
+}
+
+/* Map 0 to 0 and each power of two up to 32 to log2(val) + 1; every other
+ * value maps to 0.  This is almost always called with a numeric constant
+ * argument, so the body is kept simple enough to fold at compile time.
+ */
+static inline unsigned cvt(unsigned val)
+{
+   /* Reject 0, anything above 32 and anything that is not a power of two. */
+   if (val == 0 || val > 32 || (val & (val - 1)) != 0)
+      return 0;
+
+   unsigned enc = 1;
+   while (val > 1) {
+      val >>= 1;
+      enc++;
+   }
+
+   return enc;
+}
+
+/**
+ * Return \p reg with its region replaced.  The arguments are plain element
+ * counts (e.g. 8, 8, 1), converted to field encodings with cvt(); the width
+ * field stores the cvt() result minus one.
+ */
+static inline struct brw_reg
+stride(struct brw_reg reg, unsigned vstride, unsigned width, unsigned hstride)
+{
+ reg.vstride = cvt(vstride);
+ reg.width = cvt(width) - 1;
+ reg.hstride = cvt(hstride);
+ return reg;
+}
+
+/**
+ * Multiply the vertical and horizontal stride of a register by the given
+ * factor \a s, which must be a power of two.  Strides whose field is zero
+ * are left alone.  A factor of zero collapses the region to a <0;1,0>
+ * scalar.
+ */
+static inline struct brw_reg
+spread(struct brw_reg reg, unsigned s)
+{
+   if (s == 0)
+      return stride(reg, 0, 1, 0);
+
+   assert(_mesa_is_pow_two(s));
+
+   /* cvt(s) - 1 is log2(s); adding it to a non-zero stride field multiplies
+    * the stride by s.
+    */
+   const unsigned log2_s = cvt(s) - 1;
+
+   if (reg.hstride)
+      reg.hstride += log2_s;
+
+   if (reg.vstride)
+      reg.vstride += log2_s;
+
+   return reg;
+}
+
+/** Return \p reg with a <16;16,1> region */
+static inline struct brw_reg
+vec16(struct brw_reg reg)
+{
+ return stride(reg, 16,16,1);
+}
+
+/** Return \p reg with an <8;8,1> region */
+static inline struct brw_reg
+vec8(struct brw_reg reg)
+{
+ return stride(reg, 8,8,1);
+}
+
+/** Return \p reg with a <4;4,1> region */
+static inline struct brw_reg
+vec4(struct brw_reg reg)
+{
+ return stride(reg, 4,4,1);
+}
+
+/** Return \p reg with a <2;2,1> region */
+static inline struct brw_reg
+vec2(struct brw_reg reg)
+{
+ return stride(reg, 2,2,1);
+}
+
+/** Return \p reg with a scalar <0;1,0> region */
+static inline struct brw_reg
+vec1(struct brw_reg reg)
+{
+ return stride(reg, 0,1,0);
+}
+
+
+/**
+ * Return a scalar (<0;1,0>) view of element \p elt of \p reg, where elements
+ * are counted in units of the register's current type.
+ */
+static inline struct brw_reg
+get_element(struct brw_reg reg, unsigned elt)
+{
+   const struct brw_reg elem = suboffset(reg, elt);
+
+   return vec1(elem);
+}
+
+/** Like get_element(), with \p reg first retyped to UD. */
+static inline struct brw_reg
+get_element_ud(struct brw_reg reg, unsigned elt)
+{
+   return get_element(retype(reg, BRW_REGISTER_TYPE_UD), elt);
+}
+
+/** Like get_element(), with \p reg first retyped to D. */
+static inline struct brw_reg
+get_element_d(struct brw_reg reg, unsigned elt)
+{
+   return get_element(retype(reg, BRW_REGISTER_TYPE_D), elt);
+}
+
+/**
+ * Apply swizzle \p swz to \p reg.  For non-immediates \p swz is composed
+ * with the swizzle already present on the register; for immediates the
+ * value itself is rewritten by brw_swizzle_immediate().
+ */
+static inline struct brw_reg
+brw_swizzle(struct brw_reg reg, unsigned swz)
+{
+   if (reg.file != BRW_IMMEDIATE_VALUE) {
+      reg.swizzle = brw_compose_swizzle(swz, reg.swizzle);
+      return reg;
+   }
+
+   reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swz);
+   return reg;
+}
+
+/**
+ * Return \p reg with its writemask restricted to \p mask (the existing
+ * writemask is ANDed with \p mask).  Must not be used on immediates.
+ */
+static inline struct brw_reg
+brw_writemask(struct brw_reg reg, unsigned mask)
+{
+ assert(reg.file != BRW_IMMEDIATE_VALUE);
+ reg.writemask &= mask;
+ return reg;
+}
+
+/**
+ * Return \p reg with its writemask replaced by \p mask, ignoring the
+ * previous writemask.  Must not be used on immediates.
+ */
+static inline struct brw_reg
+brw_set_writemask(struct brw_reg reg, unsigned mask)
+{
+ assert(reg.file != BRW_IMMEDIATE_VALUE);
+ reg.writemask = mask;
+ return reg;
+}
+
+/** Return a writemask with the low \p n channel bits set. */
+static inline unsigned
+brw_writemask_for_size(unsigned n)
+{
+   const unsigned next_bit = 1 << n;
+
+   return next_bit - 1;
+}
+
+/**
+ * Return a writemask with \p n consecutive channel bits set, starting at
+ * channel \p first_component.  The run must fit within the four channels.
+ */
+static inline unsigned
+brw_writemask_for_component_packing(unsigned n, unsigned first_component)
+{
+   assert(first_component + n <= 4);
+
+   const unsigned low_bits = (1 << n) - 1;
+
+   return low_bits << first_component;
+}
+
+/** Return \p reg with its negate modifier toggled. */
+static inline struct brw_reg
+negate(struct brw_reg reg)
+{
+ reg.negate ^= 1;
+ return reg;
+}
+
+/**
+ * Return \p reg with the abs modifier set.  Any negate modifier is cleared.
+ */
+static inline struct brw_reg
+brw_abs(struct brw_reg reg)
+{
+ reg.abs = 1;
+ reg.negate = 0;
+ return reg;
+}
+
+/************************************************************************/
+
+/**
+ * Construct a float[4] register-indirect GRF reference.  \p subnr is stored
+ * in the subnr field as given (it is not scaled by the type size) and
+ * \p offset goes into the indirect offset field.
+ */
+static inline struct brw_reg
+brw_vec4_indirect(unsigned subnr, int offset)
+{
+ struct brw_reg reg = brw_vec4_grf(0, 0);
+ reg.subnr = subnr;
+ reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
+ reg.indirect_offset = offset;
+ return reg;
+}
+
+/** Scalar (float[1]) variant of brw_vec4_indirect(). */
+static inline struct brw_reg
+brw_vec1_indirect(unsigned subnr, int offset)
+{
+ struct brw_reg reg = brw_vec1_grf(0, 0);
+ reg.subnr = subnr;
+ reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
+ reg.indirect_offset = offset;
+ return reg;
+}
+
+/**
+ * Like brw_vec1_indirect(), but with the vertical stride set to
+ * BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL.
+ */
+static inline struct brw_reg
+brw_VxH_indirect(unsigned subnr, int offset)
+{
+ struct brw_reg reg = brw_vec1_grf(0, 0);
+ reg.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+ reg.subnr = subnr;
+ reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
+ reg.indirect_offset = offset;
+ return reg;
+}
+
+/**
+ * Dereference \p ptr as float[4]: a register-indirect reference through
+ * ptr.addr_subnr at ptr.addr_offset + \p offset.
+ */
+static inline struct brw_reg
+deref_4f(struct brw_indirect ptr, int offset)
+{
+ return brw_vec4_indirect(ptr.addr_subnr, ptr.addr_offset + offset);
+}
+
+/** Dereference \p ptr as a float scalar at ptr.addr_offset + \p offset. */
+static inline struct brw_reg
+deref_1f(struct brw_indirect ptr, int offset)
+{
+ return brw_vec1_indirect(ptr.addr_subnr, ptr.addr_offset + offset);
+}
+
+/** deref_4f() retyped to B. */
+static inline struct brw_reg
+deref_4b(struct brw_indirect ptr, int offset)
+{
+ return retype(deref_4f(ptr, offset), BRW_REGISTER_TYPE_B);
+}
+
+/** deref_1f() retyped to UW. */
+static inline struct brw_reg
+deref_1uw(struct brw_indirect ptr, int offset)
+{
+ return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UW);
+}
+
+/** deref_1f() retyped to D. */
+static inline struct brw_reg
+deref_1d(struct brw_indirect ptr, int offset)
+{
+ return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_D);
+}
+
+/** deref_1f() retyped to UD. */
+static inline struct brw_reg
+deref_1ud(struct brw_indirect ptr, int offset)
+{
+ return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UD);
+}
+
+/** Return the address register sub-register that \p ptr refers to. */
+static inline struct brw_reg
+get_addr_reg(struct brw_indirect ptr)
+{
+ return brw_address_reg(ptr.addr_subnr);
+}
+
+/** Return \p ptr with \p offset added to its address offset. */
+static inline struct brw_indirect
+brw_indirect_offset(struct brw_indirect ptr, int offset)
+{
+ ptr.addr_offset += offset;
+ return ptr;
+}
+
+/**
+ * Construct a brw_indirect from an address sub-register number and an
+ * offset; the padding field is cleared.
+ */
+static inline struct brw_indirect
+brw_indirect(unsigned addr_subnr, int offset)
+{
+ struct brw_indirect ptr;
+ ptr.addr_subnr = addr_subnr;
+ ptr.addr_offset = offset;
+ ptr.pad = 0;
+ return ptr;
+}
+
+/**
+ * Tell whether the region of \p reg is exactly <v;w,h>, comparing the raw
+ * vstride, width and hstride field encodings.
+ */
+static inline bool
+region_matches(struct brw_reg reg, enum brw_vertical_stride v,
+               enum brw_width w, enum brw_horizontal_stride h)
+{
+   if (reg.vstride != v)
+      return false;
+
+   if (reg.width != w)
+      return false;
+
+   return reg.hstride == h;
+}
+
+/* True when \c reg has the scalar <0;1,0> region. */
+#define has_scalar_region(reg) \
+   region_matches(reg, BRW_VERTICAL_STRIDE_0, BRW_WIDTH_1, \
+                  BRW_HORIZONTAL_STRIDE_0)
+
+/* brw_packed_float.c */
+int brw_float_to_vf(float f);
+float brw_vf_to_float(unsigned char vf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp
new file mode 100644
index 00000000000..b3f7e877c80
--- /dev/null
+++ b/src/intel/compiler/brw_schedule_instructions.cpp
@@ -0,0 +1,1753 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Eric Anholt <[email protected]>
+ *
+ */
+
+#include "brw_fs.h"
+#include "brw_fs_live_variables.h"
+#include "brw_vec4.h"
+#include "brw_cfg.h"
+#include "brw_shader.h"
+
+using namespace brw;
+
+/** @file brw_schedule_instructions.cpp
+ *
+ * List scheduling of FS and vec4 instructions.
+ *
+ * The basic model of the list scheduler is to take a basic block,
+ * compute a DAG of the dependencies (RAW ordering with latency, WAW
+ * ordering with latency, WAR ordering), and make a list of the DAG heads.
+ * Heuristically pick a DAG head, then put all the children that are
+ * now DAG heads into the list of things to schedule.
+ *
+ * The heuristic is the important part. We're trying to be cheap,
+ * since actually computing the optimal scheduling is NP complete.
+ * What we do is track a "current clock". When we schedule a node, we
+ * update the earliest-unblocked clock time of its children, and
+ * increment the clock. Then, when trying to schedule, we just pick
+ * the earliest-unblocked instruction to schedule.
+ *
+ * Note that often there will be many things which could execute
+ * immediately, and there are a range of heuristic options to choose
+ * from in picking among those.
+ */
+
+static bool debug = false;
+
+class instruction_scheduler;
+
+class schedule_node : public exec_node /* one scheduling-DAG node per instruction */
+{
+public:
+ schedule_node(backend_instruction *inst, instruction_scheduler *sched);
+ void set_latency_gen4(); /* latency table the constructor uses when gen < 6 */
+ void set_latency_gen7(bool is_haswell); /* latency table the constructor uses when gen >= 6 */
+
+ backend_instruction *inst; /* the instruction this node stands for */
+ schedule_node **children; /* nodes that add_dep() ordered after this one */
+ int *child_latency; /* latency of each edge, parallel to children[] */
+ int child_count; /* number of entries in use in children[] */
+ int parent_count; /* number of add_dep() edges that end at this node */
+ int child_array_size; /* allocated capacity of children[] / child_latency[] */
+ int unblocked_time; /* compute_exits() raises this to a lower bound of the node's scheduling time */
+ int latency; /* 1 before register allocation, otherwise set by set_latency_gen4()/gen7() */
+
+ /**
+ * Which iteration of pushing groups of children onto the candidates list
+ * this node was a part of.
+ */
+ unsigned cand_generation;
+
+ /**
+ * This is the sum of the instruction's latency plus the maximum delay of
+ * its children, or just the issue_time if it's a leaf node.
+ *
+ * Filled in by instruction_scheduler::compute_delays().
+ */
+ int delay;
+
+ /**
+ * Preferred exit node among the (direct or indirect) successors of this
+ * node. Among the scheduler nodes blocked by this node, this will be the
+ * one that may cause earliest program termination, or NULL if none of the
+ * successors is an exit node.
+ *
+ * Filled in by instruction_scheduler::compute_exits().
+ */
+ schedule_node *exit;
+
+ bool is_barrier; /* set by add_barrier_deps() */
+};
+
+/**
+ * Earliest scheduling time at which some instruction blocked by @n could
+ * terminate the program: the unblocked_time of @n's preferred exit node, or
+ * INT_MAX when @n has no exit node.
+ *
+ * The value induces a strict partial order '«' on scheduler nodes:
+ *
+ *   n « m <-> exit_unblocked_time(n) < exit_unblocked_time(m)
+ *
+ * which serves as a heuristic for ranking nodes by how soon they can unblock
+ * an exit node and lead to program termination.
+ */
+static inline int
+exit_unblocked_time(const schedule_node *n)
+{
+   if (n->exit == NULL)
+      return INT_MAX;
+
+   return n->exit->unblocked_time;
+}
+
+/**
+ * Latency table selected by the constructor for gen < 6.  The math opcodes
+ * are charged a per-opcode number of rounds, each round costing 22 cycles
+ * for each of 8 channels; every other opcode gets a latency of 2.
+ */
+void
+schedule_node::set_latency_gen4()
+{
+   /* channels * per-channel math latency */
+   const int cycles_per_round = 8 * 22;
+   int rounds;
+
+   switch (inst->opcode) {
+   case SHADER_OPCODE_RCP:
+      rounds = 1;
+      break;
+   case SHADER_OPCODE_RSQ:
+      rounds = 2;
+      break;
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_LOG2:
+      /* full precision log. partial is 2. */
+      rounds = 3;
+      break;
+   case SHADER_OPCODE_INT_REMAINDER:
+   case SHADER_OPCODE_EXP2:
+      /* full precision. partial is 3, same throughput. */
+      rounds = 4;
+      break;
+   case SHADER_OPCODE_POW:
+      rounds = 8;
+      break;
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      /* minimum latency, max is 12 rounds. */
+      rounds = 5;
+      break;
+   default:
+      /* Not one of the math opcodes listed above. */
+      this->latency = 2;
+      return;
+   }
+
+   this->latency = rounds * cycles_per_round;
+}
+
+void
+schedule_node::set_latency_gen7(bool is_haswell)
+{ /* Sets this->latency, in cycles, from inst->opcode; the constructor calls
+ * this when gen >= 6. is_haswell selects the Haswell figure for MAD, the
+ * single-operand math opcodes, POW and the surface read/write opcodes.
+ *
+ * NOTE(review): for the ALU and math cases the value stored appears to be
+ * the time measured with a dependent MOV minus the 2 cycles measured for
+ * the instruction alone (e.g. default: 16 - 2 = 14) - inferred from the
+ * figures quoted below; confirm.
+ */
+ switch (inst->opcode) {
+ case BRW_OPCODE_MAD:
+ /* 2 cycles
+ * (since the last two src operands are in different register banks):
+ * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
+ *
+ * 3 cycles on IVB, 4 on HSW
+ * (since the last two src operands are in the same register bank):
+ * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
+ *
+ * 18 cycles on IVB, 16 on HSW
+ * (since the last two src operands are in different register banks):
+ * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
+ * mov(8) null g4<4,5,1>F { align16 WE_normal 1Q };
+ *
+ * 20 cycles on IVB, 18 on HSW
+ * (since the last two src operands are in the same register bank):
+ * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
+ * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
+ */
+
+ /* Our register allocator doesn't know about register banks, so use the
+ * higher latency.
+ */
+ latency = is_haswell ? 16 : 18;
+ break;
+
+ case BRW_OPCODE_LRP:
+ /* 2 cycles
+ * (since the last two src operands are in different register banks):
+ * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
+ *
+ * 3 cycles on IVB, 4 on HSW
+ * (since the last two src operands are in the same register bank):
+ * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
+ *
+ * 16 cycles on IVB, 14 on HSW
+ * (since the last two src operands are in different register banks):
+ * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
+ * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
+ *
+ * 16 cycles
+ * (since the last two src operands are in the same register bank):
+ * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
+ * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
+ */
+
+ /* Our register allocator doesn't know about register banks, so use the
+ * higher latency.
+ */
+ latency = 14; /* same value on IVB and HSW */
+ break;
+
+ case SHADER_OPCODE_RCP:
+ case SHADER_OPCODE_RSQ:
+ case SHADER_OPCODE_SQRT:
+ case SHADER_OPCODE_LOG2:
+ case SHADER_OPCODE_EXP2:
+ case SHADER_OPCODE_SIN:
+ case SHADER_OPCODE_COS:
+ /* 2 cycles:
+ * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
+ *
+ * 18 cycles:
+ * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ *
+ * Same for exp2, log2, rsq, sqrt, sin, cos.
+ */
+ latency = is_haswell ? 14 : 16;
+ break;
+
+ case SHADER_OPCODE_POW:
+ /* 2 cycles:
+ * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
+ *
+ * 26 cycles:
+ * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ */
+ latency = is_haswell ? 22 : 24;
+ break;
+
+ case SHADER_OPCODE_TEX:
+ case SHADER_OPCODE_TXD:
+ case SHADER_OPCODE_TXF:
+ case SHADER_OPCODE_TXF_LZ:
+ case SHADER_OPCODE_TXL:
+ case SHADER_OPCODE_TXL_LZ:
+ /* 18 cycles:
+ * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
+ * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
+ * send(8) g4<1>UW g114<8,8,1>F
+ * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
+ *
+ * 697 +/-49 cycles (min 610, n=26):
+ * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
+ * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
+ * send(8) g4<1>UW g114<8,8,1>F
+ * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ *
+ * So the latency on our first texture load of the batchbuffer takes
+ * ~700 cycles, since the caches are cold at that point.
+ *
+ * 840 +/- 92 cycles (min 720, n=25):
+ * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
+ * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
+ * send(8) g4<1>UW g114<8,8,1>F
+ * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ * send(8) g4<1>UW g114<8,8,1>F
+ * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ *
+ * On the second load, it takes just an extra ~140 cycles, and after
+ * accounting for the 14 cycles of the MOV's latency, that makes ~130.
+ *
+ * 683 +/- 49 cycles (min = 602, n=47):
+ * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
+ * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
+ * send(8) g4<1>UW g114<8,8,1>F
+ * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
+ * send(8) g50<1>UW g114<8,8,1>F
+ * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ *
+ * The unit appears to be pipelined, since this matches up with the
+ * cache-cold case, despite there being two loads here. If you replace
+ * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
+ *
+ * So, take some number between the cache-hot 140 cycles and the
+ * cache-cold 700 cycles. No particular tuning was done on this.
+ *
+ * I haven't done significant testing of the non-TEX opcodes. TXL at
+ * least looked about the same as TEX.
+ */
+ latency = 200;
+ break;
+
+ case SHADER_OPCODE_TXS:
+ /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
+ * cycles (n=15):
+ * mov(8) g114<1>UD 0D { align1 WE_normal 1Q };
+ * send(8) g6<1>UW g114<8,8,1>F
+ * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q };
+ * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q };
+ *
+ *
+ * Two loads was 535 +/- 30 cycles (n=19):
+ * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
+ * send(16) g6<1>UW g114<8,8,1>F
+ * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
+ * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
+ * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H };
+ * send(16) g8<1>UW g114<8,8,1>F
+ * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
+ * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H };
+ * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H };
+ *
+ * Since the only caches that should matter are just the
+ * instruction/state cache containing the surface state, assume that we
+ * always have hot caches.
+ */
+ latency = 100;
+ break;
+
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+ case VS_OPCODE_PULL_CONSTANT_LOAD:
+ /* testing using varying-index pull constants:
+ *
+ * 16 cycles:
+ * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
+ * send(8) g4<1>F g4<8,8,1>D
+ * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
+ *
+ * ~480 cycles:
+ * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
+ * send(8) g4<1>F g4<8,8,1>D
+ * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ *
+ * ~620 cycles:
+ * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
+ * send(8) g4<1>F g4<8,8,1>D
+ * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ * send(8) g4<1>F g4<8,8,1>D
+ * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ *
+ * So, if it's cache-hot, it's about 140. If it's cache cold, it's
+ * about 460. We expect to mostly be cache hot, so pick something more
+ * in that direction.
+ */
+ latency = 200;
+ break;
+
+ case SHADER_OPCODE_GEN7_SCRATCH_READ:
+ /* Testing a load from offset 0, that had been previously written:
+ *
+ * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q };
+ * mov(8) null g114<8,8,1>F { align1 WE_normal 1Q };
+ *
+ * The cycles spent seemed to be grouped around 40-50 (as low as 38),
+ * then around 140. Presumably this is cache hit vs miss.
+ */
+ latency = 50;
+ break;
+
+ case SHADER_OPCODE_UNTYPED_ATOMIC:
+ case SHADER_OPCODE_TYPED_ATOMIC:
+ /* Test code:
+ * mov(8) g112<1>ud 0x00000000ud { align1 WE_all 1Q };
+ * mov(1) g112.7<1>ud g1.7<0,1,0>ud { align1 WE_all };
+ * mov(8) g113<1>ud 0x00000000ud { align1 WE_normal 1Q };
+ * send(8) g4<1>ud g112<8,8,1>ud
+ * data (38, 5, 6) mlen 2 rlen 1 { align1 WE_normal 1Q };
+ *
+ * Running it 100 times as fragment shader on a 128x128 quad
+ * gives an average latency of 13867 cycles per atomic op,
+ * standard deviation 3%. Note that this is a rather
+ * pessimistic estimate, the actual latency in cases with few
+ * collisions between threads and favorable pipelining has been
+ * seen to be reduced by a factor of 100.
+ */
+ latency = 14000;
+ break;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+ case SHADER_OPCODE_TYPED_SURFACE_READ:
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+ /* Test code:
+ * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q };
+ * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all };
+ * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q };
+ * send(8) g4<1>UD g112<8,8,1>UD
+ * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q };
+ * .
+ * . [repeats 8 times]
+ * .
+ * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q };
+ * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all };
+ * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q };
+ * send(8) g4<1>UD g112<8,8,1>UD
+ * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q };
+ *
+ * Running it 100 times as fragment shader on a 128x128 quad
+ * gives an average latency of 583 cycles per surface read,
+ * standard deviation 0.9%.
+ */
+ latency = is_haswell ? 300 : 600;
+ break;
+
+ default:
+ /* 2 cycles:
+ * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
+ *
+ * 16 cycles:
+ * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
+ * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
+ */
+ latency = 14; /* used for every opcode not listed above */
+ break;
+ }
+}
+
+/**
+ * State and driver code shared by the FS and vec4 list schedulers.
+ *
+ * The constructor creates a private ralloc context (mem_ctx) that owns every
+ * array allocated here as well as the schedule_node objects and their child
+ * arrays; the destructor releases all of it at once.
+ *
+ * The register-pressure bookkeeping (reg_pressure_in, livein, liveout,
+ * hw_liveout, written, reads_remaining, hw_reads_remaining) is allocated
+ * only when the mode is not SCHEDULE_POST; otherwise those pointers are NULL.
+ */
+class instruction_scheduler {
+public:
+   /**
+    * @param s            shader whose instructions are scheduled
+    * @param grf_count    number of entries in the per-virtual-GRF arrays
+    * @param hw_reg_count number of entries in the per-hardware-GRF arrays
+    * @param block_count  number of entries in the per-basic-block arrays
+    * @param mode         scheduling mode; SCHEDULE_POST selects
+    *                     post-register-allocation scheduling
+    */
+   instruction_scheduler(backend_shader *s, int grf_count,
+                         int hw_reg_count, int block_count,
+                         instruction_scheduler_mode mode)
+   {
+      this->bs = s;
+      this->mem_ctx = ralloc_context(NULL);
+      this->grf_count = grf_count;
+      this->hw_reg_count = hw_reg_count;
+      this->instructions.make_empty();
+      this->instructions_to_schedule = 0;
+      this->post_reg_alloc = (mode == SCHEDULE_POST);
+      this->mode = mode;
+
+      /* Nothing in the constructor needs these yet, but give them a defined
+       * value rather than leaving them indeterminate.
+       */
+      this->reg_pressure = 0;
+      this->block_idx = 0;
+
+      if (!post_reg_alloc) {
+         this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count);
+
+         this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
+         for (int i = 0; i < block_count; i++)
+            this->livein[i] = rzalloc_array(mem_ctx, BITSET_WORD,
+                                            BITSET_WORDS(grf_count));
+
+         this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
+         for (int i = 0; i < block_count; i++)
+            this->liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
+                                             BITSET_WORDS(grf_count));
+
+         this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
+         for (int i = 0; i < block_count; i++)
+            this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
+                                                BITSET_WORDS(hw_reg_count));
+
+         this->written = rzalloc_array(mem_ctx, bool, grf_count);
+
+         this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count);
+
+         this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count);
+      } else {
+         this->reg_pressure_in = NULL;
+         this->livein = NULL;
+         this->liveout = NULL;
+         this->hw_liveout = NULL;
+         this->written = NULL;
+         this->reads_remaining = NULL;
+         this->hw_reads_remaining = NULL;
+      }
+   }
+
+   /* Virtual because this class is a polymorphic base: destroying a derived
+    * scheduler through an instruction_scheduler pointer must stay well
+    * defined.
+    */
+   virtual ~instruction_scheduler()
+   {
+      ralloc_free(this->mem_ctx);
+   }
+   void add_barrier_deps(schedule_node *n);
+   void add_dep(schedule_node *before, schedule_node *after, int latency);
+   void add_dep(schedule_node *before, schedule_node *after);
+
+   void run(cfg_t *cfg);
+   void add_insts_from_block(bblock_t *block);
+   void compute_delays();
+   void compute_exits();
+   virtual void calculate_deps() = 0;
+   virtual schedule_node *choose_instruction_to_schedule() = 0;
+
+   /**
+    * Returns how many cycles it takes the instruction to issue.
+    *
+    * Instructions in gen hardware are handled one simd4 vector at a time,
+    * with 1 cycle per vector dispatched.  Thus SIMD8 pixel shaders take 2
+    * cycles to dispatch and SIMD16 (compressed) instructions take 4.
+    */
+   virtual int issue_time(backend_instruction *inst) = 0;
+
+   virtual void count_reads_remaining(backend_instruction *inst) = 0;
+   virtual void setup_liveness(cfg_t *cfg) = 0;
+   virtual void update_register_pressure(backend_instruction *inst) = 0;
+   virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;
+
+   void schedule_instructions(bblock_t *block);
+
+   /* ralloc context owning everything this scheduler allocates. */
+   void *mem_ctx;
+
+   bool post_reg_alloc;
+   int instructions_to_schedule;
+   int grf_count;
+   int hw_reg_count;
+   int reg_pressure;
+   int block_idx;
+   exec_list instructions;
+   backend_shader *bs;
+
+   instruction_scheduler_mode mode;
+
+   /*
+    * The register pressure at the beginning of each basic block.
+    */
+
+   int *reg_pressure_in;
+
+   /*
+    * The virtual GRF's whose range overlaps the beginning of each basic block.
+    */
+
+   BITSET_WORD **livein;
+
+   /*
+    * The virtual GRF's whose range overlaps the end of each basic block.
+    */
+
+   BITSET_WORD **liveout;
+
+   /*
+    * The hardware GRF's whose range overlaps the end of each basic block.
+    */
+
+   BITSET_WORD **hw_liveout;
+
+   /*
+    * Whether we've scheduled a write for this virtual GRF yet.
+    */
+
+   bool *written;
+
+   /*
+    * How many reads we haven't scheduled for this virtual GRF yet.
+    */
+
+   int *reads_remaining;
+
+   /*
+    * How many reads we haven't scheduled for this hardware GRF yet.
+    */
+
+   int *hw_reads_remaining;
+};
+
+class fs_instruction_scheduler : public instruction_scheduler /* scheduler for fs_visitor shaders */
+{
+public:
+ fs_instruction_scheduler(fs_visitor *v, int grf_count, int hw_reg_count,
+ int block_count,
+ instruction_scheduler_mode mode);
+ void calculate_deps(); /* adds the dependency edges between the nodes in 'instructions' */
+ bool is_compressed(fs_inst *inst); /* true when inst->exec_size == 16 */
+ schedule_node *choose_instruction_to_schedule();
+ int issue_time(backend_instruction *inst);
+ fs_visitor *v; /* the visitor passed to the constructor; also handed to the base class as its backend_shader */
+
+ void count_reads_remaining(backend_instruction *inst); /* increments reads_remaining[] / hw_reads_remaining[] */
+ void setup_liveness(cfg_t *cfg); /* fills livein/liveout/hw_liveout and reg_pressure_in */
+ void update_register_pressure(backend_instruction *inst); /* marks the write and decrements the read counters */
+ int get_register_pressure_benefit(backend_instruction *inst); /* registers freed minus registers newly made live */
+};
+
+fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
+ int grf_count, int hw_reg_count,
+ int block_count,
+ instruction_scheduler_mode mode)
+ : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode),
+ v(v)
+{ /* all state is set up by the base-class constructor and the initializer above */
+}
+
+/**
+ * True when source number @src of @inst equals one of the sources that come
+ * before it, so that callers can account for each distinct source once.
+ */
+static bool
+is_src_duplicate(fs_inst *inst, int src)
+{
+   int earlier = 0;
+
+   while (earlier < src) {
+      if (inst->src[earlier].equals(inst->src[src]))
+         return true;
+
+      earlier++;
+   }
+
+   return false;
+}
+
+/**
+ * Account for the registers @be reads: one count per distinct VGRF source in
+ * reads_remaining[], and one count per register read of a FIXED_GRF source
+ * in hw_reads_remaining[].  FIXED_GRF sources whose number is not below
+ * hw_reg_count are ignored.  Does nothing when the read counters were not
+ * allocated.
+ */
+void
+fs_instruction_scheduler::count_reads_remaining(backend_instruction *be)
+{
+   if (reads_remaining == NULL)
+      return;
+
+   fs_inst *inst = static_cast<fs_inst *>(be);
+
+   for (int s = 0; s < inst->sources; s++) {
+      if (is_src_duplicate(inst, s))
+         continue;
+
+      const fs_reg &src = inst->src[s];
+
+      if (src.file == VGRF) {
+         reads_remaining[src.nr]++;
+         continue;
+      }
+
+      if (src.file != FIXED_GRF || src.nr >= hw_reg_count)
+         continue;
+
+      for (unsigned r = 0; r < regs_read(inst, s); r++)
+         hw_reads_remaining[src.nr + r]++;
+   }
+}
+
+void
+fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
+{
+ /* Fills reg_pressure_in[], livein[], liveout[] and hw_liveout[] for every
+ * block of cfg.
+ *
+ * First, compute liveness on a per-GRF level using the in/out sets from
+ * liveness calculation.
+ */
+ for (int block = 0; block < cfg->num_blocks; block++) {
+ for (int i = 0; i < v->live_intervals->num_vars; i++) {
+ if (BITSET_TEST(v->live_intervals->block_data[block].livein, i)) {
+ int vgrf = v->live_intervals->vgrf_from_var[i];
+ if (!BITSET_TEST(livein[block], vgrf)) { /* count each VGRF's size only once per block */
+ reg_pressure_in[block] += v->alloc.sizes[vgrf];
+ BITSET_SET(livein[block], vgrf);
+ }
+ }
+
+ if (BITSET_TEST(v->live_intervals->block_data[block].liveout, i))
+ BITSET_SET(liveout[block], v->live_intervals->vgrf_from_var[i]);
+ }
+ }
+
+ /* Now, extend the live in/live out sets for when a range crosses a block
+ * boundary, which matches what our register allocator/interference code
+ * does to account for force_writemask_all and incompatible exec_mask's.
+ *
+ * Only consecutive block pairs (block, block + 1) are examined.
+ */
+ for (int block = 0; block < cfg->num_blocks - 1; block++) {
+ for (int i = 0; i < grf_count; i++) {
+ if (v->virtual_grf_start[i] <= cfg->blocks[block]->end_ip &&
+ v->virtual_grf_end[i] >= cfg->blocks[block + 1]->start_ip) {
+ if (!BITSET_TEST(livein[block + 1], i)) {
+ reg_pressure_in[block + 1] += v->alloc.sizes[i];
+ BITSET_SET(livein[block + 1], i);
+ }
+
+ BITSET_SET(liveout[block], i);
+ }
+ }
+ }
+
+ int payload_last_use_ip[hw_reg_count]; /* filled in by calculate_payload_ranges(); entries equal to -1 are skipped below */
+ v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip);
+
+ for (int i = 0; i < hw_reg_count; i++) {
+ if (payload_last_use_ip[i] == -1)
+ continue;
+
+ for (int block = 0; block < cfg->num_blocks; block++) {
+ if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i])
+ reg_pressure_in[block]++; /* each hardware register adds one to the block's starting pressure */
+
+ if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i])
+ BITSET_SET(hw_liveout[block], i);
+ }
+ }
+}
+
+/**
+ * Update the bookkeeping for @be: flag a VGRF destination as written and
+ * take the reads made by @be off reads_remaining[] (VGRF sources) and
+ * hw_reads_remaining[] (FIXED_GRF sources numbered below hw_reg_count).
+ * Duplicate sources are handled once.  Does nothing when the read counters
+ * were not allocated.
+ */
+void
+fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
+{
+   if (reads_remaining == NULL)
+      return;
+
+   fs_inst *inst = static_cast<fs_inst *>(be);
+
+   if (inst->dst.file == VGRF)
+      written[inst->dst.nr] = true;
+
+   for (int s = 0; s < inst->sources; s++) {
+      if (is_src_duplicate(inst, s))
+         continue;
+
+      const fs_reg &src = inst->src[s];
+
+      if (src.file == VGRF) {
+         reads_remaining[src.nr]--;
+         continue;
+      }
+
+      if (src.file != FIXED_GRF || src.nr >= hw_reg_count)
+         continue;
+
+      for (unsigned r = 0; r < regs_read(inst, s); r++)
+         hw_reads_remaining[src.nr + r]--;
+   }
+}
+
+/**
+ * Net number of registers scheduling @be would release in the current block
+ * (block_idx).
+ *
+ * The size of a VGRF destination is subtracted when that VGRF is neither
+ * live into the block nor already written.  The size of a VGRF source is
+ * added when it is not live out of the block and exactly one read of it
+ * remains; likewise one is added per FIXED_GRF register read that is not
+ * live out and has exactly one read remaining.
+ */
+int
+fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
+{
+   fs_inst *inst = static_cast<fs_inst *>(be);
+   int benefit = 0;
+
+   if (inst->dst.file == VGRF &&
+       !BITSET_TEST(livein[block_idx], inst->dst.nr) &&
+       !written[inst->dst.nr]) {
+      benefit -= v->alloc.sizes[inst->dst.nr];
+   }
+
+   for (int s = 0; s < inst->sources; s++) {
+      if (is_src_duplicate(inst, s))
+         continue;
+
+      const fs_reg &src = inst->src[s];
+
+      if (src.file == VGRF) {
+         if (!BITSET_TEST(liveout[block_idx], src.nr) &&
+             reads_remaining[src.nr] == 1)
+            benefit += v->alloc.sizes[src.nr];
+      } else if (src.file == FIXED_GRF && src.nr < hw_reg_count) {
+         for (unsigned r = 0; r < regs_read(inst, s); r++) {
+            const int hw_reg = src.nr + r;
+
+            if (BITSET_TEST(hw_liveout[block_idx], hw_reg))
+               continue;
+
+            if (hw_reads_remaining[hw_reg] == 1)
+               benefit++;
+         }
+      }
+   }
+
+   return benefit;
+}
+
+class vec4_instruction_scheduler : public instruction_scheduler /* scheduler for vec4_visitor shaders */
+{
+public:
+ vec4_instruction_scheduler(vec4_visitor *v, int grf_count);
+ void calculate_deps();
+ schedule_node *choose_instruction_to_schedule();
+ int issue_time(backend_instruction *inst);
+ vec4_visitor *v; /* the visitor passed to the constructor */
+
+ void count_reads_remaining(backend_instruction *inst); /* empty for vec4 */
+ void setup_liveness(cfg_t *cfg); /* empty for vec4 */
+ void update_register_pressure(backend_instruction *inst); /* empty for vec4 */
+ int get_register_pressure_benefit(backend_instruction *inst); /* always 0 for vec4 */
+};
+
+vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
+ int grf_count)
+ : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST), /* always SCHEDULE_POST, with zero hardware registers and zero blocks */
+ v(v)
+{
+}
+
+void
+vec4_instruction_scheduler::count_reads_remaining(backend_instruction *be)
+{ /* intentionally empty: SCHEDULE_POST leaves the read counters unallocated */
+}
+
+void
+vec4_instruction_scheduler::setup_liveness(cfg_t *cfg)
+{ /* intentionally empty: SCHEDULE_POST leaves the liveness sets unallocated */
+}
+
+void
+vec4_instruction_scheduler::update_register_pressure(backend_instruction *be)
+{ /* intentionally empty: no register-pressure state exists in SCHEDULE_POST mode */
+}
+
+int
+vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
+{
+ return 0; /* no instruction is ever reported as relieving register pressure */
+}
+
+/**
+ * Create the scheduling node for @inst with no dependencies recorded yet.
+ *
+ * When @sched runs before register allocation every node gets a latency of
+ * 1; otherwise the latency comes from the gen7 table for gen >= 6 and from
+ * the gen4 table for older hardware.
+ */
+schedule_node::schedule_node(backend_instruction *inst,
+                             instruction_scheduler *sched)
+{
+   const struct gen_device_info *devinfo = sched->bs->devinfo;
+
+   /* Must be set before the latency tables below look at inst->opcode. */
+   this->inst = inst;
+
+   /* Dependency edges start out empty. */
+   this->children = NULL;
+   this->child_latency = NULL;
+   this->child_array_size = 0;
+   this->child_count = 0;
+   this->parent_count = 0;
+
+   /* Scheduling state. */
+   this->unblocked_time = 0;
+   this->cand_generation = 0;
+   this->delay = 0;
+   this->exit = NULL;
+   this->is_barrier = false;
+
+   if (sched->post_reg_alloc) {
+      /* We can't measure Gen6 timings directly but expect them to be much
+       * closer to Gen7 than Gen4.
+       */
+      if (devinfo->gen >= 6)
+         set_latency_gen7(devinfo->is_haswell);
+      else
+         set_latency_gen4();
+   } else {
+      this->latency = 1;
+   }
+}
+
+/**
+ * Append one freshly allocated schedule_node per instruction of @block to
+ * the scheduler's instruction list, and record the block's instruction
+ * count (derived from its start_ip/end_ip) as the amount of work left.
+ */
+void
+instruction_scheduler::add_insts_from_block(bblock_t *block)
+{
+   foreach_inst_in_block(backend_instruction, inst, block) {
+      instructions.push_tail(new(mem_ctx) schedule_node(inst, this));
+   }
+
+   const int first_ip = block->start_ip;
+   const int last_ip = block->end_ip;
+
+   this->instructions_to_schedule = last_ip - first_ip + 1;
+}
+
+/**
+ * Fill in the delay member of every node.
+ *
+ * The list is walked backwards.  A node without children gets its issue
+ * time; any other node gets its own latency plus the largest delay found
+ * among its children.
+ */
+void
+instruction_scheduler::compute_delays()
+{
+   foreach_in_list_reverse(schedule_node, n, &instructions) {
+      if (n->child_count == 0) {
+         n->delay = issue_time(n->inst);
+      } else {
+         for (int c = 0; c < n->child_count; c++) {
+            const schedule_node *child = n->children[c];
+
+            /* Children must already have had their delay computed. */
+            assert(child->delay);
+
+            const int via_child = n->latency + child->delay;
+            if (via_child > n->delay)
+               n->delay = via_child;
+         }
+      }
+   }
+}
+
+/**
+ * Fill in the exit member of every node, after first seeding each node's
+ * unblocked_time with an optimistic estimate.
+ */
+void
+instruction_scheduler::compute_exits()
+{
+   /* Forward pass: push a lower bound of the scheduling time from each node
+    * down to its children.  This mirrors the critical path computation, but
+    * measured from the top of the block rather than from the bottom.
+    */
+   foreach_in_list(schedule_node, n, &instructions) {
+      for (int c = 0; c < n->child_count; c++) {
+         schedule_node *child = n->children[c];
+         const int earliest =
+            n->unblocked_time + issue_time(n->inst) + n->child_latency[c];
+
+         if (earliest > child->unblocked_time)
+            child->unblocked_time = earliest;
+      }
+   }
+
+   /* Backward pass: a FS_OPCODE_DISCARD_JUMP node is its own exit; any node
+    * then adopts the exit of whichever child can reach an exit strictly
+    * sooner, going by the optimistic estimate computed above.
+    */
+   foreach_in_list_reverse(schedule_node, n, &instructions) {
+      if (n->inst->opcode == FS_OPCODE_DISCARD_JUMP)
+         n->exit = n;
+      else
+         n->exit = NULL;
+
+      for (int c = 0; c < n->child_count; c++) {
+         schedule_node *child = n->children[c];
+
+         if (exit_unblocked_time(child) < exit_unblocked_time(n))
+            n->exit = child->exit;
+      }
+   }
+}
+
+/**
+ * Record that @after depends on @before.
+ *
+ * @after will be scheduled later than @before, ideally @latency cycles
+ * later, although that distance is not guaranteed.  A NULL node on either
+ * side makes the call a no-op.  If the edge already exists only its latency
+ * is raised to the larger of the old and new value.
+ */
+void
+instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
+                               int latency)
+{
+   if (before == NULL || after == NULL)
+      return;
+
+   assert(before != after);
+
+   /* Reuse an existing edge rather than adding a duplicate. */
+   for (int c = 0; c < before->child_count; c++) {
+      if (before->children[c] != after)
+         continue;
+
+      before->child_latency[c] = MAX2(before->child_latency[c], latency);
+      return;
+   }
+
+   /* Grow both parallel arrays when full: 16 entries at first, doubling
+    * after that.
+    */
+   if (before->child_count >= before->child_array_size) {
+      const int grown = before->child_array_size < 16 ?
+                        16 : before->child_array_size * 2;
+
+      before->child_array_size = grown;
+      before->children = reralloc(mem_ctx, before->children,
+                                  schedule_node *, grown);
+      before->child_latency = reralloc(mem_ctx, before->child_latency,
+                                       int, grown);
+   }
+
+   const int slot = before->child_count;
+
+   before->children[slot] = after;
+   before->child_latency[slot] = latency;
+   before->child_count = slot + 1;
+   after->parent_count++;
+}
+
+/**
+ * Convenience overload: add the dependency using @before's own latency.
+ * Nothing happens when @before is NULL.
+ */
+void
+instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
+{
+   if (before != NULL)
+      add_dep(before, after, before->latency);
+}
+
+/**
+ * Turn @n into a scheduling barrier: it must run after everything that
+ * precedes it in the list and before everything that follows it.
+ *
+ * Zero-latency dependencies are added in both directions.  Each walk stops
+ * right after linking to the nearest other barrier, or at the list sentinel.
+ */
+void
+instruction_scheduler::add_barrier_deps(schedule_node *n)
+{
+   schedule_node *earlier = static_cast<schedule_node *>(n->prev);
+   schedule_node *later = static_cast<schedule_node *>(n->next);
+
+   n->is_barrier = true;
+
+   if (earlier != NULL) {
+      for (; !earlier->is_head_sentinel();
+           earlier = static_cast<schedule_node *>(earlier->prev)) {
+         add_dep(earlier, n, 0);
+         if (earlier->is_barrier)
+            break;
+      }
+   }
+
+   if (later != NULL) {
+      for (; !later->is_tail_sentinel();
+           later = static_cast<schedule_node *>(later->next)) {
+         add_dep(n, later, 0);
+         if (later->is_barrier)
+            break;
+      }
+   }
+}
+
+/* The scheduler has to know when an MRF write really covers 2 MRFs; this
+ * reports an instruction as compressed when its exec_size is 16.
+ */
+bool
+fs_instruction_scheduler::is_compressed(fs_inst *inst)
+{
+   const bool simd16 = (inst->exec_size == 16);
+
+   return simd16;
+}
+
+/**
+ * True for instructions that nothing may be reordered across: the
+ * placeholder halt, any control flow instruction, and any instruction with
+ * side effects.
+ */
+static bool
+is_scheduling_barrier(const fs_inst *inst)
+{
+   if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT)
+      return true;
+
+   if (inst->is_control_flow())
+      return true;
+
+   return inst->has_side_effects();
+}
+
+void
+fs_instruction_scheduler::calculate_deps()
+{
+ /* Pre-register-allocation, this tracks the last write per VGRF offset.
+ * After register allocation, reg_offsets are gone and we track individual
+ * GRF registers.
+ */
+ schedule_node *last_grf_write[grf_count * 16];
+ schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)];
+ schedule_node *last_conditional_mod[4] = {};
+ schedule_node *last_accumulator_write = NULL;
+ /* Fixed HW registers are assumed to be separate from the virtual
+ * GRFs, so they can be tracked separately. We don't really write
+ * to fixed GRFs much, so don't bother tracking them on a more
+ * granular level.
+ */
+ schedule_node *last_fixed_grf_write = NULL;
+
+ memset(last_grf_write, 0, sizeof(last_grf_write));
+ memset(last_mrf_write, 0, sizeof(last_mrf_write));
+
+ /* top-to-bottom dependencies: RAW and WAW. */
+ foreach_in_list(schedule_node, n, &instructions) {
+ fs_inst *inst = (fs_inst *)n->inst;
+
+ if (is_scheduling_barrier(inst))
+ add_barrier_deps(n);
+
+ /* read-after-write deps. */
+ for (int i = 0; i < inst->sources; i++) {
+ if (inst->src[i].file == VGRF) {
+ if (post_reg_alloc) {
+ for (unsigned r = 0; r < regs_read(inst, i); r++)
+ add_dep(last_grf_write[inst->src[i].nr + r], n);
+ } else {
+ for (unsigned r = 0; r < regs_read(inst, i); r++) {
+ add_dep(last_grf_write[inst->src[i].nr * 16 +
+ inst->src[i].offset / REG_SIZE + r], n);
+ }
+ }
+ } else if (inst->src[i].file == FIXED_GRF) {
+ if (post_reg_alloc) {
+ for (unsigned r = 0; r < regs_read(inst, i); r++)
+ add_dep(last_grf_write[inst->src[i].nr + r], n);
+ } else {
+ add_dep(last_fixed_grf_write, n);
+ }
+ } else if (inst->src[i].is_accumulator()) {
+ add_dep(last_accumulator_write, n);
+ } else if (inst->src[i].file == ARF) {
+ add_barrier_deps(n);
+ }
+ }
+
+ if (inst->base_mrf != -1) {
+ for (int i = 0; i < inst->mlen; i++) {
+ /* It looks like the MRF regs are released in the send
+ * instruction once it's sent, not when the result comes
+ * back.
+ */
+ add_dep(last_mrf_write[inst->base_mrf + i], n);
+ }
+ }
+
+ if (const unsigned mask = inst->flags_read(v->devinfo)) {
+ assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
+
+ for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
+ if (mask & (1 << i))
+ add_dep(last_conditional_mod[i], n);
+ }
+ }
+
+ if (inst->reads_accumulator_implicitly()) {
+ add_dep(last_accumulator_write, n);
+ }
+
+ /* write-after-write deps. */
+ if (inst->dst.file == VGRF) {
+ if (post_reg_alloc) {
+ for (unsigned r = 0; r < regs_written(inst); r++) {
+ add_dep(last_grf_write[inst->dst.nr + r], n);
+ last_grf_write[inst->dst.nr + r] = n;
+ }
+ } else {
+ for (unsigned r = 0; r < regs_written(inst); r++) {
+ add_dep(last_grf_write[inst->dst.nr * 16 +
+ inst->dst.offset / REG_SIZE + r], n);
+ last_grf_write[inst->dst.nr * 16 +
+ inst->dst.offset / REG_SIZE + r] = n;
+ }
+ }
+ } else if (inst->dst.file == MRF) {
+ int reg = inst->dst.nr & ~BRW_MRF_COMPR4;
+
+ add_dep(last_mrf_write[reg], n);
+ last_mrf_write[reg] = n;
+ if (is_compressed(inst)) {
+ if (inst->dst.nr & BRW_MRF_COMPR4)
+ reg += 4;
+ else
+ reg++;
+ add_dep(last_mrf_write[reg], n);
+ last_mrf_write[reg] = n;
+ }
+ } else if (inst->dst.file == FIXED_GRF) {
+ if (post_reg_alloc) {
+ for (unsigned r = 0; r < regs_written(inst); r++)
+ last_grf_write[inst->dst.nr + r] = n;
+ } else {
+ last_fixed_grf_write = n;
+ }
+ } else if (inst->dst.is_accumulator()) {
+ add_dep(last_accumulator_write, n);
+ last_accumulator_write = n;
+ } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
+ add_barrier_deps(n);
+ }
+
+ if (inst->mlen > 0 && inst->base_mrf != -1) {
+ for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
+ add_dep(last_mrf_write[inst->base_mrf + i], n);
+ last_mrf_write[inst->base_mrf + i] = n;
+ }
+ }
+
+ if (const unsigned mask = inst->flags_written()) {
+ assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
+
+ for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
+ if (mask & (1 << i)) {
+ add_dep(last_conditional_mod[i], n, 0);
+ last_conditional_mod[i] = n;
+ }
+ }
+ }
+
+ if (inst->writes_accumulator_implicitly(v->devinfo) &&
+ !inst->dst.is_accumulator()) {
+ add_dep(last_accumulator_write, n);
+ last_accumulator_write = n;
+ }
+ }
+
+ /* bottom-to-top dependencies: WAR */
+ memset(last_grf_write, 0, sizeof(last_grf_write));
+ memset(last_mrf_write, 0, sizeof(last_mrf_write));
+ memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
+ last_accumulator_write = NULL;
+ last_fixed_grf_write = NULL;
+
+ foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
+ fs_inst *inst = (fs_inst *)n->inst;
+
+ /* write-after-read deps. */
+ for (int i = 0; i < inst->sources; i++) {
+ if (inst->src[i].file == VGRF) {
+ if (post_reg_alloc) {
+ for (unsigned r = 0; r < regs_read(inst, i); r++)
+ add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
+ } else {
+ for (unsigned r = 0; r < regs_read(inst, i); r++) {
+ add_dep(n, last_grf_write[inst->src[i].nr * 16 +
+ inst->src[i].offset / REG_SIZE + r], 0);
+ }
+ }
+ } else if (inst->src[i].file == FIXED_GRF) {
+ if (post_reg_alloc) {
+ for (unsigned r = 0; r < regs_read(inst, i); r++)
+ add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
+ } else {
+ add_dep(n, last_fixed_grf_write, 0);
+ }
+ } else if (inst->src[i].is_accumulator()) {
+ add_dep(n, last_accumulator_write, 0);
+ } else if (inst->src[i].file == ARF) {
+ add_barrier_deps(n);
+ }
+ }
+
+ if (inst->base_mrf != -1) {
+ for (int i = 0; i < inst->mlen; i++) {
+ /* It looks like the MRF regs are released in the send
+ * instruction once it's sent, not when the result comes
+ * back.
+ */
+ add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
+ }
+ }
+
+ if (const unsigned mask = inst->flags_read(v->devinfo)) {
+ assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
+
+ for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
+ if (mask & (1 << i))
+ add_dep(n, last_conditional_mod[i]);
+ }
+ }
+
+ if (inst->reads_accumulator_implicitly()) {
+ add_dep(n, last_accumulator_write);
+ }
+
+ /* Update the things this instruction wrote, so earlier reads
+ * can mark this as WAR dependency.
+ */
+ if (inst->dst.file == VGRF) {
+ if (post_reg_alloc) {
+ for (unsigned r = 0; r < regs_written(inst); r++)
+ last_grf_write[inst->dst.nr + r] = n;
+ } else {
+ for (unsigned r = 0; r < regs_written(inst); r++) {
+ last_grf_write[inst->dst.nr * 16 +
+ inst->dst.offset / REG_SIZE + r] = n;
+ }
+ }
+ } else if (inst->dst.file == MRF) {
+ int reg = inst->dst.nr & ~BRW_MRF_COMPR4;
+
+ last_mrf_write[reg] = n;
+
+ if (is_compressed(inst)) {
+ if (inst->dst.nr & BRW_MRF_COMPR4)
+ reg += 4;
+ else
+ reg++;
+
+ last_mrf_write[reg] = n;
+ }
+ } else if (inst->dst.file == FIXED_GRF) {
+ if (post_reg_alloc) {
+ for (unsigned r = 0; r < regs_written(inst); r++)
+ last_grf_write[inst->dst.nr + r] = n;
+ } else {
+ last_fixed_grf_write = n;
+ }
+ } else if (inst->dst.is_accumulator()) {
+ last_accumulator_write = n;
+ } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
+ add_barrier_deps(n);
+ }
+
+ if (inst->mlen > 0 && inst->base_mrf != -1) {
+ for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
+ last_mrf_write[inst->base_mrf + i] = n;
+ }
+ }
+
+ if (const unsigned mask = inst->flags_written()) {
+ assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
+
+ for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
+ if (mask & (1 << i))
+ last_conditional_mod[i] = n;
+ }
+ }
+
+ if (inst->writes_accumulator_implicitly(v->devinfo)) {
+ last_accumulator_write = n;
+ }
+ }
+}
+
+/**
+ * Report whether \p inst has to be treated as a scheduling barrier.
+ *
+ * Control-flow instructions and instructions with side effects qualify;
+ * calculate_deps() calls add_barrier_deps() on the node of any instruction
+ * for which this returns true.
+ */
+static bool
+is_scheduling_barrier(const vec4_instruction *inst)
+{
+   if (inst->is_control_flow())
+      return true;
+
+   return inst->has_side_effects();
+}
+
+/**
+ * Build the dependency edges between the schedule nodes currently held in
+ * the `instructions` list (vec4 backend).
+ *
+ * Two passes are made over the list:
+ *
+ * - a top-to-bottom pass that records read-after-write and write-after-write
+ * dependencies by remembering, per resource, the last node that wrote it;
+ * - a bottom-to-top pass that records write-after-read dependencies by
+ * remembering, per resource, the closest later node that writes it.
+ *
+ * The resources tracked are the GRFs (grf_count entries), the MRFs, the
+ * flag register (a single entry), the accumulator and, as one coarse
+ * entry, all fixed GRFs. Scheduling barriers and non-null ARF accesses are
+ * handed to add_barrier_deps().
+ */
+void
+vec4_instruction_scheduler::calculate_deps()
+{
+ /* Last writer seen so far for each tracked resource, NULL if none. */
+ schedule_node *last_grf_write[grf_count];
+ schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)];
+ schedule_node *last_conditional_mod = NULL;
+ schedule_node *last_accumulator_write = NULL;
+ /* Fixed HW registers are assumed to be separate from the virtual
+ * GRFs, so they can be tracked separately. We don't really write
+ * to fixed GRFs much, so don't bother tracking them on a more
+ * granular level.
+ */
+ schedule_node *last_fixed_grf_write = NULL;
+
+ memset(last_grf_write, 0, sizeof(last_grf_write));
+ memset(last_mrf_write, 0, sizeof(last_mrf_write));
+
+ /* top-to-bottom dependencies: RAW and WAW. */
+ foreach_in_list(schedule_node, n, &instructions) {
+ vec4_instruction *inst = (vec4_instruction *)n->inst;
+
+ if (is_scheduling_barrier(inst))
+ add_barrier_deps(n);
+
+ /* read-after-write deps. */
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].file == VGRF) {
+ for (unsigned j = 0; j < regs_read(inst, i); ++j)
+ add_dep(last_grf_write[inst->src[i].nr + j], n);
+ } else if (inst->src[i].file == FIXED_GRF) {
+ add_dep(last_fixed_grf_write, n);
+ } else if (inst->src[i].is_accumulator()) {
+ /* NOTE(review): this assert (and the two similar ones below)
+ * requires a writer earlier in this same list -- confirm that
+ * is always guaranteed.
+ */
+ assert(last_accumulator_write);
+ add_dep(last_accumulator_write, n);
+ } else if (inst->src[i].file == ARF) {
+ add_barrier_deps(n);
+ }
+ }
+
+ if (!inst->is_send_from_grf()) {
+ for (int i = 0; i < inst->mlen; i++) {
+ /* It looks like the MRF regs are released in the send
+ * instruction once it's sent, not when the result comes
+ * back.
+ */
+ add_dep(last_mrf_write[inst->base_mrf + i], n);
+ }
+ }
+
+ if (inst->reads_flag()) {
+ assert(last_conditional_mod);
+ add_dep(last_conditional_mod, n);
+ }
+
+ if (inst->reads_accumulator_implicitly()) {
+ assert(last_accumulator_write);
+ add_dep(last_accumulator_write, n);
+ }
+
+ /* write-after-write deps. */
+ if (inst->dst.file == VGRF) {
+ for (unsigned j = 0; j < regs_written(inst); ++j) {
+ add_dep(last_grf_write[inst->dst.nr + j], n);
+ last_grf_write[inst->dst.nr + j] = n;
+ }
+ } else if (inst->dst.file == MRF) {
+ add_dep(last_mrf_write[inst->dst.nr], n);
+ last_mrf_write[inst->dst.nr] = n;
+ } else if (inst->dst.file == FIXED_GRF) {
+ /* Only the writer is recorded here; no edge is added between
+ * successive fixed GRF writers.
+ */
+ last_fixed_grf_write = n;
+ } else if (inst->dst.is_accumulator()) {
+ add_dep(last_accumulator_write, n);
+ last_accumulator_write = n;
+ } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
+ add_barrier_deps(n);
+ }
+
+ if (inst->mlen > 0 && !inst->is_send_from_grf()) {
+ for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
+ add_dep(last_mrf_write[inst->base_mrf + i], n);
+ last_mrf_write[inst->base_mrf + i] = n;
+ }
+ }
+
+ if (inst->writes_flag()) {
+ add_dep(last_conditional_mod, n, 0);
+ last_conditional_mod = n;
+ }
+
+ if (inst->writes_accumulator_implicitly(v->devinfo) &&
+ !inst->dst.is_accumulator()) {
+ add_dep(last_accumulator_write, n);
+ last_accumulator_write = n;
+ }
+ }
+
+ /* bottom-to-top dependencies: WAR */
+ /* Forget the writers recorded by the first pass; from here on the
+ * trackers hold the closest writer *after* the node being visited.
+ */
+ memset(last_grf_write, 0, sizeof(last_grf_write));
+ memset(last_mrf_write, 0, sizeof(last_mrf_write));
+ last_conditional_mod = NULL;
+ last_accumulator_write = NULL;
+ last_fixed_grf_write = NULL;
+
+ foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
+ vec4_instruction *inst = (vec4_instruction *)n->inst;
+
+ /* write-after-read deps. */
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].file == VGRF) {
+ for (unsigned j = 0; j < regs_read(inst, i); ++j)
+ add_dep(n, last_grf_write[inst->src[i].nr + j]);
+ } else if (inst->src[i].file == FIXED_GRF) {
+ add_dep(n, last_fixed_grf_write);
+ } else if (inst->src[i].is_accumulator()) {
+ add_dep(n, last_accumulator_write);
+ } else if (inst->src[i].file == ARF) {
+ add_barrier_deps(n);
+ }
+ }
+
+ if (!inst->is_send_from_grf()) {
+ for (int i = 0; i < inst->mlen; i++) {
+ /* It looks like the MRF regs are released in the send
+ * instruction once it's sent, not when the result comes
+ * back.
+ */
+ add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
+ }
+ }
+
+ if (inst->reads_flag()) {
+ add_dep(n, last_conditional_mod);
+ }
+
+ if (inst->reads_accumulator_implicitly()) {
+ add_dep(n, last_accumulator_write);
+ }
+
+ /* Update the things this instruction wrote, so earlier reads
+ * can mark this as WAR dependency.
+ */
+ if (inst->dst.file == VGRF) {
+ for (unsigned j = 0; j < regs_written(inst); ++j)
+ last_grf_write[inst->dst.nr + j] = n;
+ } else if (inst->dst.file == MRF) {
+ last_mrf_write[inst->dst.nr] = n;
+ } else if (inst->dst.file == FIXED_GRF) {
+ last_fixed_grf_write = n;
+ } else if (inst->dst.is_accumulator()) {
+ last_accumulator_write = n;
+ } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
+ add_barrier_deps(n);
+ }
+
+ if (inst->mlen > 0 && !inst->is_send_from_grf()) {
+ for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
+ last_mrf_write[inst->base_mrf + i] = n;
+ }
+ }
+
+ if (inst->writes_flag()) {
+ last_conditional_mod = n;
+ }
+
+ if (inst->writes_accumulator_implicitly(v->devinfo)) {
+ last_accumulator_write = n;
+ }
+ }
+}
+
+/**
+ * Pick the next node to schedule from the `instructions` list (FS backend).
+ *
+ * In SCHEDULE_PRE and SCHEDULE_POST modes the node with the smallest
+ * exit_unblocked_time() wins, ties being broken by the smallest
+ * unblocked_time and then by list order.
+ *
+ * In every other mode the candidates are compared with a cascade of
+ * criteria, each one only consulted when the previous ones tie: register
+ * pressure benefit, candidate generation and the pre-gen7 SEND heuristic
+ * (both SCHEDULE_PRE_LIFO only), delay, exit_unblocked_time() and finally
+ * list order.
+ *
+ * \return the chosen node, or NULL when the list is empty.
+ */
+schedule_node *
+fs_instruction_scheduler::choose_instruction_to_schedule()
+{
+ schedule_node *chosen = NULL;
+
+ if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) {
+ int chosen_time = 0;
+
+ /* Of the instructions ready to execute or the closest to being ready,
+ * choose the one most likely to unblock an early program exit, or
+ * otherwise the oldest one.
+ */
+ foreach_in_list(schedule_node, n, &instructions) {
+ if (!chosen ||
+ exit_unblocked_time(n) < exit_unblocked_time(chosen) ||
+ (exit_unblocked_time(n) == exit_unblocked_time(chosen) &&
+ n->unblocked_time < chosen_time)) {
+ chosen = n;
+ chosen_time = n->unblocked_time;
+ }
+ }
+ } else {
+ /* Before register allocation, we don't care about the latencies of
+ * instructions. All we care about is reducing live intervals of
+ * variables so that we can avoid register spilling, or get SIMD16
+ * shaders which naturally do a better job of hiding instruction
+ * latency.
+ */
+ foreach_in_list(schedule_node, n, &instructions) {
+ fs_inst *inst = (fs_inst *)n->inst;
+
+ /* The first candidate seeds the comparison. */
+ if (!chosen) {
+ chosen = n;
+ continue;
+ }
+
+ /* Most important: If we can definitely reduce register pressure, do
+ * so immediately.
+ */
+ int register_pressure_benefit = get_register_pressure_benefit(n->inst);
+ int chosen_register_pressure_benefit =
+ get_register_pressure_benefit(chosen->inst);
+
+ if (register_pressure_benefit > 0 &&
+ register_pressure_benefit > chosen_register_pressure_benefit) {
+ chosen = n;
+ continue;
+ } else if (chosen_register_pressure_benefit > 0 &&
+ (register_pressure_benefit <
+ chosen_register_pressure_benefit)) {
+ continue;
+ }
+
+ if (mode == SCHEDULE_PRE_LIFO) {
+ /* Prefer instructions that recently became available for
+ * scheduling. These are the things that are most likely to
+ * (eventually) make a variable dead and reduce register pressure.
+ * Typical register pressure estimates don't work for us because
+ * most of our pressure comes from texturing, where no single
+ * instruction to schedule will make a vec4 value dead.
+ */
+ if (n->cand_generation > chosen->cand_generation) {
+ chosen = n;
+ continue;
+ } else if (n->cand_generation < chosen->cand_generation) {
+ continue;
+ }
+
+ /* On MRF-using chips, prefer non-SEND instructions. If we don't
+ * do this, then because we prefer instructions that just became
+ * candidates, we'll end up in a pattern of scheduling a SEND,
+ * then the MRFs for the next SEND, then the next SEND, then the
+ * MRFs, etc., without ever consuming the results of a send.
+ */
+ if (v->devinfo->gen < 7) {
+ fs_inst *chosen_inst = (fs_inst *)chosen->inst;
+
+ /* We use size_written > 4 * exec_size as our test for the kind
+ * of send instruction to avoid -- only sends generate many
+ * regs, and a single-result send is probably actually reducing
+ * register pressure.
+ */
+ if (inst->size_written <= 4 * inst->exec_size &&
+ chosen_inst->size_written > 4 * chosen_inst->exec_size) {
+ chosen = n;
+ continue;
+ } else if (inst->size_written > chosen_inst->size_written) {
+ continue;
+ }
+ }
+ }
+
+ /* For instructions pushed on the cands list at the same time, prefer
+ * the one with the highest delay to the end of the program. This is
+ * most likely to have its values able to be consumed first (such as
+ * for a large tree of lowered ubo loads, which appear reversed in
+ * the instruction stream with respect to when they can be consumed).
+ */
+ if (n->delay > chosen->delay) {
+ chosen = n;
+ continue;
+ } else if (n->delay < chosen->delay) {
+ continue;
+ }
+
+ /* Prefer the node most likely to unblock an early program exit.
+ */
+ if (exit_unblocked_time(n) < exit_unblocked_time(chosen)) {
+ chosen = n;
+ continue;
+ } else if (exit_unblocked_time(n) > exit_unblocked_time(chosen)) {
+ continue;
+ }
+
+ /* If all other metrics are equal, we prefer the first instruction in
+ * the list (program execution).
+ */
+ }
+ }
+
+ return chosen;
+}
+
+/**
+ * Pick the next node to schedule from the `instructions` list (vec4
+ * backend): the one with the smallest unblocked_time, the earliest list
+ * entry winning ties.
+ *
+ * \return the chosen node, or NULL when the list is empty.
+ */
+schedule_node *
+vec4_instruction_scheduler::choose_instruction_to_schedule()
+{
+   schedule_node *best = NULL;
+   int best_time = 0;
+
+   foreach_in_list(schedule_node, candidate, &instructions) {
+      /* Keep the current pick unless this candidate unblocks strictly
+       * earlier; the very first candidate is always taken.
+       */
+      if (best != NULL && !(candidate->unblocked_time < best_time))
+         continue;
+
+      best = candidate;
+      best_time = candidate->unblocked_time;
+   }
+
+   return best;
+}
+
+/**
+ * Number of cycles added to the scheduling clock when \p inst is issued:
+ * 4 for a compressed instruction, 2 otherwise.
+ */
+int
+fs_instruction_scheduler::issue_time(backend_instruction *inst)
+{
+   return is_compressed((fs_inst *)inst) ? 4 : 2;
+}
+
+/**
+ * Number of cycles added to the scheduling clock when \p inst is issued.
+ * The value does not depend on the instruction.
+ */
+int
+vec4_instruction_scheduler::issue_time(backend_instruction *inst)
+{
+   /* We always execute as two vec4s in parallel. */
+   const int vec4_issue_cycles = 2;
+
+   return vec4_issue_cycles;
+}
+
+/**
+ * Emit the instructions of \p block in scheduled order.
+ *
+ * The `instructions` list is first reduced to the nodes without parents.
+ * Then, until the list is empty, a node is picked with
+ * choose_instruction_to_schedule(), its instruction is moved to the tail of
+ * block->instructions, the local clock is advanced, and every child whose
+ * parent_count drops to zero is pushed on the head of the list.
+ *
+ * Before register allocation the running register pressure estimate is
+ * updated as well. The final clock value is stored in block->cycle_count.
+ */
+void
+instruction_scheduler::schedule_instructions(bblock_t *block)
+{
+ const struct gen_device_info *devinfo = bs->devinfo;
+ int time = 0;
+ if (!post_reg_alloc)
+ reg_pressure = reg_pressure_in[block->num];
+ block_idx = block->num;
+
+ /* Remove non-DAG heads from the list. */
+ foreach_in_list_safe(schedule_node, n, &instructions) {
+ if (n->parent_count != 0)
+ n->remove();
+ }
+
+ /* Stamped on the children of each scheduled node and bumped once per
+ * scheduled instruction.
+ */
+ unsigned cand_generation = 1;
+ while (!instructions.is_empty()) {
+ schedule_node *chosen = choose_instruction_to_schedule();
+
+ /* Schedule this instruction. */
+ assert(chosen);
+ chosen->remove();
+ chosen->inst->exec_node::remove();
+ block->instructions.push_tail(chosen->inst);
+ instructions_to_schedule--;
+
+ if (!post_reg_alloc) {
+ reg_pressure -= get_register_pressure_benefit(chosen->inst);
+ update_register_pressure(chosen->inst);
+ }
+
+ /* If we expected a delay for scheduling, then bump the clock to reflect
+ * that. In reality, the hardware will switch to another hyperthread
+ * and may not return to dispatching our thread for a while even after
+ * we're unblocked. After this, we have the time when the chosen
+ * instruction will start executing.
+ */
+ time = MAX2(time, chosen->unblocked_time);
+
+ /* Update the clock for how soon an instruction could start after the
+ * chosen one.
+ */
+ time += issue_time(chosen->inst);
+
+ if (debug) {
+ fprintf(stderr, "clock %4d, scheduled: ", time);
+ bs->dump_instruction(chosen->inst);
+ if (!post_reg_alloc)
+ fprintf(stderr, "(register pressure %d)\n", reg_pressure);
+ }
+
+ /* Now that we've scheduled a new instruction, some of its
+ * children can be promoted to the list of instructions ready to
+ * be scheduled. Update the children's unblocked time for this
+ * DAG edge as we do so.
+ */
+ for (int i = chosen->child_count - 1; i >= 0; i--) {
+ schedule_node *child = chosen->children[i];
+
+ child->unblocked_time = MAX2(child->unblocked_time,
+ time + chosen->child_latency[i]);
+
+ if (debug) {
+ fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count);
+ bs->dump_instruction(child->inst);
+ }
+
+ child->cand_generation = cand_generation;
+ child->parent_count--;
+ if (child->parent_count == 0) {
+ if (debug) {
+ fprintf(stderr, "\t\tnow available\n");
+ }
+ instructions.push_head(child);
+ }
+ }
+ cand_generation++;
+
+ /* Shared resource: the mathbox. There's one mathbox per EU on Gen6+
+ * but it's more limited pre-gen6, so if we send something off to it then
+ * the next math instruction isn't going to make progress until the first
+ * is done.
+ */
+ if (devinfo->gen < 6 && chosen->inst->is_math()) {
+ foreach_in_list(schedule_node, n, &instructions) {
+ if (n->inst->is_math())
+ n->unblocked_time = MAX2(n->unblocked_time,
+ time + chosen->latency);
+ }
+ }
+ }
+
+ /* Every instruction taken from the block must have been emitted. */
+ assert(instructions_to_schedule == 0);
+
+ block->cycle_count = time;
+}
+
+/**
+ * Sum the cycle counts of all blocks of \p cfg, weighting the blocks that
+ * sit inside loops.
+ *
+ * The weight is multiplied by 10 at each block starting with a DO and
+ * divided by 10 after each block ending with a WHILE.
+ */
+static unsigned get_cycle_count(cfg_t *cfg)
+{
+   unsigned total = 0;
+   unsigned loop_weight = 1;
+
+   foreach_block(blk, cfg) {
+      /* assume that loops execute ~10 times */
+      if (blk->start()->opcode == BRW_OPCODE_DO)
+         loop_weight *= 10;
+
+      total += blk->cycle_count * loop_weight;
+
+      if (blk->end()->opcode == BRW_OPCODE_WHILE)
+         loop_weight /= 10;
+   }
+
+   return total;
+}
+
+/**
+ * Schedule every block of \p cfg and record the resulting cycle estimate.
+ *
+ * For each block, in order: the read counters are reset and recounted (only
+ * when reads_remaining is non-NULL), the block's instructions are added via
+ * add_insts_from_block(), the dependencies, delays and exits are computed,
+ * and schedule_instructions() emits the block in its new order.
+ * Liveness is set up first when running before register allocation.
+ *
+ * When `debug` is set and this is a pre-register-allocation run, the
+ * instructions are dumped to stderr before and after scheduling.
+ * cfg->cycle_count is set from get_cycle_count() at the end.
+ */
+void
+instruction_scheduler::run(cfg_t *cfg)
+{
+ if (debug && !post_reg_alloc) {
+ fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
+ post_reg_alloc);
+ bs->dump_instructions();
+ }
+
+ if (!post_reg_alloc)
+ setup_liveness(cfg);
+
+ foreach_block(block, cfg) {
+ /* Per-block read counters, only maintained when they were allocated. */
+ if (reads_remaining) {
+ memset(reads_remaining, 0,
+ grf_count * sizeof(*reads_remaining));
+ memset(hw_reads_remaining, 0,
+ hw_reg_count * sizeof(*hw_reads_remaining));
+ memset(written, 0, grf_count * sizeof(*written));
+
+ foreach_inst_in_block(fs_inst, inst, block)
+ count_reads_remaining(inst);
+ }
+
+ add_insts_from_block(block);
+
+ calculate_deps();
+
+ compute_delays();
+ compute_exits();
+
+ schedule_instructions(block);
+ }
+
+ if (debug && !post_reg_alloc) {
+ fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
+ post_reg_alloc);
+ bs->dump_instructions();
+ }
+
+ cfg->cycle_count = get_cycle_count(cfg);
+}
+
+/**
+ * Run the FS instruction scheduler over the whole program in the given
+ * \p mode.
+ *
+ * For every mode but SCHEDULE_POST the live intervals are computed first
+ * and the scheduler is sized with alloc.count; for SCHEDULE_POST it is
+ * sized with grf_used. Live intervals are invalidated afterwards.
+ */
+void
+fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
+{
+   const bool post_ra = (mode == SCHEDULE_POST);
+
+   if (!post_ra)
+      calculate_live_intervals();
+
+   const int grf_count = post_ra ? grf_used : alloc.count;
+
+   fs_instruction_scheduler scheduler(this, grf_count, first_non_payload_grf,
+                                      cfg->num_blocks, mode);
+   scheduler.run(cfg);
+
+   invalidate_live_intervals();
+}
+
+/**
+ * Run the vec4 instruction scheduler over the whole program, sized with
+ * prog_data->total_grf, then invalidate the live intervals.
+ */
+void
+vec4_visitor::opt_schedule_instructions()
+{
+   vec4_instruction_scheduler scheduler(this, prog_data->total_grf);
+
+   scheduler.run(cfg);
+   invalidate_live_intervals();
+}
diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
new file mode 100644
index 00000000000..bfaa5e7bfe2
--- /dev/null
+++ b/src/intel/compiler/brw_shader.cpp
@@ -0,0 +1,1273 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_cfg.h"
+#include "brw_eu.h"
+#include "brw_fs.h"
+#include "brw_nir.h"
+#include "brw_vec4_tes.h"
+#include "common/gen_debug.h"
+#include "main/uniforms.h"
+#include "util/macros.h"
+
+/**
+ * Map a GLSL type to the register type used for values of that type.
+ *
+ * Arrays resolve to the register type of their element type. Void, error,
+ * interface and function types are not expected here.
+ */
+enum brw_reg_type
+brw_type_for_base_type(const struct glsl_type *type)
+{
+   switch (type->base_type) {
+   case GLSL_TYPE_ARRAY:
+      return brw_type_for_base_type(type->fields.array);
+
+   /* Floating point. */
+   case GLSL_TYPE_FLOAT:
+      return BRW_REGISTER_TYPE_F;
+   case GLSL_TYPE_DOUBLE:
+      return BRW_REGISTER_TYPE_DF;
+
+   /* Signed integers and the types stored like them. */
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_SUBROUTINE:
+      return BRW_REGISTER_TYPE_D;
+   case GLSL_TYPE_INT64:
+      return BRW_REGISTER_TYPE_Q;
+
+   /* Unsigned integers. */
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_IMAGE:
+      return BRW_REGISTER_TYPE_UD;
+   case GLSL_TYPE_UINT64:
+      return BRW_REGISTER_TYPE_UQ;
+
+   case GLSL_TYPE_STRUCT:
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_ATOMIC_UINT:
+      /* These should be overridden with the type of the member when
+       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
+       * way to trip up if we don't.
+       */
+      return BRW_REGISTER_TYPE_UD;
+
+   case GLSL_TYPE_VOID:
+   case GLSL_TYPE_ERROR:
+   case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_FUNCTION:
+      unreachable("not reached");
+   }
+
+   return BRW_REGISTER_TYPE_F;
+}
+
+/**
+ * Translate an ir_binop_* comparison operation into the matching
+ * conditional modifier. Any other operation is not expected here.
+ */
+enum brw_conditional_mod
+brw_conditional_for_comparison(unsigned int op)
+{
+   enum brw_conditional_mod cond;
+
+   switch (op) {
+   case ir_binop_less:
+      cond = BRW_CONDITIONAL_L;
+      break;
+   case ir_binop_greater:
+      cond = BRW_CONDITIONAL_G;
+      break;
+   case ir_binop_lequal:
+      cond = BRW_CONDITIONAL_LE;
+      break;
+   case ir_binop_gequal:
+      cond = BRW_CONDITIONAL_GE;
+      break;
+   case ir_binop_equal:
+   case ir_binop_all_equal: /* same as equal for scalars */
+      cond = BRW_CONDITIONAL_Z;
+      break;
+   case ir_binop_nequal:
+   case ir_binop_any_nequal: /* same as nequal for scalars */
+      cond = BRW_CONDITIONAL_NZ;
+      break;
+   default:
+      unreachable("not reached: bad operation for comparison");
+   }
+
+   return cond;
+}
+
+/**
+ * Translate a SHADER_OPCODE_* math opcode into the matching
+ * BRW_MATH_FUNCTION_* value. Any other opcode is not expected here.
+ */
+uint32_t
+brw_math_function(enum opcode op)
+{
+   uint32_t function;
+
+   switch (op) {
+   case SHADER_OPCODE_RCP:
+      function = BRW_MATH_FUNCTION_INV;
+      break;
+   case SHADER_OPCODE_RSQ:
+      function = BRW_MATH_FUNCTION_RSQ;
+      break;
+   case SHADER_OPCODE_SQRT:
+      function = BRW_MATH_FUNCTION_SQRT;
+      break;
+   case SHADER_OPCODE_EXP2:
+      function = BRW_MATH_FUNCTION_EXP;
+      break;
+   case SHADER_OPCODE_LOG2:
+      function = BRW_MATH_FUNCTION_LOG;
+      break;
+   case SHADER_OPCODE_POW:
+      function = BRW_MATH_FUNCTION_POW;
+      break;
+   case SHADER_OPCODE_SIN:
+      function = BRW_MATH_FUNCTION_SIN;
+      break;
+   case SHADER_OPCODE_COS:
+      function = BRW_MATH_FUNCTION_COS;
+      break;
+   case SHADER_OPCODE_INT_QUOTIENT:
+      function = BRW_MATH_FUNCTION_INT_DIV_QUOTIENT;
+      break;
+   case SHADER_OPCODE_INT_REMAINDER:
+      function = BRW_MATH_FUNCTION_INT_DIV_REMAINDER;
+      break;
+   default:
+      unreachable("not reached: unknown math function");
+   }
+
+   return function;
+}
+
+/**
+ * Pack up to three constant texel offsets into a single dword.
+ *
+ * \param offsets         the offset components, or NULL when the offset is
+ *                        not constant
+ * \param num_components  number of entries in \p offsets, at most 3
+ * \param offset_bits     receives the packed value on success; left
+ *                        untouched on failure
+ *
+ * \return true when every component fits in a signed 4-bit field (-8..7)
+ *         and \p offset_bits was written; false when \p offsets is NULL or
+ *         a component is out of range, in which case the caller has to
+ *         handle the offset some other way.
+ */
+bool
+brw_texture_offset(int *offsets, unsigned num_components, uint32_t *offset_bits)
+{
+   if (!offsets) return false;  /* nonconstant offset; caller will handle it. */
+
+   /* Only three 4-bit fields exist; a fourth component would make the
+    * unsigned shift computation below wrap around.
+    */
+   assert(num_components <= 3);
+
+   /* offset out of bounds; caller will handle it. */
+   for (unsigned i = 0; i < num_components; i++)
+      if (offsets[i] > 7 || offsets[i] < -8)
+         return false;
+
+   /* Combine all three offsets into a single unsigned dword:
+    *
+    *    bits 11:8 - U Offset (X component)
+    *    bits  7:4 - V Offset (Y component)
+    *    bits  3:0 - R Offset (Z component)
+    */
+   uint32_t packed = 0;
+   for (unsigned i = 0; i < num_components; i++) {
+      const unsigned shift = 4 * (2 - i);
+
+      /* Reduce the component to its 4-bit two's complement encoding as an
+       * unsigned value before shifting: left-shifting a negative int is
+       * not well defined.
+       */
+      packed |= ((uint32_t)offsets[i] & 0xFu) << shift;
+   }
+   *offset_bits = packed;
+   return true;
+}
+
+/**
+ * Return a printable name for opcode \p op.
+ *
+ * Opcodes in the BRW_OPCODE_ILLEGAL..BRW_OPCODE_NOP range take their name
+ * from brw_opcode_desc() for the given \p devinfo, except for
+ * BRW_OPCODE_DO on Gen6+ which is named "do" directly. Every other opcode
+ * handled here maps to a fixed string literal.
+ */
+const char *
+brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
+{
+ switch (op) {
+ /* "case A ... B" is the GNU case-range extension. */
+ case BRW_OPCODE_ILLEGAL ... BRW_OPCODE_NOP:
+ /* The DO instruction doesn't exist on Gen6+, but we use it to mark the
+ * start of a loop in the IR.
+ */
+ if (devinfo->gen >= 6 && op == BRW_OPCODE_DO)
+ return "do";
+
+ assert(brw_opcode_desc(devinfo, op)->name);
+ return brw_opcode_desc(devinfo, op)->name;
+ case FS_OPCODE_FB_WRITE:
+ return "fb_write";
+ case FS_OPCODE_FB_WRITE_LOGICAL:
+ return "fb_write_logical";
+ case FS_OPCODE_REP_FB_WRITE:
+ return "rep_fb_write";
+ case FS_OPCODE_FB_READ:
+ return "fb_read";
+ case FS_OPCODE_FB_READ_LOGICAL:
+ return "fb_read_logical";
+
+ case SHADER_OPCODE_RCP:
+ return "rcp";
+ case SHADER_OPCODE_RSQ:
+ return "rsq";
+ case SHADER_OPCODE_SQRT:
+ return "sqrt";
+ case SHADER_OPCODE_EXP2:
+ return "exp2";
+ case SHADER_OPCODE_LOG2:
+ return "log2";
+ case SHADER_OPCODE_POW:
+ return "pow";
+ case SHADER_OPCODE_INT_QUOTIENT:
+ return "int_quot";
+ case SHADER_OPCODE_INT_REMAINDER:
+ return "int_rem";
+ case SHADER_OPCODE_SIN:
+ return "sin";
+ case SHADER_OPCODE_COS:
+ return "cos";
+
+ case SHADER_OPCODE_TEX:
+ return "tex";
+ case SHADER_OPCODE_TEX_LOGICAL:
+ return "tex_logical";
+ case SHADER_OPCODE_TXD:
+ return "txd";
+ case SHADER_OPCODE_TXD_LOGICAL:
+ return "txd_logical";
+ case SHADER_OPCODE_TXF:
+ return "txf";
+ case SHADER_OPCODE_TXF_LOGICAL:
+ return "txf_logical";
+ case SHADER_OPCODE_TXF_LZ:
+ return "txf_lz";
+ case SHADER_OPCODE_TXL:
+ return "txl";
+ case SHADER_OPCODE_TXL_LOGICAL:
+ return "txl_logical";
+ case SHADER_OPCODE_TXL_LZ:
+ return "txl_lz";
+ case SHADER_OPCODE_TXS:
+ return "txs";
+ case SHADER_OPCODE_TXS_LOGICAL:
+ return "txs_logical";
+ case FS_OPCODE_TXB:
+ return "txb";
+ case FS_OPCODE_TXB_LOGICAL:
+ return "txb_logical";
+ case SHADER_OPCODE_TXF_CMS:
+ return "txf_cms";
+ case SHADER_OPCODE_TXF_CMS_LOGICAL:
+ return "txf_cms_logical";
+ case SHADER_OPCODE_TXF_CMS_W:
+ return "txf_cms_w";
+ case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+ return "txf_cms_w_logical";
+ case SHADER_OPCODE_TXF_UMS:
+ return "txf_ums";
+ case SHADER_OPCODE_TXF_UMS_LOGICAL:
+ return "txf_ums_logical";
+ case SHADER_OPCODE_TXF_MCS:
+ return "txf_mcs";
+ case SHADER_OPCODE_TXF_MCS_LOGICAL:
+ return "txf_mcs_logical";
+ case SHADER_OPCODE_LOD:
+ return "lod";
+ case SHADER_OPCODE_LOD_LOGICAL:
+ return "lod_logical";
+ case SHADER_OPCODE_TG4:
+ return "tg4";
+ case SHADER_OPCODE_TG4_LOGICAL:
+ return "tg4_logical";
+ case SHADER_OPCODE_TG4_OFFSET:
+ return "tg4_offset";
+ case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+ return "tg4_offset_logical";
+ case SHADER_OPCODE_SAMPLEINFO:
+ return "sampleinfo";
+ case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
+ return "sampleinfo_logical";
+
+ case SHADER_OPCODE_SHADER_TIME_ADD:
+ return "shader_time_add";
+
+ case SHADER_OPCODE_UNTYPED_ATOMIC:
+ return "untyped_atomic";
+ case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+ return "untyped_atomic_logical";
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+ return "untyped_surface_read";
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+ return "untyped_surface_read_logical";
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+ return "untyped_surface_write";
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+ return "untyped_surface_write_logical";
+ case SHADER_OPCODE_TYPED_ATOMIC:
+ return "typed_atomic";
+ case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+ return "typed_atomic_logical";
+ case SHADER_OPCODE_TYPED_SURFACE_READ:
+ return "typed_surface_read";
+ case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+ return "typed_surface_read_logical";
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+ return "typed_surface_write";
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+ return "typed_surface_write_logical";
+ case SHADER_OPCODE_MEMORY_FENCE:
+ return "memory_fence";
+
+ case SHADER_OPCODE_LOAD_PAYLOAD:
+ return "load_payload";
+ case FS_OPCODE_PACK:
+ return "pack";
+
+ case SHADER_OPCODE_GEN4_SCRATCH_READ:
+ return "gen4_scratch_read";
+ case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+ return "gen4_scratch_write";
+ case SHADER_OPCODE_GEN7_SCRATCH_READ:
+ return "gen7_scratch_read";
+ case SHADER_OPCODE_URB_WRITE_SIMD8:
+ return "gen8_urb_write_simd8";
+ case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+ return "gen8_urb_write_simd8_per_slot";
+ case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+ return "gen8_urb_write_simd8_masked";
+ case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+ return "gen8_urb_write_simd8_masked_per_slot";
+ case SHADER_OPCODE_URB_READ_SIMD8:
+ return "urb_read_simd8";
+ case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
+ return "urb_read_simd8_per_slot";
+
+ case SHADER_OPCODE_FIND_LIVE_CHANNEL:
+ return "find_live_channel";
+ case SHADER_OPCODE_BROADCAST:
+ return "broadcast";
+
+ case VEC4_OPCODE_MOV_BYTES:
+ return "mov_bytes";
+ case VEC4_OPCODE_PACK_BYTES:
+ return "pack_bytes";
+ case VEC4_OPCODE_UNPACK_UNIFORM:
+ return "unpack_uniform";
+ case VEC4_OPCODE_FROM_DOUBLE:
+ return "double_to_single";
+ case VEC4_OPCODE_TO_DOUBLE:
+ return "single_to_double";
+ case VEC4_OPCODE_PICK_LOW_32BIT:
+ return "pick_low_32bit";
+ case VEC4_OPCODE_PICK_HIGH_32BIT:
+ return "pick_high_32bit";
+ case VEC4_OPCODE_SET_LOW_32BIT:
+ return "set_low_32bit";
+ case VEC4_OPCODE_SET_HIGH_32BIT:
+ return "set_high_32bit";
+
+ case FS_OPCODE_DDX_COARSE:
+ return "ddx_coarse";
+ case FS_OPCODE_DDX_FINE:
+ return "ddx_fine";
+ case FS_OPCODE_DDY_COARSE:
+ return "ddy_coarse";
+ case FS_OPCODE_DDY_FINE:
+ return "ddy_fine";
+
+ case FS_OPCODE_CINTERP:
+ return "cinterp";
+ case FS_OPCODE_LINTERP:
+ return "linterp";
+
+ case FS_OPCODE_PIXEL_X:
+ return "pixel_x";
+ case FS_OPCODE_PIXEL_Y:
+ return "pixel_y";
+
+ case FS_OPCODE_GET_BUFFER_SIZE:
+ return "fs_get_buffer_size";
+
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+ return "uniform_pull_const";
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+ return "uniform_pull_const_gen7";
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
+ return "varying_pull_const_gen4";
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+ return "varying_pull_const_gen7";
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
+ return "varying_pull_const_logical";
+
+ case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
+ return "mov_dispatch_to_flags";
+ case FS_OPCODE_DISCARD_JUMP:
+ return "discard_jump";
+
+ case FS_OPCODE_SET_SAMPLE_ID:
+ return "set_sample_id";
+
+ case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+ return "pack_half_2x16_split";
+ case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
+ return "unpack_half_2x16_split_x";
+ case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
+ return "unpack_half_2x16_split_y";
+
+ case FS_OPCODE_PLACEHOLDER_HALT:
+ return "placeholder_halt";
+
+ case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
+ return "interp_sample";
+ case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
+ return "interp_shared_offset";
+ case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+ return "interp_per_slot_offset";
+
+ case VS_OPCODE_URB_WRITE:
+ return "vs_urb_write";
+ case VS_OPCODE_PULL_CONSTANT_LOAD:
+ return "pull_constant_load";
+ case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
+ return "pull_constant_load_gen7";
+
+ case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
+ return "set_simd4x2_header_gen9";
+
+ case VS_OPCODE_GET_BUFFER_SIZE:
+ return "vs_get_buffer_size";
+
+ case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
+ return "unpack_flags_simd4x2";
+
+ case GS_OPCODE_URB_WRITE:
+ return "gs_urb_write";
+ case GS_OPCODE_URB_WRITE_ALLOCATE:
+ return "gs_urb_write_allocate";
+ case GS_OPCODE_THREAD_END:
+ return "gs_thread_end";
+ case GS_OPCODE_SET_WRITE_OFFSET:
+ return "set_write_offset";
+ case GS_OPCODE_SET_VERTEX_COUNT:
+ return "set_vertex_count";
+ case GS_OPCODE_SET_DWORD_2:
+ return "set_dword_2";
+ case GS_OPCODE_PREPARE_CHANNEL_MASKS:
+ return "prepare_channel_masks";
+ case GS_OPCODE_SET_CHANNEL_MASKS:
+ return "set_channel_masks";
+ case GS_OPCODE_GET_INSTANCE_ID:
+ return "get_instance_id";
+ case GS_OPCODE_FF_SYNC:
+ return "ff_sync";
+ case GS_OPCODE_SET_PRIMITIVE_ID:
+ return "set_primitive_id";
+ case GS_OPCODE_SVB_WRITE:
+ return "gs_svb_write";
+ case GS_OPCODE_SVB_SET_DST_INDEX:
+ return "gs_svb_set_dst_index";
+ case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
+ return "gs_ff_sync_set_primitives";
+ case CS_OPCODE_CS_TERMINATE:
+ return "cs_terminate";
+ case SHADER_OPCODE_BARRIER:
+ return "barrier";
+ case SHADER_OPCODE_MULH:
+ return "mulh";
+ case SHADER_OPCODE_MOV_INDIRECT:
+ return "mov_indirect";
+
+ case VEC4_OPCODE_URB_READ:
+ return "urb_read";
+ case TCS_OPCODE_GET_INSTANCE_ID:
+ return "tcs_get_instance_id";
+ case TCS_OPCODE_URB_WRITE:
+ return "tcs_urb_write";
+ case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+ return "tcs_set_input_urb_offsets";
+ case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+ return "tcs_set_output_urb_offsets";
+ case TCS_OPCODE_GET_PRIMITIVE_ID:
+ return "tcs_get_primitive_id";
+ case TCS_OPCODE_CREATE_BARRIER_HEADER:
+ return "tcs_create_barrier_header";
+ case TCS_OPCODE_SRC0_010_IS_ZERO:
+ return "tcs_src0<0,1,0>_is_zero";
+ case TCS_OPCODE_RELEASE_INPUT:
+ return "tcs_release_input";
+ case TCS_OPCODE_THREAD_END:
+ return "tcs_thread_end";
+ case TES_OPCODE_CREATE_INPUT_READ_HEADER:
+ return "tes_create_input_read_header";
+ case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
+ return "tes_add_indirect_urb_offset";
+ case TES_OPCODE_GET_PRIMITIVE_ID:
+ return "tes_get_primitive_id";
+ }
+
+ /* Every case above returns, so only an unhandled opcode gets here. */
+ unreachable("not reached");
+}
+
+/**
+ * Fold a saturate modifier into a floating-point immediate.
+ *
+ * F and DF immediates are run through CLAMP(value, 0, 1) and \p reg is
+ * rewritten when the result differs from what was stored.  Integer
+ * immediates are left untouched.
+ *
+ * \return true if \p reg was rewritten, false otherwise.
+ */
+bool
+brw_saturate_immediate(enum brw_reg_type type, struct brw_reg *reg)
+{
+   union {
+      unsigned ud;
+      int d;
+      float f;
+      double df;
+   } before, after = { 0 };
+
+   /* Only the width of the payload matters for copying in and out: anything
+    * narrower than 8 bytes goes through the 32-bit field, everything else
+    * through the 64-bit one.
+    */
+   const bool is_64bit = type_sz(type) >= 8;
+
+   if (is_64bit)
+      before.df = reg->df;
+   else
+      before.ud = reg->ud;
+
+   switch (type) {
+   case BRW_REGISTER_TYPE_F:
+      after.f = CLAMP(before.f, 0.0f, 1.0f);
+      break;
+   case BRW_REGISTER_TYPE_DF:
+      after.df = CLAMP(before.df, 0.0, 1.0);
+      break;
+   case BRW_REGISTER_TYPE_UD:
+   case BRW_REGISTER_TYPE_D:
+   case BRW_REGISTER_TYPE_UW:
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_UQ:
+   case BRW_REGISTER_TYPE_Q:
+      /* Nothing to do. */
+      return false;
+   case BRW_REGISTER_TYPE_UB:
+   case BRW_REGISTER_TYPE_B:
+      unreachable("no UB/B immediates");
+   case BRW_REGISTER_TYPE_V:
+   case BRW_REGISTER_TYPE_UV:
+   case BRW_REGISTER_TYPE_VF:
+      unreachable("unimplemented: saturate vector immediate");
+   case BRW_REGISTER_TYPE_HF:
+      unreachable("unimplemented: saturate HF immediate");
+   }
+
+   /* 32-bit payloads are compared bit-for-bit, 64-bit ones by value. */
+   if (is_64bit) {
+      if (before.df == after.df)
+         return false;
+      reg->df = after.df;
+   } else {
+      if (before.ud == after.ud)
+         return false;
+      reg->ud = after.ud;
+   }
+
+   return true;
+}
+
+/**
+ * Negate an immediate of the given type in place.
+ *
+ * \return true if \p reg was updated; false for immediate types whose
+ *         negation is not implemented (those also trip an assertion in
+ *         debug builds).
+ */
+bool
+brw_negate_immediate(enum brw_reg_type type, struct brw_reg *reg)
+{
+   switch (type) {
+   case BRW_REGISTER_TYPE_D:
+   case BRW_REGISTER_TYPE_UD:
+      /* Negate through the unsigned view.  The resulting bits match
+       * -reg->d for every value, but negating INT_MIN as a signed int
+       * would be undefined behaviour.
+       */
+      reg->ud = 0u - reg->ud;
+      return true;
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_UW:
+      /* The int16_t operand is promoted to int, so this cannot overflow. */
+      reg->d = -(int16_t)reg->ud;
+      return true;
+   case BRW_REGISTER_TYPE_F:
+      reg->f = -reg->f;
+      return true;
+   case BRW_REGISTER_TYPE_VF:
+      /* Flip bit 7 of each of the four packed bytes (presumably the sign
+       * bit of each restricted float -- confirm against the VF format).
+       */
+      reg->ud ^= 0x80808080;
+      return true;
+   case BRW_REGISTER_TYPE_DF:
+      reg->df = -reg->df;
+      return true;
+   case BRW_REGISTER_TYPE_UQ:
+   case BRW_REGISTER_TYPE_Q:
+      /* Same reasoning as the 32-bit case: avoid signed overflow on
+       * INT64_MIN by negating the unsigned view.
+       */
+      reg->u64 = 0 - reg->u64;
+      return true;
+   case BRW_REGISTER_TYPE_UB:
+   case BRW_REGISTER_TYPE_B:
+      unreachable("no UB/B immediates");
+   case BRW_REGISTER_TYPE_UV:
+   case BRW_REGISTER_TYPE_V:
+      assert(!"unimplemented: negate UV/V immediate");
+      break;
+   case BRW_REGISTER_TYPE_HF:
+      assert(!"unimplemented: negate HF immediate");
+      break;
+   }
+
+   return false;
+}
+
+/**
+ * Replace an immediate of the given type with its absolute value, in place.
+ *
+ * \return true if \p reg was updated; false for immediate types that are
+ *         not handled (those also trip an assertion in debug builds).
+ */
+bool
+brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg)
+{
+   switch (type) {
+   case BRW_REGISTER_TYPE_D:
+      /* abs(INT_MIN) is undefined behaviour, so negate through the
+       * unsigned view instead; the bits are identical for every other
+       * input.
+       */
+      if (reg->d < 0)
+         reg->ud = 0u - reg->ud;
+      return true;
+   case BRW_REGISTER_TYPE_W:
+      /* The int16_t operand is promoted to int, so abs() cannot overflow. */
+      reg->d = abs((int16_t)reg->ud);
+      return true;
+   case BRW_REGISTER_TYPE_F:
+      reg->f = fabsf(reg->f);
+      return true;
+   case BRW_REGISTER_TYPE_DF:
+      reg->df = fabs(reg->df);
+      return true;
+   case BRW_REGISTER_TYPE_VF:
+      /* Clear bit 7 of each of the four packed bytes. */
+      reg->ud &= ~0x80808080;
+      return true;
+   case BRW_REGISTER_TYPE_Q:
+      /* As for D: avoid the undefined result on INT64_MIN. */
+      if (reg->d64 < 0)
+         reg->u64 = 0 - reg->u64;
+      return true;
+   case BRW_REGISTER_TYPE_UB:
+   case BRW_REGISTER_TYPE_B:
+      unreachable("no UB/B immediates");
+   case BRW_REGISTER_TYPE_UQ:
+   case BRW_REGISTER_TYPE_UD:
+   case BRW_REGISTER_TYPE_UW:
+   case BRW_REGISTER_TYPE_UV:
+      /* Presumably the absolute value modifier on an unsigned source is a
+       * nop, but it would be nice to confirm.
+       */
+      assert(!"unimplemented: abs unsigned immediate");
+      break;
+   case BRW_REGISTER_TYPE_V:
+      assert(!"unimplemented: abs V immediate");
+      break;
+   case BRW_REGISTER_TYPE_HF:
+      assert(!"unimplemented: abs HF immediate");
+      break;
+   }
+
+   return false;
+}
+
+/**
+ * Map a NIR atomic-counter intrinsic to the matching BRW_AOP_* operation.
+ *
+ * Any intrinsic other than the atomic_counter_* ones listed below is
+ * treated as unreachable.
+ */
+unsigned
+get_atomic_counter_op(nir_intrinsic_op op)
+{
+   unsigned aop;
+
+   switch (op) {
+   case nir_intrinsic_atomic_counter_inc:
+      aop = BRW_AOP_INC;
+      break;
+   case nir_intrinsic_atomic_counter_dec:
+      aop = BRW_AOP_PREDEC;
+      break;
+   case nir_intrinsic_atomic_counter_add:
+      aop = BRW_AOP_ADD;
+      break;
+   case nir_intrinsic_atomic_counter_min:
+      aop = BRW_AOP_UMIN;
+      break;
+   case nir_intrinsic_atomic_counter_max:
+      aop = BRW_AOP_UMAX;
+      break;
+   case nir_intrinsic_atomic_counter_and:
+      aop = BRW_AOP_AND;
+      break;
+   case nir_intrinsic_atomic_counter_or:
+      aop = BRW_AOP_OR;
+      break;
+   case nir_intrinsic_atomic_counter_xor:
+      aop = BRW_AOP_XOR;
+      break;
+   case nir_intrinsic_atomic_counter_exchange:
+      aop = BRW_AOP_MOV;
+      break;
+   case nir_intrinsic_atomic_counter_comp_swap:
+      aop = BRW_AOP_CMPWR;
+      break;
+   default:
+      unreachable("Not reachable.");
+   }
+
+   return aop;
+}
+
+/**
+ * Common constructor for the back-end shader visitors.
+ *
+ * Stores the compiler, logging cookie, memory context, NIR shader and
+ * stage prog_data, starts with no CFG, and caches the per-stage debug flag
+ * and the stage's printable names.
+ */
+backend_shader::backend_shader(const struct brw_compiler *compiler,
+                               void *log_data,
+                               void *mem_ctx,
+                               const nir_shader *shader,
+                               struct brw_stage_prog_data *stage_prog_data)
+   : compiler(compiler),
+     log_data(log_data),
+     devinfo(compiler->devinfo),
+     nir(shader),
+     stage_prog_data(stage_prog_data),
+     mem_ctx(mem_ctx),
+     cfg(NULL),
+     stage(shader->stage)
+{
+   this->debug_enabled =
+      INTEL_DEBUG & intel_debug_flag_for_shader_stage(this->stage);
+   this->stage_name = _mesa_shader_stage_to_string(this->stage);
+   this->stage_abbrev = _mesa_shader_stage_to_abbrev(this->stage);
+}
+
+/**
+ * Two backend registers are equal when brw_regs_equal() accepts them and
+ * their byte offsets match.
+ */
+bool
+backend_reg::equals(const backend_reg &r) const
+{
+   if (!brw_regs_equal(this, &r))
+      return false;
+
+   return offset == r.offset;
+}
+
+/**
+ * Report whether this register is an immediate holding zero.
+ *
+ * Only F, DF, D, UD, Q and UQ immediates are examined; every other type and
+ * every non-immediate yields false.
+ */
+bool
+backend_reg::is_zero() const
+{
+   if (file == IMM) {
+      switch (type) {
+      case BRW_REGISTER_TYPE_F:
+         return f == 0;
+      case BRW_REGISTER_TYPE_DF:
+         return df == 0;
+      case BRW_REGISTER_TYPE_D:
+      case BRW_REGISTER_TYPE_UD:
+         return d == 0;
+      case BRW_REGISTER_TYPE_UQ:
+      case BRW_REGISTER_TYPE_Q:
+         return u64 == 0;
+      default:
+         break;
+      }
+   }
+
+   return false;
+}
+
+/**
+ * Report whether this register is an immediate holding one.
+ *
+ * Only F, DF, D, UD, Q and UQ immediates are examined; every other type and
+ * every non-immediate yields false.
+ */
+bool
+backend_reg::is_one() const
+{
+   if (file == IMM) {
+      switch (type) {
+      case BRW_REGISTER_TYPE_F:
+         return f == 1.0f;
+      case BRW_REGISTER_TYPE_DF:
+         return df == 1.0;
+      case BRW_REGISTER_TYPE_D:
+      case BRW_REGISTER_TYPE_UD:
+         return d == 1;
+      case BRW_REGISTER_TYPE_UQ:
+      case BRW_REGISTER_TYPE_Q:
+         return u64 == 1;
+      default:
+         break;
+      }
+   }
+
+   return false;
+}
+
+/**
+ * Report whether this register is an immediate holding minus one.
+ *
+ * Only the signed types F, DF, D and Q are examined; every other type and
+ * every non-immediate yields false.
+ */
+bool
+backend_reg::is_negative_one() const
+{
+   if (file == IMM) {
+      switch (type) {
+      case BRW_REGISTER_TYPE_F:
+         return f == -1.0;
+      case BRW_REGISTER_TYPE_DF:
+         return df == -1.0;
+      case BRW_REGISTER_TYPE_D:
+         return d == -1;
+      case BRW_REGISTER_TYPE_Q:
+         return d64 == -1;
+      default:
+         break;
+      }
+   }
+
+   return false;
+}
+
+/** True when this is the ARF register numbered BRW_ARF_NULL. */
+bool
+backend_reg::is_null() const
+{
+   if (file != ARF)
+      return false;
+
+   return nr == BRW_ARF_NULL;
+}
+
+
+/** True when this is the ARF register numbered BRW_ARF_ACCUMULATOR. */
+bool
+backend_reg::is_accumulator() const
+{
+   if (file != ARF)
+      return false;
+
+   return nr == BRW_ARF_ACCUMULATOR;
+}
+
+/**
+ * Report whether the two sources of this instruction may be swapped.
+ *
+ * AND, OR, XOR, ADD, MUL and MULH always qualify.  SEL qualifies only when
+ * its conditional mod is GE or L, i.e. when it acts as MAX or MIN.
+ */
+bool
+backend_instruction::is_commutative() const
+{
+   switch (opcode) {
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_OR:
+   case BRW_OPCODE_XOR:
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
+      return true;
+   case BRW_OPCODE_SEL:
+      /* MIN and MAX are commutative. */
+      return conditional_mod == BRW_CONDITIONAL_GE ||
+             conditional_mod == BRW_CONDITIONAL_L;
+   default:
+      return false;
+   }
+}
+
+/** Member wrapper around the global ::is_3src() query for this opcode. */
+bool
+backend_instruction::is_3src(const struct gen_device_info *devinfo) const
+{
+   const bool three_source = ::is_3src(devinfo, opcode);
+
+   return three_source;
+}
+
+/** True when the opcode is one of the texturing opcodes listed below. */
+bool
+backend_instruction::is_tex() const
+{
+   switch (opcode) {
+   case SHADER_OPCODE_TEX:
+   case FS_OPCODE_TXB:
+   case SHADER_OPCODE_TXD:
+   case SHADER_OPCODE_TXF:
+   case SHADER_OPCODE_TXF_LZ:
+   case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_CMS_W:
+   case SHADER_OPCODE_TXF_UMS:
+   case SHADER_OPCODE_TXF_MCS:
+   case SHADER_OPCODE_TXL:
+   case SHADER_OPCODE_TXL_LZ:
+   case SHADER_OPCODE_TXS:
+   case SHADER_OPCODE_LOD:
+   case SHADER_OPCODE_TG4:
+   case SHADER_OPCODE_TG4_OFFSET:
+   case SHADER_OPCODE_SAMPLEINFO:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/** True when the opcode is one of the math opcodes listed below. */
+bool
+backend_instruction::is_math() const
+{
+   switch (opcode) {
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+   case SHADER_OPCODE_POW:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/**
+ * True for DO, WHILE, IF, ELSE, ENDIF, BREAK and CONTINUE; false for every
+ * other opcode.
+ */
+bool
+backend_instruction::is_control_flow() const
+{
+   return opcode == BRW_OPCODE_DO ||
+          opcode == BRW_OPCODE_WHILE ||
+          opcode == BRW_OPCODE_IF ||
+          opcode == BRW_OPCODE_ELSE ||
+          opcode == BRW_OPCODE_ENDIF ||
+          opcode == BRW_OPCODE_BREAK ||
+          opcode == BRW_OPCODE_CONTINUE;
+}
+
+/**
+ * Report whether source modifiers are accepted on this opcode.
+ *
+ * Everything is accepted except the opcodes named below.
+ */
+bool
+backend_instruction::can_do_source_mods() const
+{
+   const bool mods_rejected = opcode == BRW_OPCODE_ADDC ||
+                              opcode == BRW_OPCODE_BFE ||
+                              opcode == BRW_OPCODE_BFI1 ||
+                              opcode == BRW_OPCODE_BFI2 ||
+                              opcode == BRW_OPCODE_BFREV ||
+                              opcode == BRW_OPCODE_CBIT ||
+                              opcode == BRW_OPCODE_FBH ||
+                              opcode == BRW_OPCODE_FBL ||
+                              opcode == BRW_OPCODE_SUBB;
+
+   return !mods_rejected;
+}
+
+/** Report whether this opcode is on the list that accepts saturate. */
+bool
+backend_instruction::can_do_saturate() const
+{
+   bool supported = false;
+
+   switch (opcode) {
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_ASR:
+   case BRW_OPCODE_AVG:
+   case BRW_OPCODE_DP2:
+   case BRW_OPCODE_DP3:
+   case BRW_OPCODE_DP4:
+   case BRW_OPCODE_DPH:
+   case BRW_OPCODE_F16TO32:
+   case BRW_OPCODE_F32TO16:
+   case BRW_OPCODE_LINE:
+   case BRW_OPCODE_LRP:
+   case BRW_OPCODE_MAC:
+   case BRW_OPCODE_MAD:
+   case BRW_OPCODE_MATH:
+   case BRW_OPCODE_MOV:
+   case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
+   case BRW_OPCODE_PLN:
+   case BRW_OPCODE_RNDD:
+   case BRW_OPCODE_RNDE:
+   case BRW_OPCODE_RNDU:
+   case BRW_OPCODE_RNDZ:
+   case BRW_OPCODE_SEL:
+   case BRW_OPCODE_SHL:
+   case BRW_OPCODE_SHR:
+   case FS_OPCODE_LINTERP:
+   case SHADER_OPCODE_COS:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_POW:
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_SQRT:
+      supported = true;
+      break;
+   default:
+      break;
+   }
+
+   return supported;
+}
+
+/** Report whether this opcode is on the list that accepts a conditional mod. */
+bool
+backend_instruction::can_do_cmod() const
+{
+   bool supported = false;
+
+   switch (opcode) {
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_ADDC:
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_ASR:
+   case BRW_OPCODE_AVG:
+   case BRW_OPCODE_CMP:
+   case BRW_OPCODE_CMPN:
+   case BRW_OPCODE_DP2:
+   case BRW_OPCODE_DP3:
+   case BRW_OPCODE_DP4:
+   case BRW_OPCODE_DPH:
+   case BRW_OPCODE_F16TO32:
+   case BRW_OPCODE_F32TO16:
+   case BRW_OPCODE_FRC:
+   case BRW_OPCODE_LINE:
+   case BRW_OPCODE_LRP:
+   case BRW_OPCODE_LZD:
+   case BRW_OPCODE_MAC:
+   case BRW_OPCODE_MACH:
+   case BRW_OPCODE_MAD:
+   case BRW_OPCODE_MOV:
+   case BRW_OPCODE_MUL:
+   case BRW_OPCODE_NOT:
+   case BRW_OPCODE_OR:
+   case BRW_OPCODE_PLN:
+   case BRW_OPCODE_RNDD:
+   case BRW_OPCODE_RNDE:
+   case BRW_OPCODE_RNDU:
+   case BRW_OPCODE_RNDZ:
+   case BRW_OPCODE_SAD2:
+   case BRW_OPCODE_SADA2:
+   case BRW_OPCODE_SHL:
+   case BRW_OPCODE_SHR:
+   case BRW_OPCODE_SUBB:
+   case BRW_OPCODE_XOR:
+   case FS_OPCODE_CINTERP:
+   case FS_OPCODE_LINTERP:
+      supported = true;
+      break;
+   default:
+      break;
+   }
+
+   return supported;
+}
+
+/** True for MAC, MACH and SADA2; false for every other opcode. */
+bool
+backend_instruction::reads_accumulator_implicitly() const
+{
+   return opcode == BRW_OPCODE_MAC ||
+          opcode == BRW_OPCODE_MACH ||
+          opcode == BRW_OPCODE_SADA2;
+}
+
+/**
+ * Report whether this instruction writes the accumulator without naming it
+ * as a destination.
+ *
+ * That is the case when the writes_accumulator flag is set or, on gen < 6,
+ * when the opcode lies in [BRW_OPCODE_ADD, BRW_OPCODE_NOP) or in
+ * [FS_OPCODE_DDX_COARSE, FS_OPCODE_LINTERP] excluding FS_OPCODE_CINTERP.
+ */
+bool
+backend_instruction::writes_accumulator_implicitly(const struct gen_device_info *devinfo) const
+{
+   if (writes_accumulator)
+      return true;
+
+   if (devinfo->gen >= 6)
+      return false;
+
+   const bool in_hw_alu_range =
+      opcode >= BRW_OPCODE_ADD && opcode < BRW_OPCODE_NOP;
+   const bool in_fs_virtual_range =
+      opcode >= FS_OPCODE_DDX_COARSE && opcode <= FS_OPCODE_LINTERP &&
+      opcode != FS_OPCODE_CINTERP;
+
+   return in_hw_alu_range || in_fs_virtual_range;
+}
+
+/**
+ * Report whether the opcode is on the list of operations treated as having
+ * side effects (atomics, surface/URB/framebuffer/scratch writes, fences and
+ * barriers, TCS input release).
+ */
+bool
+backend_instruction::has_side_effects() const
+{
+   bool side_effects = false;
+
+   switch (opcode) {
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+   case SHADER_OPCODE_TYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+   case SHADER_OPCODE_MEMORY_FENCE:
+   case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+   case FS_OPCODE_FB_WRITE:
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+   case SHADER_OPCODE_BARRIER:
+   case TCS_OPCODE_URB_WRITE:
+   case TCS_OPCODE_RELEASE_INPUT:
+      side_effects = true;
+      break;
+   default:
+      break;
+   }
+
+   return side_effects;
+}
+
+/**
+ * Report whether the opcode is one of the surface or URB read operations
+ * listed below.
+ */
+bool
+backend_instruction::is_volatile() const
+{
+   bool volatile_read = false;
+
+   switch (opcode) {
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_URB_READ_SIMD8:
+   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
+   case VEC4_OPCODE_URB_READ:
+      volatile_read = true;
+      break;
+   default:
+      break;
+   }
+
+   return volatile_read;
+}
+
+#ifndef NDEBUG
+/**
+ * Assertion helper: walk the instructions of \p block looking for \p inst.
+ * Only compiled when assertions are enabled.
+ */
+static bool
+inst_is_in_block(const bblock_t *block, const backend_instruction *inst)
+{
+   foreach_inst_in_block (backend_instruction, candidate, block) {
+      if (candidate == inst)
+         return true;
+   }
+
+   return false;
+}
+#endif
+
+/**
+ * Add \p ip_adjustment to start_ip and end_ip of every block that follows
+ * \p start_block.  \p start_block itself is not touched.
+ */
+static void
+adjust_later_block_ips(bblock_t *start_block, int ip_adjustment)
+{
+   bblock_t *later = start_block->next();
+
+   while (later) {
+      later->start_ip += ip_adjustment;
+      later->end_ip += ip_adjustment;
+      later = later->next();
+   }
+}
+
+/**
+ * Link \p inst into the list right after this instruction and grow the IP
+ * range of \p block (and shift every later block) by one.
+ *
+ * Unless this node is the head sentinel, it must already belong to
+ * \p block.
+ */
+void
+backend_instruction::insert_after(bblock_t *block, backend_instruction *inst)
+{
+   assert(this != inst);
+   assert(this->is_head_sentinel() || inst_is_in_block(block, this) ||
+          !"Instruction not in block");
+
+   block->end_ip += 1;
+   adjust_later_block_ips(block, 1);
+
+   exec_node::insert_after(inst);
+}
+
+/**
+ * Link \p inst into the list right before this instruction and grow the IP
+ * range of \p block (and shift every later block) by one.
+ *
+ * Unless this node is the tail sentinel, it must already belong to
+ * \p block.
+ */
+void
+backend_instruction::insert_before(bblock_t *block, backend_instruction *inst)
+{
+   assert(this != inst);
+   assert(this->is_tail_sentinel() || inst_is_in_block(block, this) ||
+          !"Instruction not in block");
+
+   block->end_ip += 1;
+   adjust_later_block_ips(block, 1);
+
+   exec_node::insert_before(inst);
+}
+
+/**
+ * Splice every node of \p list in front of this instruction.  The IP range
+ * of \p block grows by the length of the list and every later block is
+ * shifted by the same amount.
+ */
+void
+backend_instruction::insert_before(bblock_t *block, exec_list *list)
+{
+   assert(inst_is_in_block(block, this) || !"Instruction not in block");
+
+   /* Measure the list before the splice below. */
+   unsigned inserted = list->length();
+
+   block->end_ip += inserted;
+   adjust_later_block_ips(block, inserted);
+
+   exec_node::insert_before(list);
+}
+
+/**
+ * Unlink this instruction from \p block.
+ *
+ * Later blocks are shifted down by one IP.  When start_ip equals end_ip the
+ * block itself is handed to cfg->remove_block(); otherwise its end_ip
+ * shrinks by one.
+ */
+void
+backend_instruction::remove(bblock_t *block)
+{
+   assert(inst_is_in_block(block, this) || !"Instruction not in block");
+
+   adjust_later_block_ips(block, -1);
+
+   if (block->start_ip != block->end_ip) {
+      block->end_ip--;
+   } else {
+      block->cfg->remove_block(block);
+   }
+
+   exec_node::remove();
+}
+
+/** Dump the instruction stream with no file name (NULL) given. */
+void
+backend_shader::dump_instructions()
+{
+   const char *const no_file_name = NULL;
+
+   dump_instructions(no_file_name);
+}
+
+/**
+ * Dump every instruction, walking the CFG when one exists and the flat
+ * instruction list otherwise.
+ *
+ * Output goes to the file \p name when a name is given, the effective user
+ * is not root and the file can be opened for writing; in every other case
+ * it goes to stderr.  Each instruction is prefixed with a running index
+ * unless DEBUG_OPTIMIZER is set in INTEL_DEBUG.
+ */
+void
+backend_shader::dump_instructions(const char *name)
+{
+   FILE *out = stderr;
+
+   if (name && geteuid() != 0) {
+      FILE *named = fopen(name, "w");
+      if (named)
+         out = named;
+   }
+
+   int ip = 0;
+
+   if (!cfg) {
+      foreach_in_list(backend_instruction, inst, &instructions) {
+         if (!unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
+            fprintf(out, "%4d: ", ip++);
+         dump_instruction(inst, out);
+      }
+   } else {
+      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
+         if (!unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
+            fprintf(out, "%4d: ", ip++);
+         dump_instruction(inst, out);
+      }
+   }
+
+   if (out != stderr)
+      fclose(out);
+}
+
+/**
+ * Build the CFG from the instruction list on first use; later calls are
+ * no-ops while a CFG is present.
+ */
+void
+backend_shader::calculate_cfg()
+{
+   if (cfg == NULL)
+      cfg = new(mem_ctx) cfg_t(&instructions);
+}
+
+/**
+ * Compile a tessellation evaluation shader.
+ *
+ * Works on a clone of src_shader allocated from mem_ctx, fills in prog_data
+ * (VUE map, clip/cull distance masks, URB entry size, partitioning, domain
+ * and output topology) and then generates code through either the scalar
+ * back-end (fs_visitor at width 8) or the vec4 back-end, depending on
+ * compiler->scalar_stage[MESA_SHADER_TESS_EVAL].
+ *
+ * On success the result of the code generator is returned and its size is
+ * reported through final_assembly_size.  On failure (outputs larger than
+ * GEN7_MAX_DS_URB_ENTRY_SIZE_BYTES, or the visitor's run fails) NULL is
+ * returned and, when error_str is non-NULL, a message allocated from
+ * mem_ctx is stored in *error_str.
+ */
+extern "C" const unsigned *
+brw_compile_tes(const struct brw_compiler *compiler,
+ void *log_data,
+ void *mem_ctx,
+ const struct brw_tes_prog_key *key,
+ const struct brw_vue_map *input_vue_map,
+ struct brw_tes_prog_data *prog_data,
+ const nir_shader *src_shader,
+ struct gl_program *prog,
+ int shader_time_index,
+ unsigned *final_assembly_size,
+ char **error_str)
+{
+ const struct gen_device_info *devinfo = compiler->devinfo;
+ const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_EVAL];
+
+ /* Never modify the caller's shader: all lowering runs on a private clone
+ * whose input masks are overridden from the program key.
+ */
+ nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
+ nir->info->inputs_read = key->inputs_read;
+ nir->info->patch_inputs_read = key->patch_inputs_read;
+
+ nir = brw_nir_apply_sampler_key(nir, compiler, &key->tex, is_scalar);
+ brw_nir_lower_tes_inputs(nir, input_vue_map);
+ brw_nir_lower_vue_outputs(nir, is_scalar);
+ nir = brw_postprocess_nir(nir, compiler, is_scalar);
+
+ brw_compute_vue_map(devinfo, &prog_data->base.vue_map,
+ nir->info->outputs_written,
+ nir->info->separate_shader);
+
+ /* NOTE(review): 4 * 4 is presumably four 4-byte components per VUE slot
+ * -- confirm.
+ */
+ unsigned output_size_bytes = prog_data->base.vue_map.num_slots * 4 * 4;
+
+ assert(output_size_bytes >= 1);
+ if (output_size_bytes > GEN7_MAX_DS_URB_ENTRY_SIZE_BYTES) {
+ if (error_str)
+ *error_str = ralloc_strdup(mem_ctx, "DS outputs exceed maximum size");
+ return NULL;
+ }
+
+ /* The cull-distance bits sit directly above the clip-distance bits. */
+ prog_data->base.clip_distance_mask =
+ ((1 << nir->info->clip_distance_array_size) - 1);
+ prog_data->base.cull_distance_mask =
+ ((1 << nir->info->cull_distance_array_size) - 1) <<
+ nir->info->clip_distance_array_size;
+
+ /* URB entry sizes are stored as a multiple of 64 bytes. */
+ prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+ prog_data->base.urb_read_length = 0;
+
+ /* The cast below relies on the hardware partitioning enum being the
+ * TESS_SPACING_* value minus one.
+ */
+ STATIC_ASSERT(BRW_TESS_PARTITIONING_INTEGER == TESS_SPACING_EQUAL - 1);
+ STATIC_ASSERT(BRW_TESS_PARTITIONING_ODD_FRACTIONAL ==
+ TESS_SPACING_FRACTIONAL_ODD - 1);
+ STATIC_ASSERT(BRW_TESS_PARTITIONING_EVEN_FRACTIONAL ==
+ TESS_SPACING_FRACTIONAL_EVEN - 1);
+
+ prog_data->partitioning =
+ (enum brw_tess_partitioning) (nir->info->tess.spacing - 1);
+
+ switch (nir->info->tess.primitive_mode) {
+ case GL_QUADS:
+ prog_data->domain = BRW_TESS_DOMAIN_QUAD;
+ break;
+ case GL_TRIANGLES:
+ prog_data->domain = BRW_TESS_DOMAIN_TRI;
+ break;
+ case GL_ISOLINES:
+ prog_data->domain = BRW_TESS_DOMAIN_ISOLINE;
+ break;
+ default:
+ unreachable("invalid domain shader primitive mode");
+ }
+
+ if (nir->info->tess.point_mode) {
+ prog_data->output_topology = BRW_TESS_OUTPUT_TOPOLOGY_POINT;
+ } else if (nir->info->tess.primitive_mode == GL_ISOLINES) {
+ prog_data->output_topology = BRW_TESS_OUTPUT_TOPOLOGY_LINE;
+ } else {
+ /* Hardware winding order is backwards from OpenGL */
+ prog_data->output_topology =
+ nir->info->tess.ccw ? BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW
+ : BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW;
+ }
+
+ if (unlikely(INTEL_DEBUG & DEBUG_TES)) {
+ fprintf(stderr, "TES Input ");
+ brw_print_vue_map(stderr, input_vue_map);
+ fprintf(stderr, "TES Output ");
+ brw_print_vue_map(stderr, &prog_data->base.vue_map);
+ }
+
+ if (is_scalar) {
+ /* Scalar back-end: always compiled and dispatched at SIMD8. */
+ fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
+ &prog_data->base.base, NULL, nir, 8,
+ shader_time_index, input_vue_map);
+ if (!v.run_tes()) {
+ if (error_str)
+ *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+ return NULL;
+ }
+
+ prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
+ prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+
+ fs_generator g(compiler, log_data, mem_ctx, (void *) key,
+ &prog_data->base.base, v.promoted_constants, false,
+ MESA_SHADER_TESS_EVAL);
+ if (unlikely(INTEL_DEBUG & DEBUG_TES)) {
+ g.enable_debug(ralloc_asprintf(mem_ctx,
+ "%s tessellation evaluation shader %s",
+ nir->info->label ? nir->info->label
+ : "unnamed",
+ nir->info->name));
+ }
+
+ g.generate_code(v.cfg, 8);
+
+ return g.get_assembly(final_assembly_size);
+ } else {
+ /* vec4 back-end. */
+ brw::vec4_tes_visitor v(compiler, log_data, key, prog_data,
+ nir, mem_ctx, shader_time_index);
+ if (!v.run()) {
+ if (error_str)
+ *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+ return NULL;
+ }
+
+ if (unlikely(INTEL_DEBUG & DEBUG_TES))
+ v.dump_instructions();
+
+ return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
+ &prog_data->base, v.cfg,
+ final_assembly_size);
+ }
+}
diff --git a/src/intel/compiler/brw_shader.h b/src/intel/compiler/brw_shader.h
new file mode 100644
index 00000000000..5a253e66570
--- /dev/null
+++ b/src/intel/compiler/brw_shader.h
@@ -0,0 +1,295 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include "brw_reg.h"
+#include "brw_compiler.h"
+#include "brw_eu_defines.h"
+#include "brw_inst.h"
+#include "compiler/nir/nir.h"
+
+#ifdef __cplusplus
+#include "brw_ir_allocator.h"
+#endif
+
+#define MAX_SAMPLER_MESSAGE_SIZE 11
+#define MAX_VGRF_SIZE 16
+
+#ifdef __cplusplus
+/**
+ * Register representation shared by the back-end IRs.
+ *
+ * brw_reg is inherited privately; only the fields re-exported through the
+ * using-declarations at the bottom are visible to users.  A byte offset is
+ * added on top of the brw_reg fields.
+ */
+struct backend_reg : private brw_reg
+{
+ backend_reg() {}
+ backend_reg(const struct brw_reg &reg) : brw_reg(reg) {}
+
+ /**
+ * View this register as a plain brw_reg.  Asserts that the file is one of
+ * ARF, FIXED_GRF, MRF or IMM and that no offset is set.
+ */
+ const brw_reg &as_brw_reg() const
+ {
+ assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM);
+ assert(offset == 0);
+ return static_cast<const brw_reg &>(*this);
+ }
+
+ /** Mutable variant of as_brw_reg(), with the same assertions. */
+ brw_reg &as_brw_reg()
+ {
+ assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM);
+ assert(offset == 0);
+ return static_cast<brw_reg &>(*this);
+ }
+
+ bool equals(const backend_reg &r) const;
+
+ /* Queries on the register's value or identity. */
+ bool is_zero() const;
+ bool is_one() const;
+ bool is_negative_one() const;
+ bool is_null() const;
+ bool is_accumulator() const;
+
+ /** Offset from the start of the (virtual) register in bytes. */
+ uint16_t offset;
+
+ /* Fields of the private brw_reg base that are re-exported as public. */
+ using brw_reg::type;
+ using brw_reg::file;
+ using brw_reg::negate;
+ using brw_reg::abs;
+ using brw_reg::address_mode;
+ using brw_reg::subnr;
+ using brw_reg::nr;
+
+ using brw_reg::swizzle;
+ using brw_reg::writemask;
+ using brw_reg::indirect_offset;
+ using brw_reg::vstride;
+ using brw_reg::width;
+ using brw_reg::hstride;
+
+ using brw_reg::df;
+ using brw_reg::f;
+ using brw_reg::d;
+ using brw_reg::ud;
+};
+#endif
+
+struct cfg_t;
+struct bblock_t;
+
+/*
+ * Instruction base type shared by the back-end IRs.
+ *
+ * The declaration has two heads: in C++ the struct derives from exec_node
+ * and carries member functions, while in C it starts with an embedded
+ * struct exec_node.  The data members after the #endif are common to both.
+ */
+#ifdef __cplusplus
+struct backend_instruction : public exec_node {
+ /* Opcode classification queries. */
+ bool is_3src(const struct gen_device_info *devinfo) const;
+ bool is_tex() const;
+ bool is_math() const;
+ bool is_control_flow() const;
+ bool is_commutative() const;
+ bool can_do_source_mods() const;
+ bool can_do_saturate() const;
+ bool can_do_cmod() const;
+ bool reads_accumulator_implicitly() const;
+ bool writes_accumulator_implicitly(const struct gen_device_info *devinfo) const;
+
+ /* List editing that also takes the owning basic block as an argument. */
+ void remove(bblock_t *block);
+ void insert_after(bblock_t *block, backend_instruction *inst);
+ void insert_before(bblock_t *block, backend_instruction *inst);
+ void insert_before(bblock_t *block, exec_list *list);
+
+ /**
+ * True if the instruction has side effects other than writing to
+ * its destination registers. You are expected not to reorder or
+ * optimize these out unless you know what you are doing.
+ */
+ bool has_side_effects() const;
+
+ /**
+ * True if the instruction might be affected by side effects of other
+ * instructions.
+ */
+ bool is_volatile() const;
+#else
+struct backend_instruction {
+ struct exec_node link;
+#endif
+ /** @{
+ * Annotation for the generated IR. One of the two can be set.
+ */
+ const void *ir;
+ const char *annotation;
+ /** @} */
+
+ /**
+ * Execution size of the instruction. This is used by the generator to
+ * generate the correct binary for the given instruction. Current valid
+ * values are 1, 4, 8, 16, 32.
+ */
+ uint8_t exec_size;
+
+ /**
+ * Channel group from the hardware execution and predication mask that
+ * should be applied to the instruction. The subset of channel enable
+ * signals (calculated from the EU control flow and predication state)
+ * given by [group, group + exec_size) will be used to mask GRF writes and
+ * any other side effects of the instruction.
+ */
+ uint8_t group;
+
+ uint32_t offset; /**< spill/unspill offset or texture offset bitfield */
+ uint8_t mlen; /**< SEND message length */
+ int8_t base_mrf; /**< First MRF in the SEND message, if mlen is nonzero. */
+ uint8_t target; /**< MRT target. */
+ unsigned size_written; /**< Data written to the destination register in bytes. */
+
+ enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */
+ enum brw_conditional_mod conditional_mod; /**< BRW_CONDITIONAL_* */
+ enum brw_predicate predicate;
+ /* One-bit flags packed into a bitfield. */
+ bool predicate_inverse:1;
+ bool writes_accumulator:1; /**< instruction implicitly writes accumulator */
+ bool force_writemask_all:1;
+ bool no_dd_clear:1;
+ bool no_dd_check:1;
+ bool saturate:1;
+ bool shadow_compare:1;
+
+ /* Chooses which flag subregister (f0.0 or f0.1) is used for conditional
+ * mod and predication.
+ */
+ unsigned flag_subreg:1;
+
+ /** The number of hardware registers used for a message header. */
+ uint8_t header_size;
+};
+
+#ifdef __cplusplus
+
+/**
+ * Modes accepted by the instruction scheduler.
+ *
+ * NOTE(review): the names suggest three pre-register-allocation heuristics
+ * and one post-allocation mode; the scheduler itself is not visible here,
+ * so confirm before relying on that reading.
+ */
+enum instruction_scheduler_mode {
+ SCHEDULE_PRE,
+ SCHEDULE_PRE_NON_LIFO,
+ SCHEDULE_PRE_LIFO,
+ SCHEDULE_POST,
+};
+
+/**
+ * Abstract base for the per-back-end shader visitors.
+ *
+ * It holds the compile-wide state (compiler, device info, NIR shader,
+ * prog_data, memory context), the instruction list and its optional CFG.
+ * The constructor is protected and several members are pure virtual, so
+ * only derived classes can be instantiated.
+ */
+struct backend_shader {
+protected:
+
+ backend_shader(const struct brw_compiler *compiler,
+ void *log_data,
+ void *mem_ctx,
+ const nir_shader *shader,
+ struct brw_stage_prog_data *stage_prog_data);
+
+public:
+
+ const struct brw_compiler *compiler;
+ void *log_data; /* Passed to compiler->*_log functions */
+
+ const struct gen_device_info * const devinfo;
+ const nir_shader *nir;
+ struct brw_stage_prog_data * const stage_prog_data;
+
+ /** ralloc context for temporary data used during compile */
+ void *mem_ctx;
+
+ /**
+ * List of either fs_inst or vec4_instruction (inheriting from
+ * backend_instruction)
+ */
+ exec_list instructions;
+
+ /** Control flow graph; NULL until calculate_cfg() builds it. */
+ cfg_t *cfg;
+
+ gl_shader_stage stage;
+ /** Whether the INTEL_DEBUG flag for this shader stage is set. */
+ bool debug_enabled;
+ const char *stage_name;
+ const char *stage_abbrev;
+
+ brw::simple_allocator alloc;
+
+ /* Instruction printing: the per-instruction forms are supplied by the
+ * derived class, the whole-stream forms have default implementations.
+ */
+ virtual void dump_instruction(backend_instruction *inst) = 0;
+ virtual void dump_instruction(backend_instruction *inst, FILE *file) = 0;
+ virtual void dump_instructions();
+ virtual void dump_instructions(const char *name);
+
+ /** Build the CFG from the instruction list if none exists yet. */
+ void calculate_cfg();
+
+ virtual void invalidate_live_intervals() = 0;
+};
+
+bool brw_texture_offset(int *offsets,
+ unsigned num_components,
+ uint32_t *offset_bits);
+
+void brw_setup_image_uniform_values(gl_shader_stage stage,
+ struct brw_stage_prog_data *stage_prog_data,
+ unsigned param_start_index,
+ const gl_uniform_storage *storage);
+
+#else
+struct backend_shader;
+#endif /* __cplusplus */
+
+enum brw_reg_type brw_type_for_base_type(const struct glsl_type *type);
+enum brw_conditional_mod brw_conditional_for_comparison(unsigned int op);
+uint32_t brw_math_function(enum opcode op);
+const char *brw_instruction_name(const struct gen_device_info *devinfo,
+ enum opcode op);
+/* Compile-time folding of the saturate, negate and abs modifiers into an
+ * immediate of the given type.  Each function edits *reg in place and
+ * returns true when it did so.
+ */
+bool brw_saturate_immediate(enum brw_reg_type type, struct brw_reg *reg);
+bool brw_negate_immediate(enum brw_reg_type type, struct brw_reg *reg);
+bool brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg);
+
+bool opt_predicated_break(struct backend_shader *s);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* brw_fs_reg_allocate.cpp */
+void brw_fs_alloc_reg_sets(struct brw_compiler *compiler);
+
+/* brw_vec4_reg_allocate.cpp */
+void brw_vec4_alloc_reg_set(struct brw_compiler *compiler);
+
+/* brw_disasm.c */
+extern const char *const conditional_modifier[16];
+extern const char *const pred_ctrl_align16[16];
+
+/* Per-thread scratch space is a power-of-two multiple of 1KB. */
+/* Returns the larger of 1024 and util_next_power_of_two(size). */
+static inline int
+brw_get_scratch_size(int size)
+{
+ return MAX2(1024, util_next_power_of_two(size));
+}
+
+/**
+ * Scratch data used when compiling a GLSL geometry shader.
+ */
+struct brw_gs_compile
+{
+ /* Program key and input VUE map for the geometry shader being compiled. */
+ struct brw_gs_prog_key key;
+ struct brw_vue_map input_vue_map;
+
+ /* NOTE(review): both sizes are in bits going by the names; the code that
+ * fills them in is not visible here -- confirm.
+ */
+ unsigned control_data_bits_per_vertex;
+ unsigned control_data_header_size_bits;
+};
+
+unsigned get_atomic_counter_op(nir_intrinsic_op op);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
new file mode 100644
index 00000000000..d7c09093032
--- /dev/null
+++ b/src/intel/compiler/brw_vec4.cpp
@@ -0,0 +1,2851 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_nir.h"
+#include "brw_vec4_builder.h"
+#include "brw_vec4_live_variables.h"
+#include "brw_vec4_vs.h"
+#include "brw_dead_control_flow.h"
+#include "common/gen_debug.h"
+#include "program/prog_parameter.h"
+
+#define MAX_INSTRUCTION (1 << 30)
+
+using namespace brw;
+
+namespace brw {
+
+/** Zeroes the whole register description, then marks it unset (BAD_FILE). */
+void
+src_reg::init()
+{
+   memset(this, 0, sizeof(src_reg));
+   file = BAD_FILE;
+}
+
+/**
+ * Builds a source register for the given file and register number.
+ *
+ * When a GLSL type is supplied, the register type is derived from it, and
+ * scalar/vector/matrix types get a swizzle sized to their vector width.
+ * In every other case the swizzle is the identity XYZW.
+ */
+src_reg::src_reg(enum brw_reg_file file, int nr, const glsl_type *type)
+{
+   init();
+
+   this->file = file;
+   this->nr = nr;
+   this->swizzle = BRW_SWIZZLE_XYZW;
+
+   /* Without a type there is nothing more to derive. */
+   if (type == NULL)
+      return;
+
+   if (type->is_scalar() || type->is_vector() || type->is_matrix())
+      this->swizzle = brw_swizzle_for_size(type->vector_elements);
+
+   this->type = brw_type_for_base_type(type);
+}
+
+/** Default constructor: yields an unset register (see init()). */
+src_reg::src_reg()
+{
+   this->init();
+}
+
+/** Wraps a raw brw_reg: no byte offset and no relative addressing. */
+src_reg::src_reg(struct ::brw_reg reg)
+   : backend_reg(reg)
+{
+   reladdr = NULL;
+   offset = 0;
+}
+
+/**
+ * Converts a destination register into a source register: the swizzle is
+ * derived from the destination's writemask and reladdr is carried over.
+ */
+src_reg::src_reg(const dst_reg &reg)
+   : backend_reg(reg)
+{
+   swizzle = brw_swizzle_for_mask(reg.writemask);
+   reladdr = reg.reladdr;
+}
+
+/**
+ * Zeroes the whole register description, then marks it unset (BAD_FILE)
+ * with all four channels enabled in the writemask.
+ */
+void
+dst_reg::init()
+{
+   memset(this, 0, sizeof(dst_reg));
+   writemask = WRITEMASK_XYZW;
+   file = BAD_FILE;
+}
+
+/** Default constructor: yields an unset register (see init()). */
+dst_reg::dst_reg()
+{
+   this->init();
+}
+
+/** Builds a destination in the given file/number with init()'s defaults. */
+dst_reg::dst_reg(enum brw_reg_file file, int nr)
+{
+   init();
+
+   this->nr = nr;
+   this->file = file;
+}
+
+/**
+ * Builds a destination in the given file/number whose register type is
+ * derived from a GLSL type and whose writemask is given explicitly.
+ */
+dst_reg::dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
+                 unsigned writemask)
+{
+   init();
+
+   this->nr = nr;
+   this->file = file;
+   this->writemask = writemask;
+   this->type = brw_type_for_base_type(type);
+}
+
+/**
+ * Builds a destination in the given file/number with an explicit hardware
+ * register type and writemask.
+ */
+dst_reg::dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
+                 unsigned writemask)
+{
+   init();
+
+   this->nr = nr;
+   this->file = file;
+   this->writemask = writemask;
+   this->type = type;
+}
+
+/** Wraps a raw brw_reg: no byte offset and no relative addressing. */
+dst_reg::dst_reg(struct ::brw_reg reg)
+   : backend_reg(reg)
+{
+   reladdr = NULL;
+   offset = 0;
+}
+
+/**
+ * Converts a source register into a destination register: the writemask is
+ * derived from the source's swizzle and reladdr is carried over.
+ */
+dst_reg::dst_reg(const src_reg &reg)
+   : backend_reg(reg)
+{
+   reladdr = reg.reladdr;
+   writemask = brw_mask_for_swizzle(reg.swizzle);
+}
+
+/**
+ * Two destinations are equal when their backend_reg parts match and their
+ * relative addresses are either the same pointer (including both NULL) or
+ * both present and themselves equal.
+ */
+bool
+dst_reg::equals(const dst_reg &r) const
+{
+   if (!backend_reg::equals(r))
+      return false;
+
+   if (reladdr == r.reladdr)
+      return true;
+
+   if (!reladdr || !r.reladdr)
+      return false;
+
+   return reladdr->equals(*r.reladdr);
+}
+
+/**
+ * Reports whether this opcode is one of the messages listed below, which
+ * (per the function name) take their payload from GRF registers.  Every
+ * other opcode yields false.
+ */
+bool
+vec4_instruction::is_send_from_grf()
+{
+   bool sends_from_grf = false;
+
+   switch (opcode) {
+   /* Surface and atomic messages. */
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_TYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_SURFACE_READ:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   /* URB and tessellation-control messages. */
+   case VEC4_OPCODE_URB_READ:
+   case TCS_OPCODE_URB_WRITE:
+   case TCS_OPCODE_RELEASE_INPUT:
+   /* Everything else on the list. */
+   case SHADER_OPCODE_SHADER_TIME_ADD:
+   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
+   case SHADER_OPCODE_BARRIER:
+      sends_from_grf = true;
+      break;
+   default:
+      break;
+   }
+
+   return sends_from_grf;
+}
+
+/**
+ * Returns true if this instruction's sources and destination cannot safely
+ * share a register.
+ *
+ * Normally a register may be overwritten by the instruction that is its
+ * last use, because a single instruction dereferences its sources before
+ * it starts writing its destination.  That breaks down for:
+ *
+ * - Virtual opcodes that the code generator expands into several
+ *   instructions: with src == dst, an early instruction may write the
+ *   destination before a later one reads the source, clobbering it.
+ *
+ * - 8-wide compressed DF operations, which execute as two 4-wide
+ *   operations: the first half could overwrite a source of the second half.
+ *
+ * The register allocator uses this to add conflicts between GRF sources
+ * and the destination.
+ */
+bool
+vec4_instruction::has_source_and_destination_hazard() const
+{
+   const bool multi_instruction_opcode =
+      opcode == TCS_OPCODE_SET_INPUT_URB_OFFSETS ||
+      opcode == TCS_OPCODE_SET_OUTPUT_URB_OFFSETS ||
+      opcode == TES_OPCODE_ADD_INDIRECT_URB_OFFSET;
+
+   if (multi_instruction_opcode)
+      return true;
+
+   /* Anything writing more than one register is treated as compressed, so
+    * the allocator picks non-overlapping regions for dst and srcs.
+    */
+   return size_written > REG_SIZE;
+}
+
+/**
+ * Returns the number of bytes this instruction reads from source \p arg.
+ *
+ * For the message opcodes below, the payload source is read as a whole
+ * message (mlen registers).  Otherwise the size follows from the source's
+ * file and type.
+ */
+unsigned
+vec4_instruction::size_read(unsigned arg) const
+{
+   /* Does \p arg name the message payload of this opcode? */
+   bool reads_payload;
+
+   switch (opcode) {
+   case SHADER_OPCODE_SHADER_TIME_ADD:
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_TYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_SURFACE_READ:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case TCS_OPCODE_URB_WRITE:
+      reads_payload = (arg == 0);
+      break;
+   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
+      reads_payload = (arg == 1);
+      break;
+   default:
+      reads_payload = false;
+      break;
+   }
+
+   if (reads_payload)
+      return mlen * REG_SIZE;
+
+   const src_reg &source = src[arg];
+
+   if (source.file == BAD_FILE)
+      return 0;
+
+   /* Immediates and uniforms are counted as four components. */
+   if (source.file == IMM || source.file == UNIFORM)
+      return 4 * type_sz(source.type);
+
+   /* XXX - Represent actual vertical stride. */
+   return exec_size * type_sz(source.type);
+}
+
+/**
+ * Reports whether source modifiers may be applied to this instruction.
+ * They are refused for gen6 math, for sends from the GRF, and whenever
+ * the generic backend_instruction check refuses them.
+ */
+bool
+vec4_instruction::can_do_source_mods(const struct gen_device_info *devinfo)
+{
+   return !(devinfo->gen == 6 && is_math()) &&
+          !is_send_from_grf() &&
+          backend_instruction::can_do_source_mods();
+}
+
+/**
+ * Reports whether this instruction's destination may use a partial
+ * writemask.  The listed opcodes, gen6 math and texturing instructions
+ * may not.
+ */
+bool
+vec4_instruction::can_do_writemask(const struct gen_device_info *devinfo)
+{
+   switch (opcode) {
+   case SHADER_OPCODE_GEN4_SCRATCH_READ:
+   case SHADER_OPCODE_MOV_INDIRECT:
+   case VEC4_OPCODE_FROM_DOUBLE:
+   case VEC4_OPCODE_TO_DOUBLE:
+   case VEC4_OPCODE_PICK_LOW_32BIT:
+   case VEC4_OPCODE_PICK_HIGH_32BIT:
+   case VEC4_OPCODE_SET_LOW_32BIT:
+   case VEC4_OPCODE_SET_HIGH_32BIT:
+   case VEC4_OPCODE_URB_READ:
+   case VS_OPCODE_PULL_CONSTANT_LOAD:
+   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
+   case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+   case TES_OPCODE_CREATE_INPUT_READ_HEADER:
+   case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
+      return false;
+   default:
+      break;
+   }
+
+   /* The MATH instruction on Gen6 only executes in align1 mode, which does
+    * not support writemasking.  Texturing instructions are refused too.
+    */
+   return !(devinfo->gen == 6 && is_math()) && !is_tex();
+}
+
+/**
+ * Reports whether this instruction is a plain data move whose types can be
+ * changed: an unmodified, unsaturated, same-typed MOV, or a predicated SEL
+ * whose second source is likewise same-typed and unmodified.
+ */
+bool
+vec4_instruction::can_change_types() const
+{
+   /* Any conversion, source modifier or saturate rules it out. */
+   if (dst.type != src[0].type || src[0].abs || src[0].negate || saturate)
+      return false;
+
+   if (opcode == BRW_OPCODE_MOV)
+      return true;
+
+   return opcode == BRW_OPCODE_SEL &&
+          dst.type == src[1].type &&
+          predicate != BRW_PREDICATE_NONE &&
+          !src[1].abs && !src[1].negate;
+}
+
+/**
+ * Returns how many MRFs an opcode will write over.
+ *
+ * This is not the 0 or 1 implied writes of an actual gen instruction:
+ * the generate_* functions emit additional MOVs for setup, and those are
+ * what is counted here.  Instructions without a message, or that send
+ * from the GRF, write none.
+ */
+int
+vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
+{
+   if (inst->mlen == 0 || inst->is_send_from_grf())
+      return 0;
+
+   switch (inst->opcode) {
+   case GS_OPCODE_URB_WRITE:
+   case GS_OPCODE_URB_WRITE_ALLOCATE:
+   case GS_OPCODE_THREAD_END:
+   case TCS_OPCODE_URB_WRITE:
+   case SHADER_OPCODE_SHADER_TIME_ADD:
+      return 0;
+
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+   case VS_OPCODE_URB_WRITE:
+   case GS_OPCODE_FF_SYNC:
+      return 1;
+
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+   case SHADER_OPCODE_POW:
+   case TCS_OPCODE_THREAD_END:
+   case VS_OPCODE_PULL_CONSTANT_LOAD:
+   case SHADER_OPCODE_GEN4_SCRATCH_READ:
+      return 2;
+
+   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+      return 3;
+
+   /* Sampler-style messages write only their header, if they have one. */
+   case SHADER_OPCODE_TEX:
+   case SHADER_OPCODE_TXL:
+   case SHADER_OPCODE_TXD:
+   case SHADER_OPCODE_TXF:
+   case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_CMS_W:
+   case SHADER_OPCODE_TXF_MCS:
+   case SHADER_OPCODE_TXS:
+   case SHADER_OPCODE_TG4:
+   case SHADER_OPCODE_TG4_OFFSET:
+   case SHADER_OPCODE_SAMPLEINFO:
+   case VS_OPCODE_GET_BUFFER_SIZE:
+      return inst->header_size;
+
+   default:
+      unreachable("not reached");
+   }
+}
+
+/**
+ * Two sources are equal only when their backend_reg parts match and
+ * neither of them uses relative addressing.
+ */
+bool
+src_reg::equals(const src_reg &r) const
+{
+   const bool same_register = backend_reg::equals(r);
+   const bool both_direct = (reladdr == NULL) && (r.reladdr == NULL);
+
+   return same_register && both_direct;
+}
+
+bool
+vec4_visitor::opt_vector_float()
+{ /* Within each basic block, collapses a run of consecutive unpredicated
+   * partial-writemask MOVs of immediates into the same destination
+   * register (same file, number and offset) into one MOV of a packed VF
+   * immediate.  Returns true if any run was replaced, in which case live
+   * intervals are invalidated.
+   *
+   * NOTE(review): a run is only flushed when a later instruction in the
+   * same block breaks it; a run still open when the block ends is left
+   * untouched by this loop.
+   */
+ bool progress = false;
+
+ foreach_block(block, cfg) {
+ int last_reg = -1, last_offset = -1;
+ enum brw_reg_file last_reg_file = BAD_FILE;
+
+ uint8_t imm[4] = { 0 }; /* per-channel VF byte gathered for the current run */
+ int inst_count = 0; /* number of MOVs gathered in the current run */
+ vec4_instruction *imm_inst[4]; /* NOTE(review): inst_count is not bounded against this size here -- confirm a run can never exceed four MOVs */
+ unsigned writemask = 0; /* union of the run's writemasks */
+ enum brw_reg_type dest_type = BRW_REGISTER_TYPE_F;
+
+ foreach_inst_in_block_safe(vec4_instruction, inst, block) {
+ int vf = -1; /* -1: this instruction contributes no VF value */
+ enum brw_reg_type need_type; /* only read when vf > 0, by which point it has been assigned */
+
+ /* Look for unconditional MOVs from an immediate with a partial
+ * writemask. Skip type-conversion MOVs other than integer 0,
+ * where the type doesn't matter. See if the immediate can be
+ * represented as a VF.
+ */
+ if (inst->opcode == BRW_OPCODE_MOV &&
+ inst->src[0].file == IMM &&
+ inst->predicate == BRW_PREDICATE_NONE &&
+ inst->dst.writemask != WRITEMASK_XYZW &&
+ type_sz(inst->src[0].type) < 8 &&
+ (inst->src[0].type == inst->dst.type || inst->src[0].d == 0)) {
+
+ vf = brw_float_to_vf(inst->src[0].d); /* first try the integer value */
+ need_type = BRW_REGISTER_TYPE_D;
+
+ if (vf == -1) {
+ vf = brw_float_to_vf(inst->src[0].f); /* then the float value */
+ need_type = BRW_REGISTER_TYPE_F;
+ }
+ } else {
+ last_reg = -1; /* not a candidate: guarantees the flush check below fires */
+ }
+
+ /* If this wasn't a MOV, or the destination register doesn't match,
+ * or we have to switch destination types, then this breaks our
+ * sequence. Combine anything we've accumulated so far.
+ *
+ * A VF value of 0 never forces a type switch (hence the vf > 0 test).
+ */
+ if (last_reg != inst->dst.nr ||
+ last_offset != inst->dst.offset ||
+ last_reg_file != inst->dst.file ||
+ (vf > 0 && dest_type != need_type)) {
+
+ if (inst_count > 1) {
+ unsigned vf;
+ memcpy(&vf, imm, sizeof(vf)); /* pack the four bytes into one 32-bit immediate */
+ vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(vf));
+ mov->dst.type = dest_type;
+ mov->dst.writemask = writemask;
+ inst->insert_before(block, mov); /* placed before the instruction that ended the run */
+
+ for (int i = 0; i < inst_count; i++) {
+ imm_inst[i]->remove(block);
+ }
+
+ progress = true;
+ }
+
+ inst_count = 0;
+ last_reg = -1;
+ writemask = 0;
+ dest_type = BRW_REGISTER_TYPE_F;
+
+ for (int i = 0; i < 4; i++) {
+ imm[i] = 0;
+ }
+ }
+
+ /* Record this instruction's value (if it was representable). */
+ if (vf != -1) {
+ if ((inst->dst.writemask & WRITEMASK_X) != 0)
+ imm[0] = vf;
+ if ((inst->dst.writemask & WRITEMASK_Y) != 0)
+ imm[1] = vf;
+ if ((inst->dst.writemask & WRITEMASK_Z) != 0)
+ imm[2] = vf;
+ if ((inst->dst.writemask & WRITEMASK_W) != 0)
+ imm[3] = vf;
+
+ writemask |= inst->dst.writemask;
+ imm_inst[inst_count++] = inst;
+
+ last_reg = inst->dst.nr;
+ last_offset = inst->dst.offset;
+ last_reg_file = inst->dst.file;
+ if (vf > 0)
+ dest_type = need_type;
+ }
+ }
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
+/* Rewrites source swizzles so that channels which are never read select a
+ * channel that is read.
+ *
+ * For instance, this pass transforms
+ *
+ *    mov vgrf4.yz, vgrf5.wxzy
+ *
+ * into
+ *
+ *    mov vgrf4.yz, vgrf5.xxzx
+ *
+ * which removes false uses of some channels and lets dead code elimination
+ * drop the instructions that wrote them.  Returns true if any swizzle was
+ * changed.
+ */
+bool
+vec4_visitor::opt_reduce_swizzle()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      if (inst->dst.file == BAD_FILE ||
+          inst->dst.file == ARF ||
+          inst->dst.file == FIXED_GRF ||
+          inst->is_send_from_grf())
+         continue;
+
+      /* Work out which source channels the instruction actually reads. */
+      unsigned read_swizzle;
+
+      switch (inst->opcode) {
+      case BRW_OPCODE_DP2:
+         read_swizzle = brw_swizzle_for_size(2);
+         break;
+
+      case BRW_OPCODE_DP3:
+         read_swizzle = brw_swizzle_for_size(3);
+         break;
+
+      case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of
+                            * src0, but all four of src1.
+                            */
+      case BRW_OPCODE_DP4:
+      case VEC4_OPCODE_PACK_BYTES:
+      case VEC4_OPCODE_TO_DOUBLE:
+      case VEC4_OPCODE_FROM_DOUBLE:
+      case VEC4_OPCODE_PICK_LOW_32BIT:
+      case VEC4_OPCODE_PICK_HIGH_32BIT:
+      case VEC4_OPCODE_SET_LOW_32BIT:
+      case VEC4_OPCODE_SET_HIGH_32BIT:
+         read_swizzle = brw_swizzle_for_size(4);
+         break;
+
+      default:
+         read_swizzle = brw_swizzle_for_mask(inst->dst.writemask);
+         break;
+      }
+
+      /* Compose that with each eligible source's own swizzle. */
+      for (int i = 0; i < 3; i++) {
+         src_reg &source = inst->src[i];
+
+         const bool eligible = source.file == VGRF ||
+                               source.file == ATTR ||
+                               source.file == UNIFORM;
+         if (!eligible)
+            continue;
+
+         const unsigned composed =
+            brw_compose_swizzle(read_swizzle, source.swizzle);
+         if (source.swizzle == composed)
+            continue;
+
+         source.swizzle = composed;
+         progress = true;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Folds whole-vec4 byte offsets of UNIFORM sources into their register
+ * numbers.
+ *
+ * Until now uniforms lived in an array sized by the number of vector
+ * uniforms and filled sparsely, so an aggregate caused register indices to
+ * be skipped.  After this pass every .nr refers to exactly one vector,
+ * which makes later elimination of unused uniform components easier.
+ */
+void
+vec4_visitor::split_uniform_registers()
+{
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (int i = 0; i < 3; i++) {
+         src_reg &source = inst->src[i];
+
+         if (source.file == UNIFORM) {
+            assert(!source.reladdr);
+
+            /* 16 bytes per vec4: whole vectors move into .nr, the
+             * remainder stays in .offset.
+             */
+            source.nr += source.offset / 16;
+            source.offset %= 16;
+         }
+      }
+   }
+}
+
+void
+vec4_visitor::pack_uniform_registers()
+{ /* Repacks the uniform vec4 slots that instructions actually read into
+   * the lowest slots that have room, moves the matching
+   * stage_prog_data->param entries along with them, shrinks this->uniforms
+   * to the new slot count, and finally rewrites every UNIFORM source
+   * (register number and swizzle) to point at its new location.
+   */
+ uint8_t chans_used[this->uniforms]; /* per original slot: count of leading 32-bit channels in use (0 = unused) */
+ int new_loc[this->uniforms]; /* per original slot: slot it was packed into */
+ int new_chan[this->uniforms]; /* per original slot: first channel inside new_loc */
+
+ memset(chans_used, 0, sizeof(chans_used));
+ memset(new_loc, 0, sizeof(new_loc));
+ memset(new_chan, 0, sizeof(new_chan));
+
+ /* Find which uniform vectors are actually used by the program. We
+ * expect unused vector elements when we've moved array access out
+ * to pull constants, and from some GLSL code generators like wine.
+ */
+ foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+ unsigned readmask; /* which of the four source channels this opcode reads */
+ switch (inst->opcode) {
+ case VEC4_OPCODE_PACK_BYTES:
+ case BRW_OPCODE_DP4:
+ case BRW_OPCODE_DPH:
+ readmask = 0xf;
+ break;
+ case BRW_OPCODE_DP3:
+ readmask = 0x7;
+ break;
+ case BRW_OPCODE_DP2:
+ readmask = 0x3;
+ break;
+ default:
+ readmask = inst->dst.writemask;
+ break;
+ }
+
+ for (int i = 0 ; i < 3; i++) {
+ if (inst->src[i].file != UNIFORM)
+ continue;
+
+ assert(type_sz(inst->src[i].type) % 4 == 0);
+ unsigned channel_size = type_sz(inst->src[i].type) / 4; /* size of one component in 32-bit channels */
+
+ int reg = inst->src[i].nr;
+ for (int c = 0; c < 4; c++) {
+ if (!(readmask & (1 << c)))
+ continue;
+
+ unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1; /* components needed to cover the one selected */
+ unsigned used = MAX2(chans_used[reg], channel * channel_size);
+ if (used <= 4)
+ chans_used[reg] = used;
+ else
+ chans_used[reg + 1] = used - 4; /* only reachable when channel_size > 1; NOTE(review): reg + 1 is not range-checked here and chans_used[reg] is left as is -- confirm intended */
+ }
+ }
+
+ if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
+ inst->src[0].file == UNIFORM) {
+ assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
+ assert(inst->src[0].subnr == 0);
+
+ unsigned bytes_read = inst->src[2].ud;
+ assert(bytes_read % 4 == 0);
+ unsigned vec4s_read = DIV_ROUND_UP(bytes_read, 16);
+
+ /* We just mark every register touched by a MOV_INDIRECT as being
+ * fully used. This ensures that it doesn't get broken up piecewise by
+ * the next part of our packing algorithm.
+ */
+ int reg = inst->src[0].nr;
+ for (unsigned i = 0; i < vec4s_read; i++)
+ chans_used[reg + i] = 4;
+ }
+ }
+
+ int new_uniform_count = 0;
+
+ /* Now, figure out a packing of the live uniform vectors into our
+ * push constants.
+ */
+ for (int src = 0; src < uniforms; src++) {
+ int size = chans_used[src];
+
+ if (size == 0)
+ continue;
+
+ int dst;
+ /* Find the lowest place we can slot this uniform in. */
+ for (dst = 0; dst < src; dst++) {
+ if (chans_used[dst] + size <= 4)
+ break;
+ }
+
+ if (src == dst) { /* no earlier slot had room: the uniform stays in place */
+ new_loc[src] = dst;
+ new_chan[src] = 0;
+ } else {
+ new_loc[src] = dst;
+ new_chan[src] = chans_used[dst]; /* append after the channels already used in dst */
+
+ /* Move the references to the data */
+ for (int j = 0; j < size; j++) {
+ stage_prog_data->param[dst * 4 + new_chan[src] + j] =
+ stage_prog_data->param[src * 4 + j];
+ }
+
+ chans_used[dst] += size;
+ chans_used[src] = 0;
+ }
+
+ new_uniform_count = MAX2(new_uniform_count, dst + 1);
+ }
+
+ this->uniforms = new_uniform_count;
+
+ /* Now, update the instructions for our repacked uniforms. */
+ foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+ for (int i = 0 ; i < 3; i++) {
+ int src = inst->src[i].nr; /* only used as an index once the file check below passes */
+
+ if (inst->src[i].file != UNIFORM)
+ continue;
+
+ inst->src[i].nr = new_loc[src];
+ inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
+ new_chan[src], new_chan[src]); /* shifts each swizzle component by new_chan */
+ }
+ }
+}
+
+/**
+ * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
+ *
+ * While GLSL IR also performs this optimization, we end up with it in
+ * our instruction stream for a couple of reasons.  One is that we
+ * sometimes generate silly instructions, for example in array access
+ * where we'll generate "ADD offset, index, base" even if base is 0.
+ * The other is that GLSL IR's constant propagation doesn't track the
+ * components of aggregates, so some VS patterns (initialize matrix to
+ * 0, accumulate in vertex blending factors) end up breaking down to
+ * instructions involving 0.
+ *
+ * Besides those identities this also folds saturate into immediate MOVs,
+ * turns UNPACK_UNIFORM of a non-uniform into a MOV, rewrites
+ * "CMP.ge -|a|, 0" as "CMP.z a, 0", and turns trivial BROADCASTs into MOVs.
+ *
+ * \return true if any instruction was changed (live intervals are
+ *         invalidated in that case).
+ */
+bool
+vec4_visitor::opt_algebraic()
+{
+   bool progress = false;
+
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+         if (inst->src[0].file != IMM)
+            break;
+
+         if (inst->saturate) {
+            if (inst->dst.type != inst->src[0].type) {
+               /* Folding interprets the immediate using the destination
+                * type, which is only valid when both types agree.  The
+                * assert flags this in debug builds; in release builds
+                * (where assert compiles away) skip the fold and leave the
+                * saturating MOV as it is rather than fold a misread value.
+                */
+               assert(!"unimplemented: saturate mixed types");
+               break;
+            }
+
+            if (brw_saturate_immediate(inst->dst.type,
+                                       &inst->src[0].as_brw_reg())) {
+               inst->saturate = false;
+               progress = true;
+            }
+         }
+         break;
+
+      case VEC4_OPCODE_UNPACK_UNIFORM:
+         if (inst->src[0].file != UNIFORM) {
+            inst->opcode = BRW_OPCODE_MOV;
+            progress = true;
+         }
+         break;
+
+      case BRW_OPCODE_ADD:
+         /* a + 0 = a */
+         if (inst->src[1].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = src_reg();
+            progress = true;
+         }
+         break;
+
+      case BRW_OPCODE_MUL:
+         if (inst->src[1].is_zero()) {
+            /* a * 0 = 0: move a zero immediate of src0's type. */
+            inst->opcode = BRW_OPCODE_MOV;
+            switch (inst->src[0].type) {
+            case BRW_REGISTER_TYPE_F:
+               inst->src[0] = brw_imm_f(0.0f);
+               break;
+            case BRW_REGISTER_TYPE_D:
+               inst->src[0] = brw_imm_d(0);
+               break;
+            case BRW_REGISTER_TYPE_UD:
+               inst->src[0] = brw_imm_ud(0u);
+               break;
+            default:
+               unreachable("not reached");
+            }
+            inst->src[1] = src_reg();
+            progress = true;
+         } else if (inst->src[1].is_one()) {
+            /* a * 1 = a */
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = src_reg();
+            progress = true;
+         } else if (inst->src[1].is_negative_one()) {
+            /* a * -1 = -a: flip the negate modifier on the remaining source. */
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0].negate = !inst->src[0].negate;
+            inst->src[1] = src_reg();
+            progress = true;
+         }
+         break;
+
+      case BRW_OPCODE_CMP:
+         /* -|a| >= 0 holds exactly when a == 0, so compare a against zero
+          * directly and drop the source modifiers.
+          */
+         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
+             inst->src[0].abs &&
+             inst->src[0].negate &&
+             inst->src[1].is_zero()) {
+            inst->src[0].abs = false;
+            inst->src[0].negate = false;
+            inst->conditional_mod = BRW_CONDITIONAL_Z;
+            progress = true;
+         }
+         break;
+
+      case SHADER_OPCODE_BROADCAST:
+         if (is_uniform(inst->src[0]) ||
+             inst->src[1].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = src_reg();
+            inst->force_writemask_all = true;
+            progress = true;
+         }
+         break;
+
+      default:
+         break;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Only a limited number of hardware registers may be used for push
+ * constants, so this turns accesses to the uniforms that overflow that
+ * limit into pull-constant loads, then repacks the remaining push
+ * constants.
+ */
+void
+vec4_visitor::move_push_constants_to_pull_constants()
+{
+   int pull_constant_loc[this->uniforms];
+
+   /* Only allow 32 registers (256 uniform components) as push constants,
+    * which is the limit on gen6.
+    *
+    * If changing this value, note the limitation about total_regs in
+    * brw_curbe.c.
+    */
+   const int max_uniform_components = 32 * 8;
+   if (this->uniforms * 4 <= max_uniform_components)
+      return;
+
+   /* Make some sort of choice as to which uniforms get sent to pull
+    * constants.  We could potentially do something clever here like
+    * look for the most infrequently used uniform vec4s, but leave
+    * that for later.  For now every vec4 past the limit is pulled.
+    */
+   for (int vec4_idx = 0; vec4_idx < this->uniforms; vec4_idx++) {
+      const int first_component = vec4_idx * 4;
+
+      pull_constant_loc[vec4_idx] = -1;
+
+      if (first_component < max_uniform_components)
+         continue;
+
+      const gl_constant_value **values =
+         &stage_prog_data->param[first_component];
+
+      /* Try to find an existing copy of this uniform in the pull
+       * constants if it was part of an array access already.
+       */
+      for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
+         int matches = 0;
+
+         while (matches < 4 &&
+                stage_prog_data->pull_param[j + matches] == values[matches])
+            matches++;
+
+         if (matches == 4) {
+            pull_constant_loc[vec4_idx] = j / 4;
+            break;
+         }
+      }
+
+      if (pull_constant_loc[vec4_idx] != -1)
+         continue;
+
+      /* No existing copy: append this vec4 to the pull parameters. */
+      assert(stage_prog_data->nr_pull_params % 4 == 0);
+      pull_constant_loc[vec4_idx] = stage_prog_data->nr_pull_params / 4;
+
+      for (int j = 0; j < 4; j++) {
+         stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
+            values[j];
+      }
+   }
+
+   /* Now actually rewrite usage of the things we've moved to pull
+    * constants.
+    */
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      for (int i = 0; i < 3; i++) {
+         src_reg &source = inst->src[i];
+
+         if (source.file != UNIFORM ||
+             pull_constant_loc[source.nr] == -1)
+            continue;
+
+         const int uniform = source.nr;
+
+         /* 64-bit sources are loaded into a dvec4 temporary, everything
+          * else into a vec4.
+          */
+         const glsl_type *temp_type = type_sz(source.type) == 8 ?
+            glsl_type::dvec4_type : glsl_type::vec4_type;
+         dst_reg temp = dst_reg(this, temp_type);
+
+         emit_pull_constant_load(block, inst, temp, source,
+                                 pull_constant_loc[uniform], src_reg());
+
+         /* Redirect the source at the freshly loaded temporary. */
+         source.file = temp.file;
+         source.nr = temp.nr;
+         source.offset %= 16;
+         source.reladdr = NULL;
+      }
+   }
+
+   /* Repack push constants to remove the now-unused ones. */
+   pack_uniform_registers();
+}
+
+/* Returns true for instructions on which the dependency control bits must
+ * not be set.
+ */
+bool
+vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
+{
+#define IS_DWORD(reg) \
+   (reg.type == BRW_REGISTER_TYPE_UD || \
+    reg.type == BRW_REGISTER_TYPE_D)
+
+#define IS_64BIT(reg) (reg.file != BAD_FILE && type_sz(reg.type) == 8)
+
+   /* From the Cherryview and Broadwell PRMs:
+    *
+    * "When source or destination datatype is 64b or operation is integer DWord
+    * multiply, DepCtrl must not be used."
+    *
+    * SKL PRMs don't include this restriction, however, gen7 seems to be
+    * affected, at least by the 64b restriction, since DepCtrl with double
+    * precision instructions seems to produce GPU hangs in some cases.
+    */
+   if ((devinfo->gen == 8 || devinfo->is_broxton) &&
+       inst->opcode == BRW_OPCODE_MUL &&
+       IS_DWORD(inst->src[0]) &&
+       IS_DWORD(inst->src[1]))
+      return true;
+
+   if (devinfo->gen >= 7 && devinfo->gen <= 8 &&
+       (IS_64BIT(inst->dst) || IS_64BIT(inst->src[0]) ||
+        IS_64BIT(inst->src[1]) || IS_64BIT(inst->src[2])))
+      return true;
+
+#undef IS_64BIT
+#undef IS_DWORD
+
+   /* F32TO16 is excluded on gen8 and later. */
+   if (devinfo->gen >= 8 && inst->opcode == BRW_OPCODE_F32TO16)
+      return true;
+
+   /*
+    * mlen:
+    * In the presence of send messages, totally interrupt dependency
+    * control.  They're long enough that the chance of dependency
+    * control around them just doesn't matter.
+    *
+    * predicate:
+    * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
+    * When a sequence of NoDDChk and NoDDClr are used, the last instruction that
+    * completes the scoreboard clear must have a non-zero execution mask. This
+    * means, if any kind of predication can change the execution mask or channel
+    * enable of the last instruction, the optimization must be avoided.  This is
+    * to avoid instructions being shot down the pipeline when no writes are
+    * required.
+    *
+    * math:
+    * Dependency control does not work well over math instructions.
+    * NB: Discovered empirically
+    */
+   return inst->mlen || inst->predicate || inst->is_math();
+}
+
+/**
+ * Sets the dependency control fields on instructions after register
+ * allocation and before the generator is run.
+ *
+ * When you have a sequence of instructions like:
+ *
+ * DP4 temp.x vertex uniform[0]
+ * DP4 temp.y vertex uniform[0]
+ * DP4 temp.z vertex uniform[0]
+ * DP4 temp.w vertex uniform[0]
+ *
+ * The hardware doesn't know that it can actually run the later instructions
+ * while the previous ones are in flight, producing stalls. However, we have
+ * manual fields we can set in the instructions that let it do so.
+ *
+ * Per basic block, this remembers the last instruction that wrote each GRF
+ * and MRF together with the channels written since tracking of that
+ * register started.  When a later instruction writes the same register at
+ * the same offset with a writemask disjoint from those channels, the
+ * earlier instruction gets no_dd_clear and the later one no_dd_check.
+ * Tracking of a register is dropped when it is read, and all tracking is
+ * dropped at any instruction for which is_dep_ctrl_unsafe() returns true.
+ */
+void
+vec4_visitor::opt_set_dependency_control()
+{
+ vec4_instruction *last_grf_write[BRW_MAX_GRF]; /* last tracked writer of each GRF, or NULL */
+ uint8_t grf_channels_written[BRW_MAX_GRF]; /* only meaningful while the matching last_grf_write entry is non-NULL */
+ vec4_instruction *last_mrf_write[BRW_MAX_GRF]; /* same bookkeeping for MRFs */
+ uint8_t mrf_channels_written[BRW_MAX_GRF];
+
+ assert(prog_data->total_grf ||
+ !"Must be called after register allocation");
+
+ foreach_block (block, cfg) {
+ memset(last_grf_write, 0, sizeof(last_grf_write)); /* tracking never crosses a block boundary */
+ memset(last_mrf_write, 0, sizeof(last_mrf_write));
+
+ foreach_inst_in_block (vec4_instruction, inst, block) {
+ /* If we read from a register that we were doing dependency control
+ * on, don't do dependency control across the read.
+ */
+ for (int i = 0; i < 3; i++) {
+ int reg = inst->src[i].nr + inst->src[i].offset / REG_SIZE;
+ if (inst->src[i].file == VGRF) {
+ last_grf_write[reg] = NULL;
+ } else if (inst->src[i].file == FIXED_GRF) {
+ memset(last_grf_write, 0, sizeof(last_grf_write)); /* a fixed GRF read drops all GRF tracking */
+ break;
+ }
+ assert(inst->src[i].file != MRF);
+ }
+
+ if (is_dep_ctrl_unsafe(inst)) {
+ memset(last_grf_write, 0, sizeof(last_grf_write));
+ memset(last_mrf_write, 0, sizeof(last_mrf_write));
+ continue;
+ }
+
+ /* Now, see if we can do dependency control for this instruction
+ * against a previous one writing to its destination.
+ */
+ int reg = inst->dst.nr + inst->dst.offset / REG_SIZE;
+ if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
+ if (last_grf_write[reg] &&
+ last_grf_write[reg]->dst.offset == inst->dst.offset &&
+ !(inst->dst.writemask & grf_channels_written[reg])) {
+ last_grf_write[reg]->no_dd_clear = true;
+ inst->no_dd_check = true;
+ } else {
+ grf_channels_written[reg] = 0; /* start a fresh sequence for this register */
+ }
+
+ last_grf_write[reg] = inst;
+ grf_channels_written[reg] |= inst->dst.writemask;
+ } else if (inst->dst.file == MRF) {
+ if (last_mrf_write[reg] &&
+ last_mrf_write[reg]->dst.offset == inst->dst.offset &&
+ !(inst->dst.writemask & mrf_channels_written[reg])) {
+ last_mrf_write[reg]->no_dd_clear = true;
+ inst->no_dd_check = true;
+ } else {
+ mrf_channels_written[reg] = 0; /* start a fresh sequence for this register */
+ }
+
+ last_mrf_write[reg] = inst;
+ mrf_channels_written[reg] |= inst->dst.writemask;
+ }
+ }
+ }
+}
+
+/**
+ * Reports whether reswizzle() may be applied to this instruction with the
+ * given destination writemask and swizzle.  \p swizzle_mask is the set of
+ * channels the swizzle references.
+ */
+bool
+vec4_instruction::can_reswizzle(const struct gen_device_info *devinfo,
+                                int dst_writemask,
+                                int swizzle,
+                                int swizzle_mask)
+{
+   /* Gen6 MATH instructions can not execute in align16 mode, so only the
+    * identity swizzle is allowed on them.
+    */
+   const bool gen6_math = devinfo->gen == 6 && is_math();
+   if (gen6_math && swizzle != BRW_SWIZZLE_XYZW)
+      return false;
+
+   /* A partial writemask requires an instruction that supports one. */
+   if (!can_do_writemask(devinfo) && dst_writemask != WRITEMASK_XYZW)
+      return false;
+
+   /* If this instruction sets anything not referenced by swizzle, then we'd
+    * totally break it when we reswizzle.
+    */
+   if ((dst.writemask & ~swizzle_mask) != 0)
+      return false;
+
+   /* Instructions carrying a message are left alone. */
+   if (mlen > 0)
+      return false;
+
+   /* Neither are instructions reading the accumulator. */
+   const bool reads_accumulator = src[0].is_accumulator() ||
+                                  src[1].is_accumulator() ||
+                                  src[2].is_accumulator();
+
+   return !reads_accumulator;
+}
+
+/**
+ * For any channels in the swizzle's source that were populated by this
+ * instruction, rewrite the instruction to put the appropriate result
+ * directly in those channels.
+ *
+ * Each register source has \p swizzle composed onto its own swizzle, and
+ * the destination writemask becomes the swizzled original mask restricted
+ * to \p dst_writemask.
+ */
+void
+vec4_instruction::reswizzle(int dst_writemask, int swizzle)
+{
+   switch (opcode) {
+   case BRW_OPCODE_DP4:
+   case BRW_OPCODE_DPH:
+   case BRW_OPCODE_DP3:
+   case BRW_OPCODE_DP2:
+   case VEC4_OPCODE_PACK_BYTES:
+      /* Destination write mask doesn't correspond to source swizzle for
+       * the dot product and pack_bytes instructions, so their sources are
+       * left as they are.
+       */
+      break;
+
+   default:
+      for (int i = 0; i < 3; i++) {
+         if (src[i].file != BAD_FILE && src[i].file != IMM)
+            src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle);
+      }
+      break;
+   }
+
+   /* Apply the specified swizzle and writemask to the original mask of
+    * written components.
+    */
+   dst.writemask =
+      dst_writemask & brw_apply_swizzle_to_mask(swizzle, dst.writemask);
+}
+
+/*
+ * Tries to reduce extra MOV instructions by taking temporary GRFs that get
+ * just written and then MOVed into another reg and making the original write
+ * of the GRF write directly to the final destination instead.
+ */
+bool
+vec4_visitor::opt_register_coalesce()
+{
+ bool progress = false;
+ int next_ip = 0;
+
+ calculate_live_intervals();
+
+ foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
+ int ip = next_ip;
+ next_ip++;
+
+ if (inst->opcode != BRW_OPCODE_MOV ||
+ (inst->dst.file != VGRF && inst->dst.file != MRF) ||
+ inst->predicate ||
+ inst->src[0].file != VGRF ||
+ inst->dst.type != inst->src[0].type ||
+ inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
+ continue;
+
+ /* Remove no-op MOVs */
+ if (inst->dst.file == inst->src[0].file &&
+ inst->dst.nr == inst->src[0].nr &&
+ inst->dst.offset == inst->src[0].offset) {
+ bool is_nop_mov = true;
+
+ for (unsigned c = 0; c < 4; c++) {
+ if ((inst->dst.writemask & (1 << c)) == 0)
+ continue;
+
+ if (BRW_GET_SWZ(inst->src[0].swizzle, c) != c) {
+ is_nop_mov = false;
+ break;
+ }
+ }
+
+ if (is_nop_mov) {
+ inst->remove(block);
+ progress = true;
+ continue;
+ }
+ }
+
+ bool to_mrf = (inst->dst.file == MRF);
+
+ /* Can't coalesce this GRF if someone else was going to
+ * read it later.
+ */
+ if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip)
+ continue;
+
+ /* We need to check interference with the final destination between this
+ * instruction and the earliest instruction involved in writing the GRF
+ * we're eliminating. To do that, keep track of which of our source
+ * channels we've seen initialized.
+ */
+ const unsigned chans_needed =
+ brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
+ inst->dst.writemask);
+ unsigned chans_remaining = chans_needed;
+
+ /* Now walk up the instruction stream trying to see if we can rewrite
+ * everything writing to the temporary to write into the destination
+ * instead.
+ */
+ vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
+ foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
+ inst) {
+ _scan_inst = scan_inst;
+
+ if (regions_overlap(inst->src[0], inst->size_read(0),
+ scan_inst->dst, scan_inst->size_written)) {
+ /* Found something writing to the reg we want to coalesce away. */
+ if (to_mrf) {
+ /* SEND instructions can't have MRF as a destination. */
+ if (scan_inst->mlen)
+ break;
+
+ if (devinfo->gen == 6) {
+ /* gen6 math instructions must have the destination be
+ * VGRF, so no compute-to-MRF for them.
+ */
+ if (scan_inst->is_math()) {
+ break;
+ }
+ }
+ }
+
+ /* This doesn't handle saturation on the instruction we
+ * want to coalesce away if the register types do not match.
+ * But if scan_inst is a non type-converting 'mov', we can fix
+ * the types later.
+ */
+ if (inst->saturate &&
+ inst->dst.type != scan_inst->dst.type &&
+ !(scan_inst->opcode == BRW_OPCODE_MOV &&
+ scan_inst->dst.type == scan_inst->src[0].type))
+ break;
+
+ /* Only allow coalescing between registers of the same type size.
+ * Otherwise we would need to make the pass aware of the fact that
+ * channel sizes are different for single and double precision.
+ */
+ if (type_sz(inst->src[0].type) != type_sz(scan_inst->src[0].type))
+ break;
+
+ /* Check that scan_inst writes the same amount of data as the
+ * instruction, otherwise coalescing would lead to writing a
+ * different (larger or smaller) region of the destination
+ */
+ if (scan_inst->size_written != inst->size_written)
+ break;
+
+ /* If we can't handle the swizzle, bail. */
+ if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
+ inst->src[0].swizzle,
+ chans_needed)) {
+ break;
+ }
+
+ /* This only handles coalescing writes of 8 channels (1 register
+ * for single-precision and 2 registers for double-precision)
+ * starting at the source offset of the copy instruction.
+ */
+ if (DIV_ROUND_UP(scan_inst->size_written,
+ type_sz(scan_inst->dst.type)) > 8 ||
+ scan_inst->dst.offset != inst->src[0].offset)
+ break;
+
+ /* Mark which channels we found unconditional writes for. */
+ if (!scan_inst->predicate)
+ chans_remaining &= ~scan_inst->dst.writemask;
+
+ if (chans_remaining == 0)
+ break;
+ }
+
+ /* You can't read from an MRF, so if someone else reads our MRF's
+ * source GRF that we wanted to rewrite, that stops us. If it's a
+ * GRF we're trying to coalesce to, we don't actually handle
+ * rewriting sources so bail in that case as well.
+ */
+ bool interfered = false;
+ for (int i = 0; i < 3; i++) {
+ if (regions_overlap(inst->src[0], inst->size_read(0),
+ scan_inst->src[i], scan_inst->size_read(i)))
+ interfered = true;
+ }
+ if (interfered)
+ break;
+
+ /* If somebody else writes the same channels of our destination here,
+ * we can't coalesce before that.
+ */
+ if (regions_overlap(inst->dst, inst->size_written,
+ scan_inst->dst, scan_inst->size_written) &&
+ (inst->dst.writemask & scan_inst->dst.writemask) != 0) {
+ break;
+ }
+
+ /* Check for reads of the register we're trying to coalesce into. We
+ * can't go rewriting instructions above that to put some other value
+ * in the register instead.
+ */
+ if (to_mrf && scan_inst->mlen > 0) {
+ if (inst->dst.nr >= scan_inst->base_mrf &&
+ inst->dst.nr < scan_inst->base_mrf + scan_inst->mlen) {
+ break;
+ }
+ } else {
+ for (int i = 0; i < 3; i++) {
+ if (regions_overlap(inst->dst, inst->size_written,
+ scan_inst->src[i], scan_inst->size_read(i)))
+ interfered = true;
+ }
+ if (interfered)
+ break;
+ }
+ }
+
+ if (chans_remaining == 0) {
+ /* If we've made it here, we have an MOV we want to coalesce out, and
+ * a scan_inst pointing to the earliest instruction involved in
+ * computing the value. Now go rewrite the instruction stream
+ * between the two.
+ */
+ vec4_instruction *scan_inst = _scan_inst;
+ while (scan_inst != inst) {
+ if (scan_inst->dst.file == VGRF &&
+ scan_inst->dst.nr == inst->src[0].nr &&
+ scan_inst->dst.offset == inst->src[0].offset) {
+ scan_inst->reswizzle(inst->dst.writemask,
+ inst->src[0].swizzle);
+ scan_inst->dst.file = inst->dst.file;
+ scan_inst->dst.nr = inst->dst.nr;
+ scan_inst->dst.offset = inst->dst.offset;
+ if (inst->saturate &&
+ inst->dst.type != scan_inst->dst.type) {
+ /* If we have reached this point, scan_inst is a non
+ * type-converting 'mov' and we can modify its register types
+ * to match the ones in inst. Otherwise, we could have an
+ * incorrect saturation result.
+ */
+ scan_inst->dst.type = inst->dst.type;
+ scan_inst->src[0].type = inst->src[0].type;
+ }
+ scan_inst->saturate |= inst->saturate;
+ }
+ scan_inst = (vec4_instruction *)scan_inst->next;
+ }
+ inst->remove(block);
+ progress = true;
+ }
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
+/**
+ * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
+ * flow. We could probably do better here with some form of divergence
+ * analysis.
+ */
+bool
+vec4_visitor::eliminate_find_live_channel()
+{
+ bool progress = false;
+ unsigned depth = 0;
+
+ if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
+ /* The optimization below assumes that channel zero is live on thread
+ * dispatch, which may not be the case if the fixed function dispatches
+ * threads sparsely.
+ */
+ return false;
+ }
+
+ foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+ switch (inst->opcode) {
+ case BRW_OPCODE_IF:
+ case BRW_OPCODE_DO:
+ depth++;
+ break;
+
+ case BRW_OPCODE_ENDIF:
+ case BRW_OPCODE_WHILE:
+ depth--;
+ break;
+
+ case SHADER_OPCODE_FIND_LIVE_CHANNEL:
+ if (depth == 0) {
+ inst->opcode = BRW_OPCODE_MOV;
+ inst->src[0] = brw_imm_d(0);
+ inst->force_writemask_all = true;
+ progress = true;
+ }
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ return progress;
+}
+
+/**
+ * Splits virtual GRFs requesting more than one contiguous physical register.
+ *
+ * We initially create large virtual GRFs for temporary structures, arrays,
+ * and matrices, so that the visitor functions can add offsets to work their
+ * way down to the actual member being accessed. But when it comes to
+ * optimization, we'd like to treat each register as individual storage if
+ * possible.
+ *
+ * So far, the only thing that might prevent splitting is a send message from
+ * a GRF on IVB.
+ */
+void
+vec4_visitor::split_virtual_grfs()
+{
+ int num_vars = this->alloc.count;
+ int new_virtual_grf[num_vars];
+ bool split_grf[num_vars];
+
+ memset(new_virtual_grf, 0, sizeof(new_virtual_grf));
+
+ /* Try to split anything > 0 sized. */
+ for (int i = 0; i < num_vars; i++) {
+ split_grf[i] = this->alloc.sizes[i] != 1;
+ }
+
+ /* Check that the instructions are compatible with the registers we're trying
+ * to split.
+ */
+ foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+ if (inst->dst.file == VGRF && regs_written(inst) > 1)
+ split_grf[inst->dst.nr] = false;
+
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].file == VGRF && regs_read(inst, i) > 1)
+ split_grf[inst->src[i].nr] = false;
+ }
+ }
+
+ /* Allocate new space for split regs. Note that the virtual
+ * numbers will be contiguous.
+ */
+ for (int i = 0; i < num_vars; i++) {
+ if (!split_grf[i])
+ continue;
+
+ new_virtual_grf[i] = alloc.allocate(1);
+ for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
+ unsigned reg = alloc.allocate(1);
+ assert(reg == new_virtual_grf[i] + j - 1);
+ (void) reg;
+ }
+ this->alloc.sizes[i] = 1;
+ }
+
+ foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+ if (inst->dst.file == VGRF && split_grf[inst->dst.nr] &&
+ inst->dst.offset / REG_SIZE != 0) {
+ inst->dst.nr = (new_virtual_grf[inst->dst.nr] +
+ inst->dst.offset / REG_SIZE - 1);
+ inst->dst.offset %= REG_SIZE;
+ }
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].file == VGRF && split_grf[inst->src[i].nr] &&
+ inst->src[i].offset / REG_SIZE != 0) {
+ inst->src[i].nr = (new_virtual_grf[inst->src[i].nr] +
+ inst->src[i].offset / REG_SIZE - 1);
+ inst->src[i].offset %= REG_SIZE;
+ }
+ }
+ }
+ invalidate_live_intervals();
+}
+
+ /**
+  * Convenience overload: print \p be_inst to stderr.
+  */
+ void
+ vec4_visitor::dump_instruction(backend_instruction *be_inst)
+ {
+    FILE *const out = stderr;
+
+    dump_instruction(be_inst, out);
+ }
+ /**
+ * Print a one-line, human-readable rendering of \p be_inst (treated as a
+ * vec4_instruction) to \p file, terminated by a newline.
+ *
+ * In output order: the optional predicate, the opcode name with the
+ * execution size, the ".sat" and conditional-modifier suffixes, the
+ * destination, up to three sources (the loop stops at the first BAD_FILE
+ * source) and finally the " NoMask" and " group<N>" annotations.
+ */
+ void
+ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
+ {
+ vec4_instruction *inst = (vec4_instruction *)be_inst;
+
+ if (inst->predicate) { /* "(+f0.N<ctrl>) " or "(-f0.N<ctrl>) " when the predicate is inverted */
+ fprintf(file, "(%cf0.%d%s) ",
+ inst->predicate_inverse ? '-' : '+',
+ inst->flag_subreg,
+ pred_ctrl_align16[inst->predicate]);
+ }
+
+ fprintf(file, "%s(%d)", brw_instruction_name(devinfo, inst->opcode),
+ inst->exec_size);
+ if (inst->saturate)
+ fprintf(file, ".sat");
+ if (inst->conditional_mod) {
+ fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
+ if (!inst->predicate && /* flag subregister is only printed for unpredicated instructions... */
+ (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL && /* ...and, on gen5+, not for SEL/IF/WHILE */
+ inst->opcode != BRW_OPCODE_IF &&
+ inst->opcode != BRW_OPCODE_WHILE))) {
+ fprintf(file, ".f0.%d", inst->flag_subreg);
+ }
+ }
+ fprintf(file, " ");
+
+ switch (inst->dst.file) { /* destination register name */
+ case VGRF:
+ fprintf(file, "vgrf%d", inst->dst.nr);
+ break;
+ case FIXED_GRF:
+ fprintf(file, "g%d", inst->dst.nr);
+ break;
+ case MRF:
+ fprintf(file, "m%d", inst->dst.nr);
+ break;
+ case ARF:
+ switch (inst->dst.nr) {
+ case BRW_ARF_NULL:
+ fprintf(file, "null");
+ break;
+ case BRW_ARF_ADDRESS:
+ fprintf(file, "a0.%d", inst->dst.subnr);
+ break;
+ case BRW_ARF_ACCUMULATOR:
+ fprintf(file, "acc%d", inst->dst.subnr);
+ break;
+ case BRW_ARF_FLAG:
+ fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+ break;
+ default:
+ fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+ break;
+ }
+ break;
+ case BAD_FILE:
+ fprintf(file, "(null)");
+ break;
+ case IMM: /* these files are not expected as a destination */
+ case ATTR:
+ case UNIFORM:
+ unreachable("not reached");
+ }
+ if (inst->dst.offset || /* "+<reg>.<byte>" suffix: non-zero offset, or a VGRF that is not written in full */
+ (inst->dst.file == VGRF &&
+ alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
+ const unsigned reg_size = (inst->dst.file == UNIFORM ? 16 : REG_SIZE);
+ fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
+ inst->dst.offset % reg_size);
+ }
+ if (inst->dst.writemask != WRITEMASK_XYZW) { /* a full writemask is left implicit */
+ fprintf(file, ".");
+ if (inst->dst.writemask & 1)
+ fprintf(file, "x");
+ if (inst->dst.writemask & 2)
+ fprintf(file, "y");
+ if (inst->dst.writemask & 4)
+ fprintf(file, "z");
+ if (inst->dst.writemask & 8)
+ fprintf(file, "w");
+ }
+ fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));
+
+ if (inst->src[0].file != BAD_FILE)
+ fprintf(file, ", ");
+
+ for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) { /* sources, up to the first BAD_FILE one */
+ if (inst->src[i].negate)
+ fprintf(file, "-");
+ if (inst->src[i].abs)
+ fprintf(file, "|"); /* opening bar; the closing one is printed after the swizzle */
+ switch (inst->src[i].file) {
+ case VGRF:
+ fprintf(file, "vgrf%d", inst->src[i].nr);
+ break;
+ case FIXED_GRF:
+ fprintf(file, "g%d.%d", inst->src[i].nr, inst->src[i].subnr);
+ break;
+ case ATTR:
+ fprintf(file, "attr%d", inst->src[i].nr);
+ break;
+ case UNIFORM:
+ fprintf(file, "u%d", inst->src[i].nr);
+ break;
+ case IMM: /* immediates are printed by value, with a type suffix */
+ switch (inst->src[i].type) {
+ case BRW_REGISTER_TYPE_F:
+ fprintf(file, "%fF", inst->src[i].f);
+ break;
+ case BRW_REGISTER_TYPE_DF:
+ fprintf(file, "%fDF", inst->src[i].df);
+ break;
+ case BRW_REGISTER_TYPE_D:
+ fprintf(file, "%dD", inst->src[i].d);
+ break;
+ case BRW_REGISTER_TYPE_UD:
+ fprintf(file, "%uU", inst->src[i].ud);
+ break;
+ case BRW_REGISTER_TYPE_VF: /* four 8-bit fields of ud, each decoded with brw_vf_to_float() */
+ fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
+ brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
+ brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
+ brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
+ brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
+ break;
+ default:
+ fprintf(file, "???");
+ break;
+ }
+ break;
+ case ARF:
+ switch (inst->src[i].nr) {
+ case BRW_ARF_NULL:
+ fprintf(file, "null");
+ break;
+ case BRW_ARF_ADDRESS:
+ fprintf(file, "a0.%d", inst->src[i].subnr);
+ break;
+ case BRW_ARF_ACCUMULATOR:
+ fprintf(file, "acc%d", inst->src[i].subnr);
+ break;
+ case BRW_ARF_FLAG:
+ fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+ break;
+ default:
+ fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+ break;
+ }
+ break;
+ case BAD_FILE: /* excluded by the loop condition above */
+ fprintf(file, "(null)");
+ break;
+ case MRF: /* not expected as a source */
+ unreachable("not reached");
+ }
+
+ if (inst->src[i].offset || /* same "+<reg>.<byte>" suffix as for the destination; uniforms use 16-byte units */
+ (inst->src[i].file == VGRF &&
+ alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
+ const unsigned reg_size = (inst->src[i].file == UNIFORM ? 16 : REG_SIZE);
+ fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
+ inst->src[i].offset % reg_size);
+ }
+
+ if (inst->src[i].file != IMM) { /* four-letter swizzle, always printed for non-immediates */
+ static const char *chans[4] = {"x", "y", "z", "w"};
+ fprintf(file, ".");
+ for (int c = 0; c < 4; c++) {
+ fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
+ }
+ }
+
+ if (inst->src[i].abs)
+ fprintf(file, "|");
+
+ if (inst->src[i].file != IMM) {
+ fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
+ }
+
+ if (i < 2 && inst->src[i + 1].file != BAD_FILE) /* separator only when another source follows */
+ fprintf(file, ", ");
+ }
+
+ if (inst->force_writemask_all)
+ fprintf(file, " NoMask");
+
+ if (inst->exec_size != 8) /* channel group is only printed for non-8-wide instructions */
+ fprintf(file, " group%d", inst->group);
+
+ fprintf(file, "\n");
+ }
+
+
+ /**
+  * Build the hardware GRF region for attribute slot \p attr.
+  *
+  * The region width is REG_SIZE / 2 / MAX2(4, type_sz(type)).  When
+  * \p interleaved is set, two attributes share a register: attribute
+  * \p attr lives in register attr / 2 at sub-register (attr % 2) * 4 and is
+  * read with a vertical stride of 0.  Otherwise the attribute is register
+  * \p attr itself.  The returned register has its type set to \p type.
+  */
+ static inline struct brw_reg
+ attribute_to_hw_reg(int attr, brw_reg_type type, bool interleaved)
+ {
+    const unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(type));
+
+    struct brw_reg reg = interleaved ?
+       stride(brw_vecn_grf(width, attr / 2, (attr % 2) * 4), 0, width, 1) :
+       brw_vecn_grf(width, attr, 0);
+
+    reg.type = type;
+    return reg;
+ }
+
+
+/**
+ * Replace each register of type ATTR in this->instructions with a reference
+ * to a fixed HW register.
+ *
+ * If interleaved is true, then each attribute takes up half a register, with
+ * register N containing attribute 2*N in its first half and attribute 2*N+1
+ * in its second half (this corresponds to the payload setup used by geometry
+ * shaders in "single" or "dual instanced" dispatch mode). If interleaved is
+ * false, then each attribute takes up a whole register, with register N
+ * containing attribute N (this corresponds to the payload setup used by
+ * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
+ */
+void
+vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
+ bool interleaved)
+{
+ foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].file != ATTR)
+ continue;
+
+ int grf = attribute_map[inst->src[i].nr +
+ inst->src[i].offset / REG_SIZE];
+ assert(inst->src[i].offset % REG_SIZE == 0);
+
+ /* All attributes used in the shader need to have been assigned a
+ * hardware register by the caller
+ */
+ assert(grf != 0);
+
+ struct brw_reg reg =
+ attribute_to_hw_reg(grf, inst->src[i].type, interleaved);
+ reg.swizzle = inst->src[i].swizzle;
+ if (inst->src[i].abs)
+ reg = brw_abs(reg);
+ if (inst->src[i].negate)
+ reg = negate(reg);
+
+ inst->src[i] = reg;
+ }
+ }
+}
+
+ /**
+  * Assign payload registers to the vertex shader inputs, starting at
+  * \p payload_reg, and lower all ATTR sources to those registers.
+  *
+  * Inputs are numbered in ascending bit order of inputs_read; an input that
+  * is also flagged in double_inputs_read takes two consecutive slots.  Two
+  * extra map entries follow the regular attributes: VERT_ATTRIB_MAX for the
+  * vertex/instance/base-vertex/base-instance values and VERT_ATTRIB_MAX + 1
+  * for the draw id.
+  *
+  * \return payload_reg + vs_prog_data->nr_attribute_slots.
+  */
+ int
+ vec4_vs_visitor::setup_attributes(int payload_reg)
+ {
+    int attribute_map[VERT_ATTRIB_MAX + 2];
+    memset(attribute_map, 0, sizeof(attribute_map));
+
+    int nr_attributes = 0;
+
+    for (GLbitfield64 pending = vs_prog_data->inputs_read; pending; ) {
+       const GLuint first = ffsll(pending) - 1;
+       const bool is_double =
+          (vs_prog_data->double_inputs_read & BITFIELD64_BIT(first)) != 0;
+       const int needed_slots = is_double ? 2 : 1;
+
+       for (int c = 0; c < needed_slots; c++) {
+          attribute_map[first + c] = payload_reg + nr_attributes;
+          nr_attributes++;
+          pending &= ~BITFIELD64_BIT(first + c);
+       }
+    }
+
+    /* VertexID is stored by the VF as the last vertex element.  There is no
+     * flag for it in inputs_read, so it is filed under VERT_ATTRIB_MAX.
+     */
+    const bool uses_vertex_element =
+       vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid ||
+       vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance;
+    if (uses_vertex_element) {
+       attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
+       nr_attributes++;
+    }
+
+    if (vs_prog_data->uses_drawid) {
+       attribute_map[VERT_ATTRIB_MAX + 1] = payload_reg + nr_attributes;
+       nr_attributes++;
+    }
+
+    lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);
+
+    return payload_reg + vs_prog_data->nr_attribute_slots;
+ }
+
+ /**
+  * Lay out the push constants starting at register \p reg.
+  *
+  * Records \p reg as the dispatch GRF start register, reserves one register
+  * per two uniform vec4s, and stores the resulting parameter count and CURB
+  * read length in the program data.
+  *
+  * \return the first register after the push constants.
+  */
+ int
+ vec4_visitor::setup_uniforms(int reg)
+ {
+    prog_data->base.dispatch_grf_start_reg = reg;
+
+    const bool needs_dummy_constants =
+       devinfo->gen < 6 && this->uniforms == 0;
+
+    if (!needs_dummy_constants) {
+       reg += ALIGN(uniforms, 2) / 2;
+    } else {
+       /* The pre-gen6 VS requires that some push constants get loaded no
+        * matter what, or the GPU would hang.  Upload a single all-zero vec4.
+        */
+       static gl_constant_value zero = { 0.0 };
+
+       stage_prog_data->param =
+          reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
+       for (unsigned int i = 0; i < 4; i++)
+          stage_prog_data->param[this->uniforms * 4 + i] = &zero;
+
+       this->uniforms++;
+       reg++;
+    }
+
+    stage_prog_data->nr_params = this->uniforms * 4;
+    prog_data->base.curb_read_length =
+       reg - prog_data->base.dispatch_grf_start_reg;
+
+    return reg;
+ }
+
+ /**
+  * Lay out the vertex shader thread payload: g0, then the push constants,
+  * then the attributes.  The first register past the payload is stored in
+  * first_non_payload_grf.
+  */
+ void
+ vec4_vs_visitor::setup_payload(void)
+ {
+    /* g0 always carries important data, namely the URB handles that are
+     * passed on to the URB write at the end of the thread, so the push
+     * constants always start at g1.
+     */
+    const int first_push_constant_reg = 1;
+
+    const int first_attribute_reg = setup_uniforms(first_push_constant_reg);
+
+    this->first_non_payload_grf = setup_attributes(first_attribute_reg);
+ }
+
+ /**
+  * Lower unpredicated SEL instructions into a CMP followed by a predicated
+  * SEL.  Only valid before gen6, which is asserted.
+  *
+  * The CMP is inserted in front of the SEL, compares the SEL's first two
+  * sources with the SEL's conditional modifier and writes the null register.
+  * The SEL itself is then switched to normal predication with no conditional
+  * modifier.
+  *
+  * \return true if any instruction was lowered.
+  */
+ bool
+ vec4_visitor::lower_minmax()
+ {
+    assert(devinfo->gen < 6);
+
+    bool progress = false;
+
+    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+       const vec4_builder ibld(this, block, inst);
+
+       const bool is_unpredicated_sel =
+          inst->opcode == BRW_OPCODE_SEL &&
+          inst->predicate == BRW_PREDICATE_NONE;
+       if (!is_unpredicated_sel)
+          continue;
+
+       /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
+        * the original SEL.L/GE instruction
+        */
+       ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
+                inst->conditional_mod);
+       inst->predicate = BRW_PREDICATE_NORMAL;
+       inst->conditional_mod = BRW_CONDITIONAL_NONE;
+
+       progress = true;
+    }
+
+    if (progress)
+       invalidate_live_intervals();
+
+    return progress;
+ }
+
+ /**
+  * Emit a MOV that copies the timestamp architecture register into a fresh
+  * uvec4 temporary and return that temporary as a source.
+  *
+  * Requires gen7 or later, which is asserted.
+  */
+ src_reg
+ vec4_visitor::get_timestamp()
+ {
+    assert(devinfo->gen >= 7);
+
+    struct brw_reg timestamp_arf = brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                                           BRW_ARF_TIMESTAMP,
+                                           0,
+                                           0,
+                                           0,
+                                           BRW_REGISTER_TYPE_UD,
+                                           BRW_VERTICAL_STRIDE_0,
+                                           BRW_WIDTH_4,
+                                           BRW_HORIZONTAL_STRIDE_4,
+                                           BRW_SWIZZLE_XYZW,
+                                           WRITEMASK_XYZW);
+    src_reg ts = src_reg(timestamp_arf);
+
+    dst_reg result = dst_reg(this, glsl_type::uvec4_type);
+
+    /* The 3 fields we care about (mostly field 0, but also 2) must be read
+     * even if the channel is not enabled in the dispatch, hence the forced
+     * writemask.
+     */
+    vec4_instruction *read = emit(MOV(result, ts));
+    read->force_writemask_all = true;
+
+    return src_reg(result);
+ }
+
+ /**
+  * Read the timestamp at the start of the shader and keep it in
+  * shader_start_time.  The annotation is set first so that it covers the
+  * emitted timestamp read.
+  */
+ void
+ vec4_visitor::emit_shader_time_begin()
+ {
+    this->current_annotation = "shader time start";
+    this->shader_start_time = get_timestamp();
+ }
+
+ /**
+  * Read the timestamp at the end of the shader and emit the code that
+  * accumulates the shader-time counters.
+  *
+  * If bit 0 of the Z component of the end timestamp is clear, the elapsed
+  * time (end - start - 2) is written to subindex 0 and the value 1 to
+  * subindex 1.  Otherwise the value 1 is written to subindex 2.
+  *
+  * Note that this sets the negate modifier on the shader_start_time member.
+  */
+ void
+ vec4_visitor::emit_shader_time_end()
+ {
+    current_annotation = "shader time end";
+    src_reg shader_end_time = get_timestamp();
+
+    /* Check that there weren't any timestamp reset events (assuming these
+     * were the only two timestamp reads that happened).
+     */
+    src_reg reset_flag = shader_end_time;
+    reset_flag.swizzle = BRW_SWIZZLE_ZZZZ;
+    vec4_instruction *reset_test =
+       emit(AND(dst_null_ud(), reset_flag, brw_imm_ud(1u)));
+    reset_test->conditional_mod = BRW_CONDITIONAL_Z;
+
+    emit(IF(BRW_PREDICATE_NORMAL));
+    {
+       /* elapsed = end - start */
+       shader_start_time.negate = true;
+       dst_reg elapsed = dst_reg(this, glsl_type::uint_type);
+       emit(ADD(elapsed, shader_start_time, shader_end_time));
+
+       /* Two back-to-back timestamp reads are 2 cycles apart.  Subtract
+        * that overhead so it does not show up when timing single
+        * instructions.
+        */
+       emit(ADD(elapsed, src_reg(elapsed), brw_imm_ud(-2u)));
+
+       emit_shader_time_write(0, src_reg(elapsed));
+       emit_shader_time_write(1, brw_imm_ud(1u));
+    }
+    emit(BRW_OPCODE_ELSE);
+    {
+       emit_shader_time_write(2, brw_imm_ud(1u));
+    }
+    emit(BRW_OPCODE_ENDIF);
+ }
+
+ /**
+  * Emit a SHADER_TIME_ADD message that adds \p value to the shader-time
+  * slot selected by \p shader_time_subindex.
+  *
+  * The two-register payload holds the byte offset
+  * (shader_time_index * 3 + shader_time_subindex) * BRW_SHADER_TIME_STRIDE
+  * in its first register and \p value in its second one.
+  */
+ void
+ vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
+ {
+    dst_reg payload =
+       dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));
+
+    /* First payload register: offset of the counter to update. */
+    dst_reg offset_reg = payload;
+    offset_reg.type = BRW_REGISTER_TYPE_UD;
+
+    /* Second payload register: amount to add. */
+    dst_reg time_reg = payload;
+    time_reg.offset += REG_SIZE;
+    time_reg.type = BRW_REGISTER_TYPE_UD;
+
+    const int index = shader_time_index * 3 + shader_time_subindex;
+    emit(MOV(offset_reg, brw_imm_d(index * BRW_SHADER_TIME_STRIDE)));
+    emit(MOV(time_reg, value));
+
+    vec4_instruction *add =
+       emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(payload));
+    add->mlen = 2;
+ }
+ /**
+ * Rewrite the sources and the destination of every instruction in the
+ * program as hardware registers (struct brw_reg).
+ *
+ * Sources: VGRF and UNIFORM sources are translated to GRF regions; ARF and
+ * IMM sources, as well as FIXED_GRF sources narrower than 8 bytes, are left
+ * untouched; BAD_FILE becomes the null register. Every translated source
+ * then goes through apply_logical_swizzle(). MRF and ATTR sources are not
+ * expected at this point.
+ *
+ * Destinations: VGRF and MRF destinations are translated keeping their type
+ * and writemask, ARF and FIXED_GRF are converted with as_brw_reg() and
+ * BAD_FILE becomes the null register.
+ */
+ void
+ vec4_visitor::convert_to_hw_regs()
+ {
+ foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+ for (int i = 0; i < 3; i++) {
+ struct src_reg &src = inst->src[i];
+ struct brw_reg reg;
+ switch (src.file) {
+ case VGRF: {
+ const unsigned type_size = type_sz(src.type);
+ const unsigned width = REG_SIZE / 2 / MAX2(4, type_size);
+ reg = byte_offset(brw_vecn_grf(width, src.nr, 0), src.offset);
+ reg.type = src.type;
+ reg.abs = src.abs;
+ reg.negate = src.negate;
+ break;
+ }
+
+ case UNIFORM: { /* two uniform vec4s per register, counted from dispatch_grf_start_reg; vstride 0 */
+ const unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(src.type));
+ reg = stride(byte_offset(brw_vec4_grf(
+ prog_data->base.dispatch_grf_start_reg +
+ src.nr / 2, src.nr % 2 * 4),
+ src.offset),
+ 0, width, 1);
+ reg.type = src.type;
+ reg.abs = src.abs;
+ reg.negate = src.negate;
+
+ /* This should have been moved to pull constants. */
+ assert(!src.reladdr);
+ break;
+ }
+
+ case FIXED_GRF:
+ if (type_sz(src.type) == 8) { /* only 64-bit fixed GRFs get the logical swizzle applied below */
+ reg = src.as_brw_reg();
+ break;
+ }
+ /* fallthrough */
+ case ARF:
+ case IMM:
+ continue; /* already a hardware register: skip the swizzle and the assignment below */
+
+ case BAD_FILE:
+ /* Probably unused. */
+ reg = brw_null_reg();
+ break;
+
+ case MRF:
+ case ATTR:
+ unreachable("not reached");
+ }
+
+ apply_logical_swizzle(&reg, inst, i);
+ src = reg;
+ }
+
+ if (inst->is_3src(devinfo)) {
+ /* 3-src instructions with scalar sources support arbitrary subnr,
+ * but don't actually use swizzles. Convert swizzle into subnr.
+ * Skip this for double-precision instructions: RepCtrl=1 is not
+ * allowed for them and needs special handling.
+ */
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0 &&
+ type_sz(inst->src[i].type) < 8) {
+ assert(brw_is_single_value_swizzle(inst->src[i].swizzle));
+ inst->src[i].subnr += 4 * BRW_GET_SWZ(inst->src[i].swizzle, 0); /* 4 bytes per selected component */
+ }
+ }
+ }
+
+ dst_reg &dst = inst->dst;
+ struct brw_reg reg;
+
+ switch (inst->dst.file) {
+ case VGRF:
+ reg = byte_offset(brw_vec8_grf(dst.nr, 0), dst.offset);
+ reg.type = dst.type;
+ reg.writemask = dst.writemask;
+ break;
+
+ case MRF:
+ reg = byte_offset(brw_message_reg(dst.nr), dst.offset);
+ assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
+ reg.type = dst.type;
+ reg.writemask = dst.writemask;
+ break;
+
+ case ARF:
+ case FIXED_GRF:
+ reg = dst.as_brw_reg();
+ break;
+
+ case BAD_FILE:
+ reg = brw_null_reg();
+ break;
+
+ case IMM: /* these files are not expected as a destination */
+ case ATTR:
+ case UNIFORM:
+ unreachable("not reached");
+ }
+
+ dst = reg;
+ }
+ }
+
+ /**
+  * Tell whether shader stage \p stage, running in \p dispatch_mode, receives
+  * its attributes interleaved.
+  *
+  * This is always the case for tessellation evaluation shaders, and for
+  * geometry shaders in every dispatch mode except 4x2 dual object.  All
+  * other stages report false.
+  */
+ static bool
+ stage_uses_interleaved_attributes(unsigned stage,
+                                   enum shader_dispatch_mode dispatch_mode)
+ {
+    if (stage == MESA_SHADER_TESS_EVAL)
+       return true;
+
+    if (stage == MESA_SHADER_GEOMETRY)
+       return dispatch_mode != DISPATCH_MODE_4X2_DUAL_OBJECT;
+
+    return false;
+ }
+
+/**
+ * Get the closest native SIMD width supported by the hardware for instruction
+ * \p inst. The instruction will be left untouched by
+ * vec4_visitor::lower_simd_width() if the returned value matches the
+ * instruction's original execution size.
+ */
+static unsigned
+get_lowered_simd_width(const struct gen_device_info *devinfo,
+ enum shader_dispatch_mode dispatch_mode,
+ unsigned stage, const vec4_instruction *inst)
+{
+ /* Do not split some instructions that require special handling */
+ switch (inst->opcode) {
+ case SHADER_OPCODE_GEN4_SCRATCH_READ:
+ case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+ return inst->exec_size;
+ default:
+ break;
+ }
+
+ unsigned lowered_width = MIN2(16, inst->exec_size);
+
+ /* We need to split some cases of double-precision instructions that write
+ * 2 registers. We only need to care about this in gen7 because that is the
+ * only hardware that implements fp64 in Align16.
+ */
+ if (devinfo->gen == 7 && inst->size_written > REG_SIZE) {
+ /* Align16 8-wide double-precision SEL does not work well. Verified
+ * empirically.
+ */
+ if (inst->opcode == BRW_OPCODE_SEL && type_sz(inst->dst.type) == 8)
+ lowered_width = MIN2(lowered_width, 4);
+
+ /* HSW PRM, 3D Media GPGPU Engine, Region Alignment Rules for Direct
+ * Register Addressing:
+ *
+ * "When destination spans two registers, the source MUST span two
+ * registers."
+ */
+ for (unsigned i = 0; i < 3; i++) {
+ if (inst->src[i].file == BAD_FILE)
+ continue;
+ if (inst->size_read(i) <= REG_SIZE)
+ lowered_width = MIN2(lowered_width, 4);
+
+ /* Interleaved attribute setups use a vertical stride of 0, which
+ * makes them hit the associated instruction decompression bug in gen7.
+ * Split them to prevent this.
+ */
+ if (inst->src[i].file == ATTR &&
+ stage_uses_interleaved_attributes(stage, dispatch_mode))
+ lowered_width = MIN2(lowered_width, 4);
+ }
+ }
+
+ return lowered_width;
+}
+
+ /**
+  * Tell whether the bytes written by \p inst overlap the bytes read by any of
+  * its sources that live in the same register (same file and number) as the
+  * destination.
+  *
+  * An instruction that writes nothing never overlaps.
+  */
+ static bool
+ dst_src_regions_overlap(vec4_instruction *inst)
+ {
+    if (inst->size_written == 0)
+       return false;
+
+    /* Inclusive byte range written within the destination register. */
+    const unsigned dst_first = inst->dst.offset;
+    const unsigned dst_last = dst_first + inst->size_written - 1;
+
+    for (int s = 0; s < 3; s++) {
+       const src_reg &src = inst->src[s];
+
+       if (src.file == BAD_FILE ||
+           src.file != inst->dst.file ||
+           src.nr != inst->dst.nr)
+          continue;
+
+       /* Inclusive byte range read from this source. */
+       const unsigned src_first = src.offset;
+       const unsigned src_last = src_first + inst->size_read(s) - 1;
+
+       const bool dst_first_in_src =
+          dst_first >= src_first && dst_first <= src_last;
+       const bool dst_last_in_src =
+          dst_last >= src_first && dst_last <= src_last;
+       const bool dst_covers_src =
+          dst_first <= src_first && dst_last >= src_last;
+
+       if (dst_first_in_src || dst_last_in_src || dst_covers_src)
+          return true;
+    }
+
+    return false;
+ }
+ /**
+ * Split every instruction whose execution size exceeds the width reported
+ * by get_lowered_simd_width() into exec_size / lowered_width narrower
+ * copies, inserted in front of the original, which is then removed.
+ *
+ * Returns true if any instruction was split; live intervals are invalidated
+ * in that case.
+ */
+ bool
+ vec4_visitor::lower_simd_width()
+ {
+ bool progress = false;
+
+ foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+ const unsigned lowered_width =
+ get_lowered_simd_width(devinfo, prog_data->dispatch_mode, stage, inst);
+ assert(lowered_width <= inst->exec_size);
+ if (lowered_width == inst->exec_size)
+ continue;
+
+ /* We need to deal with source / destination overlaps when splitting.
+ * The hardware supports reading from and writing to the same register
+ * in the same instruction, but we need to be careful that each split
+ * instruction we produce does not corrupt the source of the next.
+ *
+ * The easiest way to handle this is to make the split instructions write
+ * to temporaries if there is an src/dst overlap and then move from the
+ * temporaries to the original destination. We also need to consider
+ * instructions that do partial writes via align1 opcodes, in which case
+ * we need to make sure that we initialize the temporary with the
+ * value of the instruction's dst.
+ */
+ bool needs_temp = dst_src_regions_overlap(inst);
+ for (unsigned n = 0; n < inst->exec_size / lowered_width; n++) {
+ unsigned channel_offset = lowered_width * n; /* first channel handled by this piece */
+
+ unsigned size_written = lowered_width * type_sz(inst->dst.type); /* bytes written by this piece */
+
+ /* Create the split instruction from the original so that we copy all
+ * relevant instruction fields, then set the width and calculate the
+ * new dst/src regions.
+ */
+ vec4_instruction *linst = new(mem_ctx) vec4_instruction(*inst);
+ linst->exec_size = lowered_width;
+ linst->group = channel_offset;
+ linst->size_written = size_written;
+
+ /* Compute split dst region: a fresh VGRF temporary when source and
+ * destination overlap, otherwise the matching slice of the original
+ * destination.
+ */
+ dst_reg dst;
+ if (needs_temp) {
+ unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE);
+ dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)),
+ inst->dst.type);
+ if (inst->is_align1_partial_write()) { /* seed the temporary with the current dst contents */
+ vec4_instruction *copy = MOV(dst, src_reg(inst->dst));
+ copy->exec_size = lowered_width;
+ copy->group = channel_offset;
+ copy->size_written = size_written;
+ inst->insert_before(block, copy);
+ }
+ } else {
+ dst = horiz_offset(inst->dst, channel_offset);
+ }
+ linst->dst = dst;
+
+ /* Compute split source regions; uniform sources are not offset. */
+ for (int i = 0; i < 3; i++) {
+ if (linst->src[i].file == BAD_FILE)
+ continue;
+
+ if (!is_uniform(linst->src[i]))
+ linst->src[i] = horiz_offset(linst->src[i], channel_offset);
+ }
+
+ inst->insert_before(block, linst);
+
+ /* If we used a temporary to store the result of the split
+ * instruction, copy the result to the original destination. The
+ * copy carries the predicate of the original instruction.
+ */
+ if (needs_temp) {
+ vec4_instruction *mov =
+ MOV(offset(inst->dst, lowered_width, n), src_reg(dst));
+ mov->exec_size = lowered_width;
+ mov->group = channel_offset;
+ mov->size_written = size_written;
+ mov->predicate = inst->predicate;
+ inst->insert_before(block, mov);
+ }
+ }
+
+ inst->remove(block);
+ progress = true;
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+ }
+
+ /**
+  * Tell whether \p inst is one of the double-precision helper opcodes that
+  * operate in Align1 mode: the double conversions and the low/high 32-bit
+  * pick and set operations.
+  */
+ static bool
+ is_align1_df(vec4_instruction *inst)
+ {
+    return inst->opcode == VEC4_OPCODE_FROM_DOUBLE ||
+           inst->opcode == VEC4_OPCODE_TO_DOUBLE ||
+           inst->opcode == VEC4_OPCODE_PICK_LOW_32BIT ||
+           inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT ||
+           inst->opcode == VEC4_OPCODE_SET_LOW_32BIT ||
+           inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT;
+ }
+
+ /**
+  * Translate \p predicate for an instruction that has been reduced to the
+  * single channel named by \p writemask.
+  *
+  * BRW_PREDICATE_NORMAL becomes the ALIGN16 "replicate" predicate of that
+  * channel; every other predicate is returned unchanged.  \p writemask must
+  * select exactly one channel when the predicate is NORMAL.
+  */
+ static brw_predicate
+ scalarize_predicate(brw_predicate predicate, unsigned writemask)
+ {
+    if (predicate != BRW_PREDICATE_NORMAL)
+       return predicate;
+
+    if (writemask == WRITEMASK_X)
+       return BRW_PREDICATE_ALIGN16_REPLICATE_X;
+    if (writemask == WRITEMASK_Y)
+       return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+    if (writemask == WRITEMASK_Z)
+       return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+    if (writemask == WRITEMASK_W)
+       return BRW_PREDICATE_ALIGN16_REPLICATE_W;
+
+    unreachable("invalid writemask");
+ }
+
+ /* Gen7 has a hardware decompression bug that we can exploit to represent a
+  * handful of additional swizzles natively.  This returns true when source
+  * \p arg of \p inst uses one of them: the four single-component swizzles
+  * plus XYXY, YXYX, ZWZW and WZWZ.
+  */
+ static bool
+ is_gen7_supported_64bit_swizzle(vec4_instruction *inst, unsigned arg)
+ {
+    const unsigned swizzle = inst->src[arg].swizzle;
+
+    return swizzle == BRW_SWIZZLE_XXXX ||
+           swizzle == BRW_SWIZZLE_YYYY ||
+           swizzle == BRW_SWIZZLE_ZZZZ ||
+           swizzle == BRW_SWIZZLE_WWWW ||
+           swizzle == BRW_SWIZZLE_XYXY ||
+           swizzle == BRW_SWIZZLE_YXYX ||
+           swizzle == BRW_SWIZZLE_ZWZW ||
+           swizzle == BRW_SWIZZLE_WZWZ;
+ }
+
+ /* Tell whether the 64-bit source \p arg of \p inst can be read natively,
+  * i.e. without scalarizing the instruction.
+  *
+  * 64-bit sources use regions with a width of 2.  The 2 elements of each row
+  * can be addressed with 32-bit swizzles (which is all the hardware offers),
+  * but this couples the swizzle applied to the first two components of a
+  * dvec4 with the one applied to the last two.  As a result only a few
+  * swizzle combinations are natively supported.
+  *
+  * FIXME: we can go a step further and implement even more swizzle
+  * variations using only partial scalarization.
+  *
+  * For more details see:
+  * https://bugs.freedesktop.org/show_bug.cgi?id=92760#c82
+  */
+ bool
+ vec4_visitor::is_supported_64bit_region(vec4_instruction *inst, unsigned arg)
+ {
+    const src_reg &src = inst->src[arg];
+    assert(type_sz(src.type) == 8);
+
+    /* Uniform regions have a vstride of 0.  With the 2-wide rows used for
+     * 64-bit regions this means components Z/W cannot be reached, so any
+     * swizzle touching them is unsupported.  Interleaved attributes are also
+     * mapped to GRF registers with a vstride of 0 and get the same treatment.
+     */
+    const bool has_zero_vstride =
+       is_uniform(src) ||
+       (stage_uses_interleaved_attributes(stage, prog_data->dispatch_mode) &&
+        src.file == ATTR);
+    const bool reads_z_or_w = (brw_mask_for_swizzle(src.swizzle) & 12) != 0;
+
+    if (has_zero_vstride && reads_z_or_w)
+       return false;
+
+    if (src.swizzle == BRW_SWIZZLE_XYZW ||
+        src.swizzle == BRW_SWIZZLE_XXZZ ||
+        src.swizzle == BRW_SWIZZLE_YYWW ||
+        src.swizzle == BRW_SWIZZLE_YXWZ)
+       return true;
+
+    return devinfo->gen == 7 && is_gen7_supported_64bit_swizzle(inst, arg);
+ }
+ /**
+ * Scalarize double-precision instructions whose regioning cannot be
+ * expressed natively.
+ *
+ * An instruction counts as double-precision when its destination or any of
+ * its sources has an 8-byte type. Align1 double-precision helpers (see
+ * is_align1_df()) are skipped. Each instruction that needs lowering is
+ * replaced by one copy per enabled writemask channel, with every source
+ * swizzle reduced to the single component feeding that channel.
+ *
+ * Returns true if any instruction was scalarized; live intervals are
+ * invalidated in that case.
+ */
+ bool
+ vec4_visitor::scalarize_df()
+ {
+ bool progress = false;
+
+ foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+ /* Skip DF instructions that operate in Align1 mode */
+ if (is_align1_df(inst))
+ continue;
+
+ /* Check if this is a double-precision instruction */
+ bool is_double = type_sz(inst->dst.type) == 8;
+ for (int arg = 0; !is_double && arg < 3; arg++) {
+ is_double = inst->src[arg].file != BAD_FILE &&
+ type_sz(inst->src[arg].type) == 8;
+ }
+
+ if (!is_double)
+ continue;
+
+ /* Skip the lowering for specific regioning scenarios that we can
+ * support natively.
+ */
+ bool skip_lowering = true;
+
+ /* XY and ZW writemasks operate in 32-bit, which means that they don't
+ * have a native 64-bit representation and they should always be split.
+ * For any other writemask, every 64-bit source must use a supported
+ * region for the instruction to be left alone.
+ */
+ if (inst->dst.writemask == WRITEMASK_XY ||
+ inst->dst.writemask == WRITEMASK_ZW) {
+ skip_lowering = false;
+ } else {
+ for (unsigned i = 0; i < 3; i++) {
+ if (inst->src[i].file == BAD_FILE || type_sz(inst->src[i].type) < 8)
+ continue;
+ skip_lowering = skip_lowering && is_supported_64bit_region(inst, i);
+ }
+ }
+
+ if (skip_lowering)
+ continue;
+
+ /* Generate scalar instructions for each enabled channel */
+ for (unsigned chan = 0; chan < 4; chan++) {
+ unsigned chan_mask = 1 << chan;
+ if (!(inst->dst.writemask & chan_mask))
+ continue;
+
+ vec4_instruction *scalar_inst = new(mem_ctx) vec4_instruction(*inst);
+
+ for (unsigned i = 0; i < 3; i++) { /* replicate the component that feeds this channel */
+ unsigned swz = BRW_GET_SWZ(inst->src[i].swizzle, chan);
+ scalar_inst->src[i].swizzle = BRW_SWIZZLE4(swz, swz, swz, swz);
+ }
+
+ scalar_inst->dst.writemask = chan_mask;
+
+ if (inst->predicate != BRW_PREDICATE_NONE) { /* NORMAL predicates become per-channel replicate predicates */
+ scalar_inst->predicate =
+ scalarize_predicate(inst->predicate, chan_mask);
+ }
+
+ inst->insert_before(block, scalar_inst);
+ }
+
+ inst->remove(block);
+ progress = true;
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+ }
+
+/* Replace every MAD with a 64-bit destination by a MUL into a temporary
+ * followed by an ADD.  Returns true if any instruction was replaced.
+ */
+bool
+vec4_visitor::lower_64bit_mad_to_mul_add()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      const bool is_64bit_mad = inst->opcode == BRW_OPCODE_MAD &&
+                                type_sz(inst->dst.type) == 8;
+      if (!is_64bit_mad)
+         continue;
+
+      /* Temporary that receives src1 * src2. */
+      dst_reg product(this, glsl_type::dvec4_type);
+
+      /* Both replacement instructions are copy-constructed from the MAD so
+       * that they inherit all of its relevant instruction fields; only the
+       * opcode and operands are rewritten afterwards.
+       */
+      vec4_instruction *mul_inst = new(mem_ctx) vec4_instruction(*inst);
+      mul_inst->opcode = BRW_OPCODE_MUL;
+      mul_inst->src[0] = inst->src[1];
+      mul_inst->src[1] = inst->src[2];
+      mul_inst->src[2].file = BAD_FILE;
+      mul_inst->dst = product;
+
+      /* The ADD keeps the MAD destination: dst = product + src0. */
+      vec4_instruction *add_inst = new(mem_ctx) vec4_instruction(*inst);
+      add_inst->opcode = BRW_OPCODE_ADD;
+      add_inst->src[0] = src_reg(product);
+      add_inst->src[1] = inst->src[0];
+      add_inst->src[2].file = BAD_FILE;
+
+      inst->insert_before(block, mul_inst);
+      inst->insert_before(block, add_inst);
+      inst->remove(block);
+
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/* Align16 hardware swizzles select 32-bit channels, whereas the Vec4 IR uses
+ * logical 64-bit swizzle channels for 64-bit operands.  This helper converts
+ * the logical swizzle of one source into the 32-bit swizzle the hardware
+ * register needs.
+ *
+ * @inst and @arg select the Vec4 IR source operand whose swizzle is being
+ * translated; the resulting hardware swizzle is written to @hw_reg.
+ *
+ * Align16/DF instructions are assumed to have been fully scalarized before
+ * this runs, so any given Vec4 IR source has just one 64-bit swizzle channel
+ * to deal with.
+ */
+void
+vec4_visitor::apply_logical_swizzle(struct brw_reg *hw_reg,
+                                    vec4_instruction *inst, int arg)
+{
+   src_reg reg = inst->src[arg];
+
+   /* Unused sources and immediates are left untouched. */
+   if (reg.file == BAD_FILE || reg.file == BRW_IMMEDIATE_VALUE)
+      return;
+
+   /* Operands narrower than 64 bits, and any operand of an Align1 DF
+    * instruction, use the IR swizzle as is.
+    */
+   const bool needs_translation =
+      type_sz(reg.type) >= 8 && !is_align1_df(inst);
+   if (!needs_translation) {
+      hw_reg->swizzle = reg.swizzle;
+      return;
+   }
+
+   /* From here on the 64-bit logical swizzle is translated to 32-bit. */
+   assert(brw_is_single_value_swizzle(reg.swizzle) ||
+          is_supported_64bit_region(inst, arg));
+
+   unsigned chan0 = BRW_GET_SWZ(reg.swizzle, 0);
+   unsigned chan1 = BRW_GET_SWZ(reg.swizzle, 1);
+
+   /* Supported 64-bit swizzles (other than the gen7-specific ones) are those
+    * whose first two components, once expanded to 32-bit swizzles, match the
+    * semantics of the original 64-bit swizzle with 2-wide row regioning.
+    * They need no adjustment before the expansion below.
+    */
+   const bool plain_2wide_region =
+      is_supported_64bit_region(inst, arg) &&
+      !is_gen7_supported_64bit_swizzle(inst, arg);
+
+   if (!plain_2wide_region) {
+      /* What remains is one of:
+       *
+       * 1. An unsupported swizzle, which should be single-value thanks to
+       *    the scalarization pass.
+       *
+       * 2. A gen7 supported swizzle.  These can be single-value or
+       *    double-value swizzles; the latter never cross dvec2 channels.
+       *    They always need the gen7 vstride=0 exploit.
+       */
+      assert((chan0 < 2) == (chan1 < 2));
+
+      /* Z/W are reached by selecting the second half of the register and
+       * then using an X/Y swizzle to pick Z/W respectively.
+       */
+      if (chan0 >= 2) {
+         *hw_reg = suboffset(*hw_reg, 2);
+         chan0 -= 2;
+         chan1 -= 2;
+      }
+
+      /* Every gen7-specific supported swizzle requires the vstride=0
+       * exploit.
+       */
+      if (devinfo->gen == 7 && is_gen7_supported_64bit_swizzle(inst, arg))
+         hw_reg->vstride = BRW_VERTICAL_STRIDE_0;
+
+      /* A 64-bit source with an offset at 16B is meant to address the second
+       * half of a register and needs a vertical stride of 0 so that we:
+       *
+       * 1. Don't violate register region restrictions.
+       * 2. Activate the gen7 instruction decompression bug exploit when
+       *    execsize > 4
+       */
+      if (hw_reg->subnr % REG_SIZE == 16) {
+         assert(devinfo->gen == 7);
+         hw_reg->vstride = BRW_VERTICAL_STRIDE_0;
+      }
+   }
+
+   /* Each logical 64-bit channel N maps to the 32-bit channel pair
+    * (2N, 2N + 1).
+    */
+   hw_reg->swizzle = BRW_SWIZZLE4(chan0 * 2, chan0 * 2 + 1,
+                                  chan1 * 2, chan1 * 2 + 1);
+}
+
+/**
+ * Drive the vec4 back-end for this shader: emit the IR (emit_nir_code()),
+ * run the optimization and lowering passes, set up the payload, allocate
+ * registers, schedule, and convert to hardware registers.
+ *
+ * Returns false as soon as the `failed' flag is found set at one of the
+ * checkpoints below, and !failed at the end.
+ */
+bool
+vec4_visitor::run()
+{
+ if (shader_time_index >= 0)
+ emit_shader_time_begin();
+
+ emit_prolog();
+
+ emit_nir_code();
+ if (failed)
+ return false;
+ base_ir = NULL;
+
+ emit_thread_end();
+
+ calculate_cfg();
+
+ /* Before any optimization, push array accesses out to scratch
+ * space where we need them to be. This pass may allocate new
+ * virtual GRFs, so we want to do it early. It also makes sure
+ * that we have reladdr computations available for CSE, since we'll
+ * often do repeated subexpressions for those.
+ */
+ move_grf_array_access_to_scratch();
+ move_uniform_array_access_to_pull_constants();
+
+ pack_uniform_registers();
+ move_push_constants_to_pull_constants();
+ split_virtual_grfs();
+
+ /* OPT(pass, args...) runs one pass and evaluates to whether that pass made
+ * progress. It bumps pass_num, ORs the result into the local `progress',
+ * and, when INTEL_DEBUG has DEBUG_OPTIMIZER set and the pass made progress,
+ * dumps the instructions to a file named after the stage, shader name,
+ * iteration, pass number and pass. It relies on the locals `progress',
+ * `iteration' and `pass_num' declared further down.
+ */
+#define OPT(pass, args...) ({ \
+ pass_num++; \
+ bool this_progress = pass(args); \
+ \
+ if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
+ char filename[64]; \
+ snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass, \
+ stage_abbrev, nir->info->name, iteration, pass_num); \
+ \
+ backend_shader::dump_instructions(filename); \
+ } \
+ \
+ progress = progress || this_progress; \
+ this_progress; \
+ })
+
+
+ /* With DEBUG_OPTIMIZER, also dump the program before any OPT() pass. */
+ if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
+ char filename[64];
+ snprintf(filename, 64, "%s-%s-00-00-start",
+ stage_abbrev, nir->info->name);
+
+ backend_shader::dump_instructions(filename);
+ }
+
+ bool progress;
+ int iteration = 0;
+ int pass_num = 0;
+ /* Main optimization loop: repeat the whole pass list until one full
+ * iteration reports no progress.
+ */
+ do {
+ progress = false;
+ pass_num = 0;
+ iteration++;
+
+ OPT(opt_predicated_break, this);
+ OPT(opt_reduce_swizzle);
+ OPT(dead_code_eliminate);
+ OPT(dead_control_flow_eliminate, this);
+ OPT(opt_copy_propagation);
+ OPT(opt_cmod_propagation);
+ OPT(opt_cse);
+ OPT(opt_algebraic);
+ OPT(opt_register_coalesce);
+ OPT(eliminate_find_live_channel);
+ } while (progress);
+
+ pass_num = 0;
+
+ /* The passes below run once each; clean-up passes are only re-run when the
+ * pass guarding them made progress.
+ */
+ if (OPT(opt_vector_float)) {
+ OPT(opt_cse);
+ OPT(opt_copy_propagation, false);
+ OPT(opt_copy_propagation, true);
+ OPT(dead_code_eliminate);
+ }
+
+ if (devinfo->gen <= 5 && OPT(lower_minmax)) {
+ OPT(opt_cmod_propagation);
+ OPT(opt_cse);
+ OPT(opt_copy_propagation);
+ OPT(dead_code_eliminate);
+ }
+
+ if (OPT(lower_simd_width)) {
+ OPT(opt_copy_propagation);
+ OPT(dead_code_eliminate);
+ }
+
+ if (failed)
+ return false;
+
+ OPT(lower_64bit_mad_to_mul_add);
+
+ /* Run this before payload setup because tessellation shaders
+ * rely on it to prevent cross dvec2 regioning on DF attributes
+ * that are setup so that XY are on the second half of register and
+ * ZW are in the first half of the next.
+ */
+ OPT(scalarize_df);
+
+ setup_payload();
+
+ if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) {
+ /* Debug of register spilling: Go spill everything. */
+ /* The register count is captured up front and used as the loop bound, so
+ * only the registers that exist at this point are visited.
+ */
+ const int grf_count = alloc.count;
+ float spill_costs[alloc.count];
+ bool no_spill[alloc.count];
+ evaluate_spill_costs(spill_costs, no_spill);
+ for (int i = 0; i < grf_count; i++) {
+ if (no_spill[i])
+ continue;
+ spill_reg(i);
+ }
+
+ /* We want to run this after spilling because 64-bit (un)spills need to
+ * emit code to shuffle 64-bit data for the 32-bit scratch read/write
+ * messages that can produce unsupported 64-bit swizzle regions.
+ */
+ OPT(scalarize_df);
+ }
+
+ bool allocated_without_spills = reg_allocate();
+
+ if (!allocated_without_spills) {
+ compiler->shader_perf_log(log_data,
+ "%s shader triggered register spilling. "
+ "Try reducing the number of live vec4 values "
+ "to improve performance.\n",
+ stage_name);
+
+ /* Keep retrying register allocation until it succeeds, bailing out if
+ * the compile has been marked as failed in the meantime.
+ */
+ while (!reg_allocate()) {
+ if (failed)
+ return false;
+ }
+
+ /* We want to run this after spilling because 64-bit (un)spills need to
+ * emit code to shuffle 64-bit data for the 32-bit scratch read/write
+ * messages that can produce unsupported 64-bit swizzle regions.
+ */
+ OPT(scalarize_df);
+ }
+
+ opt_schedule_instructions();
+
+ opt_set_dependency_control();
+
+ convert_to_hw_regs();
+
+ /* last_scratch is counted in register-sized units, hence the scaling by
+ * REG_SIZE before computing the scratch size to report.
+ */
+ if (last_scratch > 0) {
+ prog_data->base.total_scratch =
+ brw_get_scratch_size(last_scratch * REG_SIZE);
+ }
+
+ return !failed;
+}
+
+} /* namespace brw */
+
+extern "C" {
+
+/**
+ * Compile a vertex shader.
+ *
+ * The incoming NIR is cloned and lowered, the attribute and URB layout
+ * fields of \p prog_data are filled in, and code is then generated with the
+ * scalar (SIMD8) back-end when the compiler selects it for the vertex stage.
+ * The vec4 back-end is used whenever no assembly has been produced yet.
+ *
+ * Returns the final assembly; the program's size is reported through
+ * \p final_assembly_size.  If a back-end visitor fails, NULL is returned and,
+ * when \p error_str is non-NULL, it receives a copy of the failure message.
+ */
+const unsigned *
+brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_vs_prog_key *key,
+               struct brw_vs_prog_data *prog_data,
+               const nir_shader *src_shader,
+               gl_clip_plane *clip_planes,
+               bool use_legacy_snorm_formula,
+               int shader_time_index,
+               unsigned *final_assembly_size,
+               char **error_str)
+{
+   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
+
+   /* All lowering happens on a private clone of the caller's shader. */
+   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, is_scalar);
+   brw_nir_lower_vs_inputs(shader, is_scalar,
+                           use_legacy_snorm_formula, key->gl_attrib_wa_flags);
+   brw_nir_lower_vue_outputs(shader, is_scalar);
+   shader = brw_postprocess_nir(shader, compiler, is_scalar);
+
+   /* The cull distance bits sit right above the clip distance bits. */
+   const unsigned num_clip = shader->info->clip_distance_array_size;
+   const unsigned num_cull = shader->info->cull_distance_array_size;
+   prog_data->base.clip_distance_mask = (1 << num_clip) - 1;
+   prog_data->base.cull_distance_mask = ((1 << num_cull) - 1) << num_clip;
+
+   unsigned nr_attribute_slots = _mesa_bitcount_64(prog_data->inputs_read);
+
+   const uint64_t sysvals_read = shader->info->system_values_read;
+
+   /* gl_VertexID and gl_InstanceID are system values, but arrive via an
+    * incoming vertex attribute, which costs one extra slot.
+    */
+   const uint64_t vertex_instance_sysvals =
+      BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) |
+      BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
+      BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
+      BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID);
+   if (sysvals_read & vertex_instance_sysvals)
+      nr_attribute_slots++;
+
+   /* gl_DrawID gets a vec4 of its own. */
+   if (sysvals_read & BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID))
+      nr_attribute_slots++;
+
+   const unsigned nr_double_inputs =
+      _mesa_bitcount_64(shader->info->double_inputs_read);
+   const unsigned nr_attributes =
+      nr_attribute_slots - DIV_ROUND_UP(nr_double_inputs, 2);
+
+   /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
+    * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
+    * vec4 mode, the hardware appears to wedge unless we read something, so
+    * at least one slot is always read there.
+    */
+   const unsigned slots_to_read =
+      is_scalar ? nr_attribute_slots : MAX2(nr_attribute_slots, 1);
+   prog_data->base.urb_read_length = DIV_ROUND_UP(slots_to_read, 2);
+
+   prog_data->nr_attributes = nr_attributes;
+   prog_data->nr_attribute_slots = nr_attribute_slots;
+
+   /* Vertex shaders reuse the same VUE entry for inputs and outputs
+    * (overwriting the original contents), so the entry has to be as large as
+    * the bigger of the two.
+    */
+   const unsigned vue_entries =
+      MAX2(nr_attribute_slots, (unsigned)prog_data->base.vue_map.num_slots);
+
+   /* Gen6 divides the entry count by 8, every other generation by 4. */
+   prog_data->base.urb_entry_size =
+      DIV_ROUND_UP(vue_entries, compiler->devinfo->gen == 6 ? 8 : 4);
+
+   if (INTEL_DEBUG & DEBUG_VS) {
+      fprintf(stderr, "VS Output ");
+      brw_print_vue_map(stderr, &prog_data->base.vue_map);
+   }
+
+   const unsigned *assembly = NULL;
+
+   if (is_scalar) {
+      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+
+      fs_visitor visitor(compiler, log_data, mem_ctx, key,
+                         &prog_data->base.base,
+                         NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */
+                         shader, 8, shader_time_index);
+      if (!visitor.run_vs(clip_planes)) {
+         if (error_str)
+            *error_str = ralloc_strdup(mem_ctx, visitor.fail_msg);
+
+         return NULL;
+      }
+
+      prog_data->base.base.dispatch_grf_start_reg = visitor.payload.num_regs;
+
+      fs_generator generator(compiler, log_data, mem_ctx, (void *) key,
+                             &prog_data->base.base,
+                             visitor.promoted_constants,
+                             visitor.runtime_check_aads_emit,
+                             MESA_SHADER_VERTEX);
+      if (INTEL_DEBUG & DEBUG_VS) {
+         const char *label =
+            shader->info->label ? shader->info->label : "unnamed";
+         const char *debug_name =
+            ralloc_asprintf(mem_ctx, "%s vertex shader %s",
+                            label, shader->info->name);
+
+         generator.enable_debug(debug_name);
+      }
+      generator.generate_code(visitor.cfg, 8);
+      assembly = generator.get_assembly(final_assembly_size);
+   }
+
+   /* Reached when the scalar back-end is not selected for this stage, or
+    * when it did not hand back any assembly.
+    */
+   if (!assembly) {
+      prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
+
+      vec4_vs_visitor visitor(compiler, log_data, key, prog_data,
+                              shader, clip_planes, mem_ctx,
+                              shader_time_index, use_legacy_snorm_formula);
+      if (!visitor.run()) {
+         if (error_str)
+            *error_str = ralloc_strdup(mem_ctx, visitor.fail_msg);
+
+         return NULL;
+      }
+
+      assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
+                                            shader, &prog_data->base,
+                                            visitor.cfg,
+                                            final_assembly_size);
+   }
+
+   return assembly;
+}
+
+} /* extern "C" */
diff --git a/src/intel/compiler/brw_vec4.h b/src/intel/compiler/brw_vec4.h
new file mode 100644
index 00000000000..a84048d8c6a
--- /dev/null
+++ b/src/intel/compiler/brw_vec4.h
@@ -0,0 +1,399 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_VEC4_H
+#define BRW_VEC4_H
+
+#include "brw_shader.h"
+
+#ifdef __cplusplus
+#include "brw_ir_vec4.h"
+#endif
+
+#include "compiler/glsl/ir.h"
+#include "compiler/nir/nir.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Generate the final assembly for a vec4 program from its control flow
+ * graph \p cfg.
+ *
+ * Declared with C linkage so that it can be called from both C and C++.
+ *
+ * NOTE(review): presumably returns the generated code and stores its size in
+ * \p out_assembly_size -- confirm against the definition, which is not part
+ * of this header.
+ */
+const unsigned *
+brw_vec4_generate_assembly(const struct brw_compiler *compiler,
+ void *log_data,
+ void *mem_ctx,
+ const nir_shader *nir,
+ struct brw_vue_prog_data *prog_data,
+ const struct cfg_t *cfg,
+ unsigned *out_assembly_size);
+
+#ifdef __cplusplus
+} /* extern "C" */
+
+namespace brw {
+
+class vec4_live_variables;
+
+/**
+ * The vec4 back-end visitor.
+ *
+ * Builds vec4 IR from a NIR shader (see emit_nir_code() and the nir_emit_*()
+ * hooks below) and declares the optimization, lowering and register
+ * allocation passes that are applied to it. Stage-specific behavior is
+ * supplied by subclasses through the pure virtual hooks (setup_payload(),
+ * emit_prolog(), emit_thread_end(), ...).
+ *
+ * NOTE(review): this comment used to describe a "vertex shader front-end"
+ * translating GLSL IR or Mesa IR into VS IR; the interface below only takes
+ * a nir_shader, so that wording looked stale -- confirm.
+ */
+class vec4_visitor : public backend_shader
+{
+public:
+ vec4_visitor(const struct brw_compiler *compiler,
+ void *log_data,
+ const struct brw_sampler_prog_key_data *key,
+ struct brw_vue_prog_data *prog_data,
+ const nir_shader *shader,
+ void *mem_ctx,
+ bool no_spills,
+ int shader_time_index);
+ virtual ~vec4_visitor();
+
+ /* Null-register destinations, one helper per register type. */
+ dst_reg dst_null_f()
+ {
+ return dst_reg(brw_null_reg());
+ }
+
+ dst_reg dst_null_df()
+ {
+ return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
+ }
+
+ dst_reg dst_null_d()
+ {
+ return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+ }
+
+ dst_reg dst_null_ud()
+ {
+ return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
+ }
+
+ const struct brw_sampler_prog_key_data * const key_tex;
+ struct brw_vue_prog_data * const prog_data;
+ /* Failure state of the compile; see fail() below. */
+ char *fail_msg;
+ bool failed;
+
+ /**
+ * GLSL IR currently being processed, which is associated with our
+ * driver IR instructions for debugging purposes.
+ */
+ const void *base_ir;
+ const char *current_annotation;
+
+ int first_non_payload_grf;
+ unsigned int max_grf;
+ int *virtual_grf_start;
+ int *virtual_grf_end;
+ brw::vec4_live_variables *live_intervals;
+ dst_reg userplane[MAX_CLIP_PLANES];
+
+ bool need_all_constants_in_pull_buffer;
+
+ /* Regs for vertex results. Generated at ir_variable visiting time
+ * for the ir->location's used.
+ */
+ dst_reg output_reg[VARYING_SLOT_TESS_MAX][4];
+ unsigned output_num_components[VARYING_SLOT_TESS_MAX][4];
+ const char *output_reg_annotation[VARYING_SLOT_TESS_MAX];
+ int uniforms;
+
+ src_reg shader_start_time;
+
+ /* Top-level entry point of the visitor. */
+ bool run();
+ void fail(const char *msg, ...);
+
+ int setup_uniforms(int payload_reg);
+
+ /* Register allocation, spilling, and the optimization and lowering passes.
+ * The ones returning bool are used as "made progress" results.
+ */
+ bool reg_allocate_trivial();
+ bool reg_allocate();
+ void evaluate_spill_costs(float *spill_costs, bool *no_spill);
+ int choose_spill_reg(struct ra_graph *g);
+ void spill_reg(int spill_reg);
+ void move_grf_array_access_to_scratch();
+ void move_uniform_array_access_to_pull_constants();
+ void move_push_constants_to_pull_constants();
+ void split_uniform_registers();
+ void pack_uniform_registers();
+ void calculate_live_intervals();
+ void invalidate_live_intervals();
+ void split_virtual_grfs();
+ bool opt_vector_float();
+ bool opt_reduce_swizzle();
+ bool dead_code_eliminate();
+ int var_range_start(unsigned v, unsigned n) const;
+ int var_range_end(unsigned v, unsigned n) const;
+ bool virtual_grf_interferes(int a, int b);
+ bool opt_cmod_propagation();
+ bool opt_copy_propagation(bool do_constant_prop = true);
+ bool opt_cse_local(bblock_t *block);
+ bool opt_cse();
+ bool opt_algebraic();
+ bool opt_register_coalesce();
+ bool eliminate_find_live_channel();
+ bool is_dep_ctrl_unsafe(const vec4_instruction *inst);
+ void opt_set_dependency_control();
+ void opt_schedule_instructions();
+ void convert_to_hw_regs();
+
+ /* SIMD width lowering and handling of 64-bit (double-precision)
+ * operands.
+ */
+ bool is_supported_64bit_region(vec4_instruction *inst, unsigned arg);
+ bool lower_simd_width();
+ bool scalarize_df();
+ bool lower_64bit_mad_to_mul_add();
+ void apply_logical_swizzle(struct brw_reg *hw_reg,
+ vec4_instruction *inst, int arg);
+
+ /* Instruction emission helpers, overloaded on the number of sources. */
+ vec4_instruction *emit(vec4_instruction *inst);
+
+ vec4_instruction *emit(enum opcode opcode);
+ vec4_instruction *emit(enum opcode opcode, const dst_reg &dst);
+ vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
+ const src_reg &src0);
+ vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
+ const src_reg &src0, const src_reg &src1);
+ vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
+ const src_reg &src0, const src_reg &src1,
+ const src_reg &src2);
+
+ vec4_instruction *emit_before(bblock_t *block,
+ vec4_instruction *inst,
+ vec4_instruction *new_inst);
+
+ /* EMIT1/EMIT2/EMIT3 declare a member function named after the opcode that
+ * takes a destination plus one, two or three sources and returns a
+ * vec4_instruction pointer. The macros are undefined again right after
+ * the list.
+ */
+#define EMIT1(op) vec4_instruction *op(const dst_reg &, const src_reg &);
+#define EMIT2(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &);
+#define EMIT3(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &, const src_reg &);
+ EMIT1(MOV)
+ EMIT1(NOT)
+ EMIT1(RNDD)
+ EMIT1(RNDE)
+ EMIT1(RNDZ)
+ EMIT1(FRC)
+ EMIT1(F32TO16)
+ EMIT1(F16TO32)
+ EMIT2(ADD)
+ EMIT2(MUL)
+ EMIT2(MACH)
+ EMIT2(MAC)
+ EMIT2(AND)
+ EMIT2(OR)
+ EMIT2(XOR)
+ EMIT2(DP3)
+ EMIT2(DP4)
+ EMIT2(DPH)
+ EMIT2(SHL)
+ EMIT2(SHR)
+ EMIT2(ASR)
+ vec4_instruction *CMP(dst_reg dst, src_reg src0, src_reg src1,
+ enum brw_conditional_mod condition);
+ vec4_instruction *IF(src_reg src0, src_reg src1,
+ enum brw_conditional_mod condition);
+ vec4_instruction *IF(enum brw_predicate predicate);
+ EMIT1(SCRATCH_READ)
+ EMIT2(SCRATCH_WRITE)
+ EMIT3(LRP)
+ EMIT1(BFREV)
+ EMIT3(BFE)
+ EMIT2(BFI1)
+ EMIT3(BFI2)
+ EMIT1(FBH)
+ EMIT1(FBL)
+ EMIT1(CBIT)
+ EMIT3(MAD)
+ EMIT2(ADDC)
+ EMIT2(SUBB)
+ EMIT1(DIM)
+
+#undef EMIT1
+#undef EMIT2
+#undef EMIT3
+
+ int implied_mrf_writes(vec4_instruction *inst);
+
+ vec4_instruction *emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
+ src_reg src0, src_reg src1);
+
+ vec4_instruction *emit_lrp(const dst_reg &dst, const src_reg &x,
+ const src_reg &y, const src_reg &a);
+
+ /**
+ * Copy any live channel from \p src to the first channel of the
+ * result.
+ */
+ src_reg emit_uniformize(const src_reg &src);
+
+ src_reg fix_3src_operand(const src_reg &src);
+ src_reg resolve_source_modifiers(const src_reg &src);
+
+ vec4_instruction *emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+ const src_reg &src1 = src_reg());
+
+ src_reg fix_math_operand(const src_reg &src);
+
+ void emit_pack_half_2x16(dst_reg dst, src_reg src0);
+ void emit_unpack_half_2x16(dst_reg dst, src_reg src0);
+ void emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0);
+ void emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0);
+ void emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0);
+ void emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0);
+
+ void emit_texture(ir_texture_opcode op,
+ dst_reg dest,
+ const glsl_type *dest_type,
+ src_reg coordinate,
+ int coord_components,
+ src_reg shadow_comparator,
+ src_reg lod, src_reg lod2,
+ src_reg sample_index,
+ uint32_t constant_offset,
+ src_reg offset_value,
+ src_reg mcs,
+ uint32_t surface, src_reg surface_reg,
+ src_reg sampler_reg);
+
+ src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate,
+ src_reg surface);
+ void emit_gen6_gather_wa(uint8_t wa, dst_reg dst);
+
+ void emit_ndc_computation();
+ void emit_psiz_and_flags(dst_reg reg);
+ vec4_instruction *emit_generic_urb_slot(dst_reg reg, int varying, int comp);
+ virtual void emit_urb_slot(dst_reg reg, int varying);
+
+ void emit_shader_time_begin();
+ void emit_shader_time_end();
+ void emit_shader_time_write(int shader_time_subindex, src_reg value);
+
+ src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst,
+ src_reg *reladdr, int reg_offset);
+ void emit_scratch_read(bblock_t *block, vec4_instruction *inst,
+ dst_reg dst,
+ src_reg orig_src,
+ int base_offset);
+ void emit_scratch_write(bblock_t *block, vec4_instruction *inst,
+ int base_offset);
+ void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
+ dst_reg dst,
+ src_reg orig_src,
+ int base_offset,
+ src_reg indirect);
+ void emit_pull_constant_load_reg(dst_reg dst,
+ src_reg surf_index,
+ src_reg offset,
+ bblock_t *before_block,
+ vec4_instruction *before_inst);
+ src_reg emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
+ vec4_instruction *inst, src_reg src);
+
+ void resolve_ud_negate(src_reg *reg);
+
+ bool lower_minmax();
+
+ src_reg get_timestamp();
+
+ void dump_instruction(backend_instruction *inst);
+ void dump_instruction(backend_instruction *inst, FILE *file);
+
+ bool is_high_sampler(src_reg sampler);
+
+ bool optimize_predicate(nir_alu_instr *instr, enum brw_predicate *predicate);
+
+ void emit_conversion_from_double(dst_reg dst, src_reg src, bool saturate,
+ brw_reg_type single_type);
+ void emit_conversion_to_double(dst_reg dst, src_reg src, bool saturate,
+ brw_reg_type single_type);
+
+ src_reg setup_imm_df(double v);
+
+ vec4_instruction *shuffle_64bit_data(dst_reg dst, src_reg src,
+ bool for_write,
+ bblock_t *block = NULL,
+ vec4_instruction *ref = NULL);
+
+ /* NIR translation hooks. All of them are virtual so that subclasses can
+ * override individual steps.
+ */
+ virtual void emit_nir_code();
+ virtual void nir_setup_uniforms();
+ virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
+ virtual void nir_setup_system_values();
+ virtual void nir_emit_impl(nir_function_impl *impl);
+ virtual void nir_emit_cf_list(exec_list *list);
+ virtual void nir_emit_if(nir_if *if_stmt);
+ virtual void nir_emit_loop(nir_loop *loop);
+ virtual void nir_emit_block(nir_block *block);
+ virtual void nir_emit_instr(nir_instr *instr);
+ virtual void nir_emit_load_const(nir_load_const_instr *instr);
+ virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
+ virtual void nir_emit_alu(nir_alu_instr *instr);
+ virtual void nir_emit_jump(nir_jump_instr *instr);
+ virtual void nir_emit_texture(nir_tex_instr *instr);
+ virtual void nir_emit_undef(nir_ssa_undef_instr *instr);
+ virtual void nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr);
+
+ dst_reg get_nir_dest(const nir_dest &dest, enum brw_reg_type type);
+ dst_reg get_nir_dest(const nir_dest &dest, nir_alu_type type);
+ dst_reg get_nir_dest(const nir_dest &dest);
+ src_reg get_nir_src(const nir_src &src, enum brw_reg_type type,
+ unsigned num_components = 4);
+ src_reg get_nir_src(const nir_src &src, nir_alu_type type,
+ unsigned num_components = 4);
+ src_reg get_nir_src(const nir_src &src,
+ unsigned num_components = 4);
+ src_reg get_indirect_offset(nir_intrinsic_instr *instr);
+
+ virtual dst_reg *make_reg_for_system_value(int location) = 0;
+
+ dst_reg *nir_locals;
+ dst_reg *nir_ssa_values;
+ dst_reg *nir_system_values;
+
+protected:
+ void emit_vertex();
+ void lower_attributes_to_hw_regs(const int *attribute_map,
+ bool interleaved);
+ void setup_payload_interference(struct ra_graph *g, int first_payload_node,
+ int reg_node_count);
+ /* Stage-specific hooks that every concrete visitor has to implement. */
+ virtual void setup_payload() = 0;
+ virtual void emit_prolog() = 0;
+ virtual void emit_thread_end() = 0;
+ virtual void emit_urb_write_header(int mrf) = 0;
+ virtual vec4_instruction *emit_urb_write_opcode(bool complete) = 0;
+ virtual void gs_emit_vertex(int stream_id);
+ virtual void gs_end_primitive();
+
+private:
+ /**
+ * If true, then register allocation should fail instead of spilling.
+ */
+ const bool no_spills;
+
+ /* Shader-time index handed to the constructor. */
+ int shader_time_index;
+
+ unsigned last_scratch; /**< measured in 32-byte (register size) units */
+};
+
+} /* namespace brw */
+#endif /* __cplusplus */
+
+#endif /* BRW_VEC4_H */
diff --git a/src/intel/compiler/brw_vec4_builder.h b/src/intel/compiler/brw_vec4_builder.h
new file mode 100644
index 00000000000..4c3efe8457b
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_builder.h
@@ -0,0 +1,634 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2010-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_VEC4_BUILDER_H
+#define BRW_VEC4_BUILDER_H
+
+#include "brw_ir_vec4.h"
+#include "brw_ir_allocator.h"
+
+namespace brw {
+ /**
+ * Toolbox to assemble a VEC4 IR program out of individual instructions.
+ *
+ * This object is meant to have an interface consistent with
+ * brw::fs_builder. They cannot be fully interchangeable because
+ * brw::fs_builder generates scalar code while brw::vec4_builder generates
+ * vector code.
+ */
+ class vec4_builder {
+ public:
+ /** Type used in this IR to represent a source of an instruction. */
+ typedef brw::src_reg src_reg;
+
+ /** Type used in this IR to represent the destination of an instruction. */
+ typedef brw::dst_reg dst_reg;
+
+ /** Type used in this IR to represent an instruction. */
+ typedef vec4_instruction instruction;
+
+ /**
+ * Construct a vec4_builder that inserts instructions into \p shader.
+ */
+ vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) :
+    shader(shader),
+    block(NULL),
+    cursor(NULL),
+    _dispatch_width(dispatch_width),
+    _group(0),
+    force_writemask_all(false),
+    annotation()
+ {
+    /* Nothing else to set up: the builder starts without an insertion
+     * point (block and cursor are NULL), at channel group 0, with
+     * force_writemask_all cleared and a value-initialized annotation.
+     */
+ }
+
+ /**
+ * Construct a vec4_builder that inserts instructions into \p shader
+ * before instruction \p inst in basic block \p block. The default
+ * execution controls and debug annotation are initialized from the
+ * instruction passed as argument.
+ */
+ vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) :
+    shader(shader),
+    block(block),
+    cursor(inst),
+    _dispatch_width(inst->exec_size),
+    _group(inst->group),
+    force_writemask_all(inst->force_writemask_all)
+ {
+    /* Inherit the debug annotation of the reference instruction so that
+     * instructions emitted through this builder carry the same info.
+     */
+    annotation.ir = inst->ir;
+    annotation.str = inst->annotation;
+ }
+
+ /**
+ * Construct a vec4_builder that inserts instructions before \p cursor
+ * in basic block \p block, inheriting other code generation parameters
+ * from this.
+ */
+ vec4_builder
+ at(bblock_t *block, exec_node *cursor) const
+ {
+    /* Work on a copy so that this builder is left untouched; only the
+     * insertion point differs in the returned builder.
+     */
+    vec4_builder positioned(*this);
+    positioned.cursor = cursor;
+    positioned.block = block;
+    return positioned;
+ }
+
+ /**
+ * Construct a vec4_builder appending instructions at the end of the
+ * instruction list of the shader, inheriting other code generation
+ * parameters from this.
+ */
+ vec4_builder
+ at_end() const
+ {
+    /* emit() inserts before the cursor, so pointing the cursor at the tail
+     * sentinel of the shader's instruction list appends to that list.  No
+     * basic block is associated with this position.
+     */
+    exec_node *const tail =
+       (exec_node *)&shader->instructions.tail_sentinel;
+    return at(NULL, tail);
+ }
+
+ /**
+ * Construct a builder specifying the default SIMD width and group of
+ * channel enable signals, inheriting other code generation parameters
+ * from this.
+ *
+ * \p n gives the default SIMD width, \p i gives the slot group used for
+ * predication and control flow masking in multiples of \p n channels.
+ */
+ vec4_builder
+ group(unsigned n, unsigned i) const
+ {
+    /* Unless execution masking is disabled, the requested slot group has
+     * to fit inside the current dispatch width.
+     */
+    assert(force_writemask_all ||
+           (n <= dispatch_width() && i < dispatch_width() / n));
+
+    vec4_builder narrowed(*this);
+    /* The group offset accumulates relative to the group already in use. */
+    narrowed._group += i * n;
+    narrowed._dispatch_width = n;
+    return narrowed;
+ }
+
+ /**
+ * Construct a builder with per-channel control flow execution masking
+ * disabled if \p b is true. If control flow execution masking is
+ * already disabled this has no effect.
+ */
+ vec4_builder
+ exec_all(bool b = true) const
+ {
+    vec4_builder unmasked(*this);
+    /* The flag can only ever be turned on here: passing false keeps
+     * whatever state this builder already had.
+     */
+    unmasked.force_writemask_all = unmasked.force_writemask_all || b;
+    return unmasked;
+ }
+
+ /**
+ * Construct a builder with the given debug annotation info.
+ */
+ vec4_builder
+ annotate(const char *str, const void *ir = NULL) const
+ {
+    /* Everything but the annotation is inherited from this builder. */
+    vec4_builder annotated(*this);
+    annotated.annotation.ir = ir;
+    annotated.annotation.str = str;
+    return annotated;
+ }
+
+ /**
+ * Get the SIMD width in use.
+ */
+ unsigned
+ dispatch_width() const
+ {
+ return _dispatch_width; /* set by the constructors, replaced by group(n, i) */
+ }
+
+ /**
+ * Get the channel group in use.
+ */
+ unsigned
+ group() const
+ {
+ return _group; /* set by the constructors, advanced by i * n in group(n, i) */
+ }
+
+ /**
+ * Allocate a virtual register of natural vector size (four for this IR)
+ * and SIMD width. \p n gives the amount of space to allocate in
+ * dispatch_width units (which is just enough space for four logical
+ * components in this IR).
+ */
+ dst_reg
+ vgrf(enum brw_reg_type type, unsigned n = 1) const
+ {
+    assert(dispatch_width() <= 32);
+
+    /* A request for no space at all yields the null register. */
+    if (n == 0)
+       return retype(null_reg_ud(), type);
+
+    /* The allocation is scaled by the type size rounded up to whole
+     * 4-byte units.
+     */
+    const unsigned size = n * DIV_ROUND_UP(type_sz(type), 4);
+    const dst_reg reg(VGRF, shader->alloc.allocate(size));
+    return retype(reg, type);
+ }
+
+ /**
+ * Create a null register of floating type.
+ */
+ dst_reg
+ null_reg_f() const
+ {
+    /* Null vector for the current dispatch width, viewed as float. */
+    const struct brw_reg null_vec = brw_null_vec(dispatch_width());
+    return dst_reg(retype(null_vec, BRW_REGISTER_TYPE_F));
+ }
+
+ /**
+ * Create a null register of signed integer type.
+ */
+ dst_reg
+ null_reg_d() const
+ {
+    /* Null vector for the current dispatch width, viewed as signed int. */
+    const struct brw_reg null_vec = brw_null_vec(dispatch_width());
+    return dst_reg(retype(null_vec, BRW_REGISTER_TYPE_D));
+ }
+
+ /**
+ * Create a null register of unsigned integer type.
+ */
+ dst_reg
+ null_reg_ud() const
+ {
+    /* Null vector for the current dispatch width, viewed as unsigned int. */
+    const struct brw_reg null_vec = brw_null_vec(dispatch_width());
+    return dst_reg(retype(null_vec, BRW_REGISTER_TYPE_UD));
+ }
+
+ /**
+ * Insert an instruction into the program.
+ */
+ instruction *
+ emit(const instruction &inst) const
+ {
+    /* Copy the instruction into storage obtained from the shader's memory
+     * context and hand the copy over to the pointer overload.
+     */
+    instruction *const copy = new(shader->mem_ctx) instruction(inst);
+    return emit(copy);
+ }
+
+ /**
+ * Create and insert a nullary control instruction into the program.
+ */
+ instruction *
+ emit(enum opcode opcode) const
+ {
+    /* Build the template on the stack; emit() makes the lasting copy. */
+    const instruction nullary(opcode);
+    return emit(nullary);
+ }
+
+ /**
+ * Create and insert a nullary instruction into the program.
+ */
+ instruction *
+ emit(enum opcode opcode, const dst_reg &dst) const
+ {
+    /* Build the template on the stack; emit() makes the lasting copy. */
+    const instruction with_dst(opcode, dst);
+    return emit(with_dst);
+ }
+
+ /**
+ * Create and insert a unary instruction into the program.
+ */
+ instruction *
+ emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
+ {
+    bool is_math;
+
+    switch (opcode) {
+    case SHADER_OPCODE_RCP:
+    case SHADER_OPCODE_RSQ:
+    case SHADER_OPCODE_SQRT:
+    case SHADER_OPCODE_EXP2:
+    case SHADER_OPCODE_LOG2:
+    case SHADER_OPCODE_SIN:
+    case SHADER_OPCODE_COS:
+       is_math = true;
+       break;
+
+    default:
+       is_math = false;
+       break;
+    }
+
+    if (!is_math)
+       return emit(instruction(opcode, dst, src0));
+
+    /* Math opcodes get their operand fixed up first (which may emit a
+     * copy ahead of the instruction) and the instruction itself patched
+     * once it has been inserted.
+     */
+    const src_reg operand = fix_math_operand(src0);
+    instruction *const math = emit(instruction(opcode, dst, operand));
+    return fix_math_instruction(math);
+ }
+
+ /**
+ * Create and insert a binary instruction into the program.
+ */
+ instruction *
+ emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+      const src_reg &src1) const
+ {
+    const bool is_math = (opcode == SHADER_OPCODE_POW ||
+                          opcode == SHADER_OPCODE_INT_QUOTIENT ||
+                          opcode == SHADER_OPCODE_INT_REMAINDER);
+
+    if (!is_math)
+       return emit(instruction(opcode, dst, src0, src1));
+
+    /* Both operands go through the math operand workaround before the
+     * instruction is inserted; the instruction is patched afterwards.
+     * The two fix-ups are deliberately left inside one argument list.
+     */
+    instruction *const math =
+       emit(instruction(opcode, dst,
+                        fix_math_operand(src0),
+                        fix_math_operand(src1)));
+    return fix_math_instruction(math);
+ }
+
+ /**
+ * Create and insert a ternary instruction into the program.
+ */
+ instruction *
+ emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+      const src_reg &src1, const src_reg &src2) const
+ {
+    const bool needs_3src_fixup = (opcode == BRW_OPCODE_BFE ||
+                                  opcode == BRW_OPCODE_BFI2 ||
+                                  opcode == BRW_OPCODE_MAD ||
+                                  opcode == BRW_OPCODE_LRP);
+
+    /* Anything else is emitted with its sources untouched. */
+    if (!needs_3src_fixup)
+       return emit(instruction(opcode, dst, src0, src1, src2));
+
+    /* Every source of these opcodes is run through fix_3src_operand(). */
+    return emit(instruction(opcode, dst,
+                            fix_3src_operand(src0),
+                            fix_3src_operand(src1),
+                            fix_3src_operand(src2)));
+ }
+
+ /**
+ * Insert a preallocated instruction into the program.
+ */
+ instruction *
+ emit(instruction *inst) const
+ {
+    /* Stamp the builder's debug info and execution controls onto the
+     * instruction, overwriting whatever it carried before.
+     */
+    inst->annotation = annotation.str;
+    inst->ir = annotation.ir;
+    inst->force_writemask_all = force_writemask_all;
+    inst->group = group();
+    inst->exec_size = dispatch_width();
+
+    /* Derived from the execution size that was just stored. */
+    inst->size_written = inst->exec_size * type_sz(inst->dst.type);
+
+    /* Without a basic block the plain list insertion is used; with one,
+     * the block-aware insertion of the instruction class is used.
+     */
+    if (!block)
+       cursor->insert_before(inst);
+    else
+       static_cast<instruction *>(cursor)->insert_before(block, inst);
+
+    return inst;
+ }
+
+ /**
+ * Select \p src0 if the comparison of both sources with the given
+ * conditional mod evaluates to true, otherwise select \p src1.
+ *
+ * Generally useful to get the minimum or maximum of two values.
+ */
+ instruction *
+ emit_minmax(const dst_reg &dst, const src_reg &src0,
+             const src_reg &src1, brw_conditional_mod mod) const
+ {
+    assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
+
+    /* A SEL carrying the conditional mod does the selection; negated UD
+     * sources are resolved into temporaries beforehand.
+     */
+    instruction *const sel = SEL(dst, fix_unsigned_negate(src0),
+                                 fix_unsigned_negate(src1));
+    return set_condmod(mod, sel);
+ }
+
+ /**
+ * Copy any live channel from \p src to the first channel of the result.
+ */
+ src_reg
+ emit_uniformize(const src_reg &src) const
+ {
+    /* Both instructions are emitted with execution masking disabled. */
+    const vec4_builder unmasked = exec_all();
+
+    /* Temporaries: the index of a live channel (X component only) and
+     * the broadcast result.
+     */
+    const dst_reg live_chan =
+       writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X);
+    const dst_reg result = vgrf(src.type);
+
+    unmasked.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, live_chan);
+    unmasked.emit(SHADER_OPCODE_BROADCAST, result, src, src_reg(live_chan));
+
+    return src_reg(result);
+ }
+
+ /**
+ * Assorted arithmetic ops.
+ *
+ * Each ALUn(op) helper macro below defines a const member function named
+ * after \p op that forwards its destination and n source operands to
+ * emit(BRW_OPCODE_<op>, ...) and returns the emitted instruction.
+ * ALU2_ACC does the same as ALU2 and additionally sets
+ * writes_accumulator on the emitted instruction.
+ * @{
+ */
+#define ALU1(op) \
+ instruction * \
+ op(const dst_reg &dst, const src_reg &src0) const \
+ { \
+ return emit(BRW_OPCODE_##op, dst, src0); \
+ }
+
+#define ALU2(op) \
+ instruction * \
+ op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
+ { \
+ return emit(BRW_OPCODE_##op, dst, src0, src1); \
+ }
+
+#define ALU2_ACC(op) \
+ instruction * \
+ op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
+ { \
+ instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \
+ inst->writes_accumulator = true; \
+ return inst; \
+ }
+
+#define ALU3(op) \
+ instruction * \
+ op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
+ const src_reg &src2) const \
+ { \
+ return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \
+ }
+
+ ALU2(ADD)
+ ALU2_ACC(ADDC)
+ ALU2(AND)
+ ALU2(ASR)
+ ALU2(AVG)
+ ALU3(BFE)
+ ALU2(BFI1)
+ ALU3(BFI2)
+ ALU1(BFREV)
+ ALU1(CBIT)
+ ALU2(CMPN)
+ ALU3(CSEL)
+ ALU1(DIM)
+ ALU2(DP2)
+ ALU2(DP3)
+ ALU2(DP4)
+ ALU2(DPH)
+ ALU1(F16TO32)
+ ALU1(F32TO16)
+ ALU1(FBH)
+ ALU1(FBL)
+ ALU1(FRC)
+ ALU2(LINE)
+ ALU1(LZD)
+ ALU2(MAC)
+ ALU2_ACC(MACH)
+ ALU3(MAD)
+ ALU1(MOV)
+ ALU2(MUL)
+ ALU1(NOT)
+ ALU2(OR)
+ ALU2(PLN)
+ ALU1(RNDD)
+ ALU1(RNDE)
+ ALU1(RNDU)
+ ALU1(RNDZ)
+ ALU2(SAD2)
+ ALU2_ACC(SADA2)
+ ALU2(SEL)
+ ALU2(SHL)
+ ALU2(SHR)
+ ALU2_ACC(SUBB)
+ ALU2(XOR)
+
+#undef ALU3
+#undef ALU2_ACC
+#undef ALU2
+#undef ALU1
+ /** @} */
+
+ /**
+ * CMP: Sets the low bit of the destination channels with the result
+ * of the comparison, while the upper bits are undefined, and updates
+ * the flag register with the packed 16 bits of the result.
+ */
+ instruction *
+ CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
+     brw_conditional_mod condition) const
+ {
+    /* Consider:
+     *
+     *    CMP null<d> src0<f> src1<f>
+     *
+     * Original gen4 converts to the destination type before comparing,
+     * which yields garbage for floating point comparisons.  Newer
+     * generations don't care about the destination type, so it is made
+     * to match src0, which also allows the instruction to be compacted.
+     */
+    const dst_reg typed_dst = retype(dst, src0.type);
+
+    instruction *const cmp = emit(BRW_OPCODE_CMP, typed_dst,
+                                  fix_unsigned_negate(src0),
+                                  fix_unsigned_negate(src1));
+    return set_condmod(condition, cmp);
+ }
+
+ /**
+ * Gen4 predicated IF.
+ */
+ instruction *
+ IF(brw_predicate predicate) const
+ {
+    /* Emit the bare IF first, then attach the predicate to it. */
+    instruction *const branch = emit(BRW_OPCODE_IF);
+    return set_predicate(predicate, branch);
+ }
+
+ /**
+ * Gen6 IF with embedded comparison.
+ */
+ instruction *
+ IF(const src_reg &src0, const src_reg &src1,
+    brw_conditional_mod condition) const
+ {
+    assert(shader->devinfo->gen == 6);
+
+    /* The comparison is carried by the IF itself: it writes the null
+     * register and the conditional mod selects the test.
+     */
+    instruction *const branch = emit(BRW_OPCODE_IF,
+                                     null_reg_d(),
+                                     fix_unsigned_negate(src0),
+                                     fix_unsigned_negate(src1));
+    return set_condmod(condition, branch);
+ }
+
+ /**
+ * Emit a linear interpolation instruction.
+ */
+ instruction *
+ LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
+     const src_reg &a) const
+ {
+    /* From gen6 on the LRP instruction is used.  It computes
+     * op1 * op0 + op2 * (1 - op0), hence the reordered operands.
+     */
+    if (shader->devinfo->gen >= 6)
+       return emit(BRW_OPCODE_LRP, dst, a, y, x);
+
+    /* Otherwise LRP can't be used: expand to x * (1 - a) + y * a. */
+    const dst_reg ya = vgrf(dst.type);
+    const dst_reg inv_a = vgrf(dst.type);
+    const dst_reg x_inv_a = vgrf(dst.type);
+
+    MUL(ya, y, a);
+    ADD(inv_a, negate(a), brw_imm_f(1.0f));
+    MUL(x_inv_a, x, src_reg(inv_a));
+    return ADD(dst, src_reg(x_inv_a), src_reg(ya));
+ }
+
+ backend_shader *shader;
+
+ protected:
+ /**
+ * Workaround for negation of UD registers. See comment in
+ * fs_generator::generate_code() for the details.
+ */
+ src_reg
+ fix_unsigned_negate(const src_reg &src) const
+ {
+    /* Only negated UD sources need the workaround. */
+    if (src.type != BRW_REGISTER_TYPE_UD || !src.negate)
+       return src;
+
+    /* Resolve the negation into a temporary with a MOV and hand back the
+     * temporary instead.
+     */
+    dst_reg resolved = vgrf(BRW_REGISTER_TYPE_UD);
+    MOV(resolved, src);
+    return src_reg(resolved);
+ }
+
+ /**
+ * Workaround for register access modes not supported by the ternary
+ * instruction encoding.
+ */
+ src_reg
+ fix_3src_operand(const src_reg &src) const
+ {
+    /* Vec4 uniforms are awkward in SIMD4x2 programs.  Ideally a vertical
+     * stride of zero would replicate the vec4 uniform:
+     *
+     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
+     *
+     * but three-source instructions always have a vertical stride of
+     * four.  So the replication is done by a separate instruction whose
+     * result the three-source instruction can consume.
+     */
+    const bool from_uniform = (src.file == UNIFORM);
+    const bool from_imm = (src.file == IMM);
+
+    /* Immediates always need expanding; uniforms only when the swizzle
+     * selects more than a single value.  Nothing else is touched.
+     */
+    const bool needs_unpack =
+       from_imm ||
+       (from_uniform && !brw_is_single_value_swizzle(src.swizzle));
+
+    if (!needs_unpack)
+       return src;
+
+    const dst_reg unpacked = vgrf(src.type);
+    emit(VEC4_OPCODE_UNPACK_UNIFORM, unpacked, src);
+    return src_reg(unpacked);
+ }
+
+ /**
+ * Workaround for register access modes not supported by the math
+ * instruction.
+ */
+ src_reg
+ fix_math_operand(const src_reg &src) const
+ {
+    const int gen = shader->devinfo->gen;
+
+    /* The gen6 math instruction ignores source modifiers -- swizzle, abs,
+     * negate, and at least parts of the register region description.
+     * Instead of enumerating those cases the operand is *always* copied
+     * to a temporary GRF on gen6.
+     *
+     * Gen7 takes the operand as-is unless it is an immediate, which gen7
+     * still can't use.
+     */
+    const bool needs_temp = (gen == 6) || (gen == 7 && src.file == IMM);
+
+    if (!needs_temp)
+       return src;
+
+    const dst_reg copy = vgrf(src.type);
+    MOV(copy, src);
+    return src_reg(copy);
+ }
+
+ /**
+ * Workaround other weirdness of the math instruction.
+ */
+ instruction *
+ fix_math_instruction(instruction *inst) const
+ {
+    const int gen = shader->devinfo->gen;
+
+    if (gen < 6) {
+       /* Set up the message: it starts at MRF 1 and is one register long
+        * per source actually present.
+        */
+       inst->base_mrf = 1;
+       inst->mlen = (inst->src[1].file == BAD_FILE ? 1 : 2);
+
+    } else if (gen == 6 && inst->dst.writemask != WRITEMASK_XYZW) {
+       /* Redirect the math result into a temporary and copy it into the
+        * original (partially masked) destination with a MOV.
+        */
+       const dst_reg full = vgrf(inst->dst.type);
+       MOV(inst->dst, src_reg(full));
+       inst->dst = full;
+    }
+
+    return inst;
+ }
+
+ bblock_t *block;
+ exec_node *cursor;
+
+ unsigned _dispatch_width;
+ unsigned _group;
+ bool force_writemask_all;
+
+ /** Debug annotation info. */
+ struct {
+ const char *str;
+ const void *ir;
+ } annotation;
+ };
+}
+
+#endif
diff --git a/src/intel/compiler/brw_vec4_cmod_propagation.cpp b/src/intel/compiler/brw_vec4_cmod_propagation.cpp
new file mode 100644
index 00000000000..4454cdbfc94
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_cmod_propagation.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+/** @file brw_vec4_cmod_propagation.cpp
+ *
+ * Really similar to brw_fs_cmod_propagation but adapted to vec4 needs. Check
+ * brw_fs_cmod_propagation for further details on the rationale behind this
+ * optimization.
+ */
+
+#include "brw_vec4.h"
+#include "brw_cfg.h"
+#include "brw_eu.h"
+
+namespace brw {
+
+/**
+ * Run conditional-modifier propagation over a single basic block.
+ *
+ * The block is walked backwards looking for unpredicated AND/CMP/MOV
+ * instructions that write the null register and read a VGRF without abs.
+ * For each candidate the preceding instructions of the block are scanned
+ * for the one writing that VGRF; the candidate is then removed if it is
+ * redundant, or if its conditional mod can be moved onto the instruction
+ * that produced its source.
+ *
+ * \return true if at least one instruction was removed.
+ */
+static bool
+opt_cmod_propagation_local(bblock_t *block)
+{
+   bool progress = false;
+
+   foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) {
+      if ((inst->opcode != BRW_OPCODE_AND &&
+           inst->opcode != BRW_OPCODE_CMP &&
+           inst->opcode != BRW_OPCODE_MOV) ||
+          inst->predicate != BRW_PREDICATE_NONE ||
+          !inst->dst.is_null() ||
+          inst->src[0].file != VGRF ||
+          inst->src[0].abs)
+         continue;
+
+      /* The only AND considered is "AND.nz null, src0, 1" without negate. */
+      if (inst->opcode == BRW_OPCODE_AND &&
+          !(inst->src[1].is_one() &&
+            inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+            !inst->src[0].negate))
+         continue;
+
+      /* The only CMP considered is a comparison against zero. */
+      if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero())
+         continue;
+
+      /* The only MOV considered is one with a .nz conditional mod. */
+      if (inst->opcode == BRW_OPCODE_MOV &&
+          inst->conditional_mod != BRW_CONDITIONAL_NZ)
+         continue;
+
+      /* Set once an instruction between the producer and inst is found to
+       * read the flag.
+       */
+      bool read_flag = false;
+      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst, inst) {
+         if (regions_overlap(inst->src[0], inst->size_read(0),
+                             scan_inst->dst, scan_inst->size_written)) {
+            /* Give up unless scan_inst writes exactly what inst reads,
+             * with matching execution controls.
+             */
+            if ((scan_inst->predicate && scan_inst->opcode != BRW_OPCODE_SEL) ||
+                scan_inst->dst.offset != inst->src[0].offset ||
+                (scan_inst->dst.writemask != WRITEMASK_X &&
+                 scan_inst->dst.writemask != WRITEMASK_XYZW) ||
+                (scan_inst->dst.writemask == WRITEMASK_XYZW &&
+                 inst->src[0].swizzle != BRW_SWIZZLE_XYZW) ||
+                (inst->dst.writemask & ~scan_inst->dst.writemask) != 0 ||
+                scan_inst->exec_size != inst->exec_size ||
+                scan_inst->group != inst->group) {
+               break;
+            }
+
+            /* CMP's result is the same regardless of dest type. */
+            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+                scan_inst->opcode == BRW_OPCODE_CMP &&
+                (inst->dst.type == BRW_REGISTER_TYPE_D ||
+                 inst->dst.type == BRW_REGISTER_TYPE_UD)) {
+               inst->remove(block);
+               progress = true;
+               break;
+            }
+
+            /* If the AND wasn't handled by the previous case, it isn't safe
+             * to remove it.
+             */
+            if (inst->opcode == BRW_OPCODE_AND)
+               break;
+
+            /* Comparisons operate differently for ints and floats */
+            if (scan_inst->dst.type != inst->dst.type &&
+                (scan_inst->dst.type == BRW_REGISTER_TYPE_F ||
+                 inst->dst.type == BRW_REGISTER_TYPE_F))
+               break;
+
+            /* If the instruction generating inst's source also wrote the
+             * flag, and inst is doing a simple .nz comparison, then inst
+             * is redundant - the appropriate value is already in the flag
+             * register.  Delete inst.
+             */
+            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+                !inst->src[0].negate &&
+                scan_inst->writes_flag()) {
+               inst->remove(block);
+               progress = true;
+               break;
+            }
+
+            /* The conditional mod of the CMP/CMPN instructions behaves
+             * specially because the flag output is not calculated from the
+             * result of the instruction, but the other way around, which
+             * means that even if the condmod to propagate and the condmod
+             * from the CMP instruction are the same they will in general give
+             * different results because they are evaluated based on different
+             * inputs.
+             */
+            if (scan_inst->opcode == BRW_OPCODE_CMP ||
+                scan_inst->opcode == BRW_OPCODE_CMPN)
+               break;
+
+            /* Otherwise, try propagating the conditional. */
+            enum brw_conditional_mod cond =
+               inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
+                                   : inst->conditional_mod;
+
+            if (scan_inst->can_do_cmod() &&
+                ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
+                 scan_inst->conditional_mod == cond)) {
+               scan_inst->conditional_mod = cond;
+               inst->remove(block);
+               progress = true;
+            }
+            break;
+         }
+
+         /* An unrelated flag write in between ends the search. */
+         if (scan_inst->writes_flag())
+            break;
+
+         read_flag = read_flag || scan_inst->reads_flag();
+      }
+   }
+
+   return progress;
+}
+
+/**
+ * Run the local pass over every block of the CFG, last block first, and
+ * invalidate the live intervals if anything changed.
+ */
+bool
+vec4_visitor::opt_cmod_propagation()
+{
+   bool any_change = false;
+
+   foreach_block_reverse(block, cfg) {
+      /* Every block is visited, whether or not earlier ones changed. */
+      if (opt_cmod_propagation_local(block))
+         any_change = true;
+   }
+
+   if (any_change)
+      invalidate_live_intervals();
+
+   return any_change;
+}
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_copy_propagation.cpp b/src/intel/compiler/brw_vec4_copy_propagation.cpp
new file mode 100644
index 00000000000..e7f6f93f8bd
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_copy_propagation.cpp
@@ -0,0 +1,558 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file brw_vec4_copy_propagation.cpp
+ *
+ * Implements tracking of values copied between registers, and
+ * optimizations based on that: copy propagation and constant
+ * propagation.
+ */
+
+#include "brw_vec4.h"
+#include "brw_cfg.h"
+#include "brw_eu.h"
+
+namespace brw {
+
+struct copy_entry {
+ src_reg *value[4]; /* per-component origin of the copied value; NULL when none is recorded (see get_copy_value()) */
+ int saturatemask; /* NOTE(review): presumably a mask of the components whose copy was saturated -- confirm */
+};
+
+/**
+ * Tell whether \p inst is an unpredicated MOV into a register-aligned VGRF
+ * destination, without relative addressing on either side, whose source
+ * and destination types match (F <- VF counts as matching).
+ */
+static bool
+is_direct_copy(vec4_instruction *inst)
+{
+   if (inst->opcode != BRW_OPCODE_MOV || inst->predicate)
+      return false;
+
+   if (inst->dst.file != VGRF || inst->dst.offset % REG_SIZE != 0)
+      return false;
+
+   if (inst->dst.reladdr || inst->src[0].reladdr)
+      return false;
+
+   if (inst->dst.type == inst->src[0].type)
+      return true;
+
+   /* A vector-float immediate moved into a float register also counts. */
+   return inst->dst.type == BRW_REGISTER_TYPE_F &&
+          inst->src[0].type == BRW_REGISTER_TYPE_VF;
+}
+
+/**
+ * Return false for the control flow opcodes DO, WHILE, ELSE and ENDIF,
+ * true for every other instruction.
+ */
+static bool
+is_dominated_by_previous_instruction(vec4_instruction *inst)
+{
+   switch (inst->opcode) {
+   case BRW_OPCODE_DO:
+   case BRW_OPCODE_WHILE:
+   case BRW_OPCODE_ELSE:
+   case BRW_OPCODE_ENDIF:
+      return false;
+
+   default:
+      return true;
+   }
+}
+
+/**
+ * Tell whether \p inst overwrites the VGRF value recorded for channel
+ * \p ch in \p values.
+ */
+static bool
+is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch)
+{
+   const src_reg *origin = values[ch];
+
+   /* consider GRF only */
+   assert(inst->dst.file == VGRF);
+   if (!origin || origin->file != VGRF)
+      return false;
+
+   if (!regions_overlap(*origin, REG_SIZE, inst->dst, inst->size_written))
+      return false;
+
+   /* Overlapping at a different offset always counts as an update. */
+   if (inst->dst.offset != origin->offset)
+      return true;
+
+   /* At the same offset it depends on whether the component this channel
+    * is read from is covered by the destination writemask.
+    */
+   const unsigned component_bit = 1 << BRW_GET_SWZ(origin->swizzle, ch);
+   return (inst->dst.writemask & component_bit) != 0;
+}
+
+/**
+ * Tell whether \p opcode is one of the bitwise logic operations AND, OR,
+ * XOR or NOT.
+ */
+static bool
+is_logic_op(enum opcode opcode)
+{
+   switch (opcode) {
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_OR:
+   case BRW_OPCODE_XOR:
+   case BRW_OPCODE_NOT:
+      return true;
+
+   default:
+      return false;
+   }
+}
+
+/**
+ * Get the origin of a copy as a single register if all components present in
+ * the given readmask originate from the same register and have compatible
+ * regions, otherwise return a BAD_FILE register.
+ */
+static src_reg
+get_copy_value(const copy_entry &entry, unsigned readmask)
+{
+   unsigned origin_chan[4] = {};
+   src_reg origin;
+
+   for (unsigned c = 0; c < 4; c++) {
+      /* Components outside the readmask don't matter. */
+      if (!(readmask & (1 << c)))
+         continue;
+
+      /* A component that is read but has no recorded value: give up. */
+      if (!entry.value[c])
+         return src_reg();
+
+      src_reg candidate = *entry.value[c];
+
+      if (candidate.file != IMM) {
+         origin_chan[c] = BRW_GET_SWZ(candidate.swizzle, c);
+         /* Neutralize the swizzle so that the src_reg::equals call below
+          * ignores it; the real swizzle is assembled once the swizzles
+          * of all components are known.
+          */
+         candidate.swizzle = BRW_SWIZZLE_XYZW;
+      } else {
+         origin_chan[c] = c;
+      }
+
+      /* The first component seen sets the origin, all the others have to
+       * agree with it.
+       */
+      if (origin.file == BAD_FILE)
+         origin = candidate;
+      else if (!origin.equals(candidate))
+         return src_reg();
+   }
+
+   const unsigned per_component =
+      BRW_SWIZZLE4(origin_chan[0], origin_chan[1],
+                   origin_chan[2], origin_chan[3]);
+
+   return swizzle(origin,
+                  brw_compose_swizzle(brw_swizzle_for_mask(readmask),
+                                      per_component));
+}
+
+/**
+ * Try to replace source \p arg of \p inst with the immediate recorded in
+ * \p entry, commuting the operands of the instruction where that is what
+ * it takes to fit the immediate in.
+ *
+ * \return true if \p inst was modified.
+ */
+static bool
+try_constant_propagate(const struct gen_device_info *devinfo,
+                       vec4_instruction *inst,
+                       int arg, const copy_entry *entry)
+{
+   /* Only the same constant across all 4 channels is handled.  Some day
+    * the 8-bit float vector format should be handled too, which would
+    * allow better constant propagation of vectors.  This could also be
+    * more aggressive -- some channels might not get used based on the
+    * destination writemask.
+    */
+   const unsigned readmask =
+      brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle, WRITEMASK_XYZW);
+   src_reg value = get_copy_value(*entry, readmask);
+
+   if (value.file != IMM)
+      return false;
+
+   /* 64-bit types can't be used except for one-source instructions, which
+    * higher levels should have constant folded away, so there's no point
+    * in propagating immediates here.
+    */
+   const bool wide_value = type_sz(value.type) == 8;
+   const bool wide_source = type_sz(inst->src[arg].type) == 8;
+   if (wide_value || wide_source)
+      return false;
+
+   if (value.type != BRW_REGISTER_TYPE_VF) {
+      value.type = inst->src[arg].type;
+   } else if (inst->src[arg].type != BRW_REGISTER_TYPE_F) {
+      /* The result of bit-casting the component values of a vector float
+       * cannot in general be represented as an immediate.
+       */
+      return false;
+   }
+
+   /* Source modifiers of logic operations are never folded on gen8+. */
+   const bool keep_modifiers =
+      devinfo->gen >= 8 && is_logic_op(inst->opcode);
+
+   if (inst->src[arg].abs &&
+       (keep_modifiers ||
+        !brw_abs_immediate(value.type, &value.as_brw_reg())))
+      return false;
+
+   if (inst->src[arg].negate &&
+       (keep_modifiers ||
+        !brw_negate_immediate(value.type, &value.as_brw_reg())))
+      return false;
+
+   value = swizzle(value, inst->src[arg].swizzle);
+
+   switch (inst->opcode) {
+   case BRW_OPCODE_MOV:
+   case SHADER_OPCODE_BROADCAST:
+   case GS_OPCODE_SET_WRITE_OFFSET:
+      /* These accept the immediate in whichever source it shows up.
+       * GS_OPCODE_SET_WRITE_OFFSET is just a multiply by a constant with
+       * special strides: the generator handles immediates in both
+       * arguments (generating a single MOV of the product).
+       */
+      inst->src[arg] = value;
+      return true;
+
+   case SHADER_OPCODE_POW:
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+      if (devinfo->gen < 8)
+         return false;
+      /* fallthrough */
+   case BRW_OPCODE_DP2:
+   case BRW_OPCODE_DP3:
+   case BRW_OPCODE_DP4:
+   case BRW_OPCODE_DPH:
+   case BRW_OPCODE_BFI1:
+   case BRW_OPCODE_ASR:
+   case BRW_OPCODE_SHL:
+   case BRW_OPCODE_SHR:
+   case BRW_OPCODE_SUBB:
+      /* Immediate allowed as the second source only. */
+      if (arg != 1)
+         return false;
+
+      inst->src[arg] = value;
+      return true;
+
+   case BRW_OPCODE_MACH:
+   case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_OR:
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_XOR:
+   case BRW_OPCODE_ADDC:
+      if (arg == 1) {
+         inst->src[arg] = value;
+         return true;
+      }
+
+      if (arg != 0 || inst->src[1].file == IMM)
+         return false;
+
+      /* Fit this constant in by commuting the operands.  Exception: we
+       * can't do this for 32-bit integer MUL/MACH because it's asymmetric.
+       */
+      if ((inst->opcode == BRW_OPCODE_MUL ||
+           inst->opcode == BRW_OPCODE_MACH) &&
+          (inst->src[1].type == BRW_REGISTER_TYPE_D ||
+           inst->src[1].type == BRW_REGISTER_TYPE_UD))
+         return false;
+
+      inst->src[0] = inst->src[1];
+      inst->src[1] = value;
+      return true;
+
+   case BRW_OPCODE_CMP:
+      if (arg == 1) {
+         inst->src[arg] = value;
+         return true;
+      }
+
+      if (arg == 0 && inst->src[1].file != IMM) {
+         const enum brw_conditional_mod swapped =
+            brw_swap_cmod(inst->conditional_mod);
+
+         if (swapped != BRW_CONDITIONAL_NONE) {
+            /* Fit this constant in by swapping the operands and
+             * flipping the test.
+             */
+            inst->src[0] = inst->src[1];
+            inst->src[1] = value;
+            inst->conditional_mod = swapped;
+            return true;
+         }
+      }
+      return false;
+
+   case BRW_OPCODE_SEL:
+      if (arg == 1) {
+         inst->src[arg] = value;
+         return true;
+      }
+
+      if (arg != 0 || inst->src[1].file == IMM)
+         return false;
+
+      inst->src[0] = inst->src[1];
+      inst->src[1] = value;
+
+      /* If this was predicated, flipping operands means
+       * we also need to flip the predicate.
+       */
+      if (inst->conditional_mod == BRW_CONDITIONAL_NONE)
+         inst->predicate_inverse = !inst->predicate_inverse;
+
+      return true;
+
+   default:
+      break;
+   }
+
+   return false;
+}
+
+/**
+ * Tell whether \p opcode is one of the double conversion or 32-bit
+ * pick/set opcodes listed below.
+ */
+static bool
+is_align1_opcode(unsigned opcode)
+{
+   return opcode == VEC4_OPCODE_FROM_DOUBLE ||
+          opcode == VEC4_OPCODE_TO_DOUBLE ||
+          opcode == VEC4_OPCODE_PICK_LOW_32BIT ||
+          opcode == VEC4_OPCODE_PICK_HIGH_32BIT ||
+          opcode == VEC4_OPCODE_SET_LOW_32BIT ||
+          opcode == VEC4_OPCODE_SET_HIGH_32BIT;
+}
+
+static bool
+try_copy_propagate(const struct gen_device_info *devinfo,
+ vec4_instruction *inst, int arg,
+ const copy_entry *entry, int attributes_per_reg)
+{
+ /* Build up the value we are propagating as if it were the source of a
+ * single MOV
+ */
+ src_reg value =
+ get_copy_value(*entry,
+ brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle,
+ WRITEMASK_XYZW));
+
+ /* Check that we can propagate that value */
+ if (value.file != UNIFORM &&
+ value.file != VGRF &&
+ value.file != ATTR)
+ return false;
+
+ /* In gen < 8 instructions that write 2 registers also need to read 2
+ * registers. Make sure we don't break that restriction by copy
+ * propagating from a uniform.
+ */
+ if (devinfo->gen < 8 && inst->size_written > REG_SIZE && is_uniform(value))
+ return false;
+
+ /* There is a regioning restriction such that if execsize == width
+ * and hstride != 0 then the vstride can't be 0. When we split instructions
+ * that take a single-precision source (like F->DF conversions) we end up
+ * with a 4-wide source on an instruction with an execution size of 4.
+ * If we then copy-propagate the source from a uniform we also end up with a
+ * vstride of 0 and we violate the restriction.
+ */
+ if (inst->exec_size == 4 && value.file == UNIFORM &&
+ type_sz(value.type) == 4)
+ return false;
+
+ /* If the type of the copy value is different from the type of the
+ * instruction then the swizzles and writemasks involved don't have the same
+ * meaning and simply replacing the source would produce different semantics.
+ */
+ if (type_sz(value.type) != type_sz(inst->src[arg].type))
+ return false;
+
+ if (devinfo->gen >= 8 && (value.negate || value.abs) &&
+ is_logic_op(inst->opcode)) {
+ return false;
+ }
+
+ if (inst->src[arg].offset % REG_SIZE || value.offset % REG_SIZE)
+ return false;
+
+ bool has_source_modifiers = value.negate || value.abs;
+
+ /* gen6 math and gen7+ SENDs from GRFs ignore source modifiers on
+ * instructions.
+ */
+ if ((has_source_modifiers || value.file == UNIFORM ||
+ value.swizzle != BRW_SWIZZLE_XYZW) && !inst->can_do_source_mods(devinfo))
+ return false;
+
+ if (has_source_modifiers &&
+ value.type != inst->src[arg].type &&
+ !inst->can_change_types())
+ return false;
+
+ if (has_source_modifiers &&
+ inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE)
+ return false;
+
+ unsigned composed_swizzle = brw_compose_swizzle(inst->src[arg].swizzle,
+ value.swizzle);
+
+ /* Instructions that operate on vectors in ALIGN1 mode will ignore swizzles
+ * so copy-propagation won't be safe if the composed swizzle is anything
+ * other than the identity.
+ */
+ if (is_align1_opcode(inst->opcode) && composed_swizzle != BRW_SWIZZLE_XYZW)
+ return false;
+
+ if (inst->is_3src(devinfo) &&
+ (value.file == UNIFORM ||
+ (value.file == ATTR && attributes_per_reg != 1)) &&
+ !brw_is_single_value_swizzle(composed_swizzle))
+ return false;
+
+ if (inst->is_send_from_grf())
+ return false;
+
+ /* we can't generally copy-propagate UD negations because we
+ * end up accessing the resulting values as signed integers
+ * instead. See also resolve_ud_negate().
+ */
+ if (value.negate &&
+ value.type == BRW_REGISTER_TYPE_UD)
+ return false;
+
+ /* Don't report progress if this is a noop. */
+ if (value.equals(inst->src[arg]))
+ return false;
+
+ const unsigned dst_saturate_mask = inst->dst.writemask &
+ brw_apply_swizzle_to_mask(inst->src[arg].swizzle, entry->saturatemask);
+
+ if (dst_saturate_mask) {
+ /* We either saturate all or nothing. */
+ if (dst_saturate_mask != inst->dst.writemask)
+ return false;
+
+ /* Limit saturate propagation only to SEL with src1 bounded within 0.0
+ * and 1.0, otherwise skip copy propagate altogether.
+ */
+ switch(inst->opcode) {
+ case BRW_OPCODE_SEL:
+ if (arg != 0 ||
+ inst->src[0].type != BRW_REGISTER_TYPE_F ||
+ inst->src[1].file != IMM ||
+ inst->src[1].type != BRW_REGISTER_TYPE_F ||
+ inst->src[1].f < 0.0 ||
+ inst->src[1].f > 1.0) {
+ return false;
+ }
+ if (!inst->saturate)
+ inst->saturate = true;
+ break;
+ default:
+ return false;
+ }
+ }
+
+ /* Build the final value */
+ if (inst->src[arg].abs) {
+ value.negate = false;
+ value.abs = true;
+ }
+ if (inst->src[arg].negate)
+ value.negate = !value.negate;
+
+ value.swizzle = composed_swizzle;
+ if (has_source_modifiers &&
+ value.type != inst->src[arg].type) {
+ assert(inst->can_change_types());
+ for (int i = 0; i < 3; i++) {
+ inst->src[i].type = value.type;
+ }
+ inst->dst.type = value.type;
+ } else {
+ value.type = inst->src[arg].type;
+ }
+
+ inst->src[arg] = value;
+ return true;
+}
+
+bool
+vec4_visitor::opt_copy_propagation(bool do_constant_prop)
+{
+ /* If we are in dual instanced or single mode, then attributes are going
+ * to be interleaved, so one register contains two attribute slots.
+ */
+ const int attributes_per_reg =
+ prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
+ bool progress = false;
+ struct copy_entry entries[alloc.total_size];
+
+ memset(&entries, 0, sizeof(entries));
+
+ foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+ /* This pass only works on basic blocks. If there's flow
+ * control, throw out all our information and start from
+ * scratch.
+ *
+ * This should really be fixed by using a structure like in
+ * src/glsl/opt_copy_propagation.cpp to track available copies.
+ */
+ if (!is_dominated_by_previous_instruction(inst)) {
+ memset(&entries, 0, sizeof(entries));
+ continue;
+ }
+
+ /* For each source arg, see if each component comes from a copy
+ * from the same type file (IMM, VGRF, UNIFORM), and try
+ * optimizing out access to the copy result
+ */
+ for (int i = 2; i >= 0; i--) {
+ /* Copied values end up in GRFs, and we don't track reladdr
+ * accesses.
+ */
+ if (inst->src[i].file != VGRF ||
+ inst->src[i].reladdr)
+ continue;
+
+ /* We only handle register-aligned single GRF copies. */
+ if (inst->size_read(i) != REG_SIZE ||
+ inst->src[i].offset % REG_SIZE)
+ continue;
+
+ const unsigned reg = (alloc.offsets[inst->src[i].nr] +
+ inst->src[i].offset / REG_SIZE);
+ const copy_entry &entry = entries[reg];
+
+ if (do_constant_prop && try_constant_propagate(devinfo, inst, i, &entry))
+ progress = true;
+ else if (try_copy_propagate(devinfo, inst, i, &entry, attributes_per_reg))
+ progress = true;
+ }
+
+ /* Track available source registers. */
+ if (inst->dst.file == VGRF) {
+ const int reg =
+ alloc.offsets[inst->dst.nr] + inst->dst.offset / REG_SIZE;
+
+ /* Update our destination's current channel values. For a direct copy,
+ * the value is the newly propagated source. Otherwise, we don't know
+ * the new value, so clear it.
+ */
+ bool direct_copy = is_direct_copy(inst);
+ entries[reg].saturatemask &= ~inst->dst.writemask;
+ for (int i = 0; i < 4; i++) {
+ if (inst->dst.writemask & (1 << i)) {
+ entries[reg].value[i] = direct_copy ? &inst->src[0] : NULL;
+ entries[reg].saturatemask |=
+ inst->saturate && direct_copy ? 1 << i : 0;
+ }
+ }
+
+ /* Clear the records for any registers whose current value came from
+ * our destination's updated channels, as the two are no longer equal.
+ */
+ if (inst->dst.reladdr)
+ memset(&entries, 0, sizeof(entries));
+ else {
+ for (unsigned i = 0; i < alloc.total_size; i++) {
+ for (int j = 0; j < 4; j++) {
+ if (is_channel_updated(inst, entries[i].value, j)) {
+ entries[i].value[j] = NULL;
+ entries[i].saturatemask &= ~(1 << j);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_cse.cpp b/src/intel/compiler/brw_vec4_cse.cpp
new file mode 100644
index 00000000000..2e65ef78548
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_cse.cpp
@@ -0,0 +1,296 @@
+/*
+ * Copyright © 2012, 2013, 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+#include "brw_vec4_live_variables.h"
+#include "brw_cfg.h"
+
+using namespace brw;
+
+/** @file brw_vec4_cse.cpp
+ *
+ * Support for local common subexpression elimination.
+ *
+ * See Muchnick's Advanced Compiler Design and Implementation, section
+ * 13.1 (p378).
+ */
+
+namespace {
+struct aeb_entry : public exec_node {
+ /** The instruction that generates the expression value. */
+ vec4_instruction *generator;
+
+ /** The temporary where the value is stored. */
+ src_reg tmp;
+};
+}
+
+static bool
+is_expression(const vec4_instruction *const inst)
+{
+ switch (inst->opcode) {
+ case BRW_OPCODE_MOV:
+ case BRW_OPCODE_SEL:
+ case BRW_OPCODE_NOT:
+ case BRW_OPCODE_AND:
+ case BRW_OPCODE_OR:
+ case BRW_OPCODE_XOR:
+ case BRW_OPCODE_SHR:
+ case BRW_OPCODE_SHL:
+ case BRW_OPCODE_ASR:
+ case BRW_OPCODE_CMP:
+ case BRW_OPCODE_CMPN:
+ case BRW_OPCODE_ADD:
+ case BRW_OPCODE_MUL:
+ case SHADER_OPCODE_MULH:
+ case BRW_OPCODE_FRC:
+ case BRW_OPCODE_RNDU:
+ case BRW_OPCODE_RNDD:
+ case BRW_OPCODE_RNDE:
+ case BRW_OPCODE_RNDZ:
+ case BRW_OPCODE_LINE:
+ case BRW_OPCODE_PLN:
+ case BRW_OPCODE_MAD:
+ case BRW_OPCODE_LRP:
+ case VEC4_OPCODE_UNPACK_UNIFORM:
+ case SHADER_OPCODE_FIND_LIVE_CHANNEL:
+ case SHADER_OPCODE_BROADCAST:
+ case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+ case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+ return true;
+ case SHADER_OPCODE_RCP:
+ case SHADER_OPCODE_RSQ:
+ case SHADER_OPCODE_SQRT:
+ case SHADER_OPCODE_EXP2:
+ case SHADER_OPCODE_LOG2:
+ case SHADER_OPCODE_POW:
+ case SHADER_OPCODE_INT_QUOTIENT:
+ case SHADER_OPCODE_INT_REMAINDER:
+ case SHADER_OPCODE_SIN:
+ case SHADER_OPCODE_COS:
+ return inst->mlen == 0;
+ default:
+ return false;
+ }
+}
+
+static bool
+operands_match(const vec4_instruction *a, const vec4_instruction *b)
+{
+ const src_reg *xs = a->src;
+ const src_reg *ys = b->src;
+
+ if (a->opcode == BRW_OPCODE_MAD) {
+ return xs[0].equals(ys[0]) &&
+ ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) ||
+ (xs[2].equals(ys[1]) && xs[1].equals(ys[2])));
+ } else if (!a->is_commutative()) {
+ return xs[0].equals(ys[0]) && xs[1].equals(ys[1]) && xs[2].equals(ys[2]);
+ } else {
+ return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
+ (xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
+ }
+}
+
+static bool
+instructions_match(vec4_instruction *a, vec4_instruction *b)
+{
+ return a->opcode == b->opcode &&
+ a->saturate == b->saturate &&
+ a->predicate == b->predicate &&
+ a->predicate_inverse == b->predicate_inverse &&
+ a->conditional_mod == b->conditional_mod &&
+ a->flag_subreg == b->flag_subreg &&
+ a->dst.type == b->dst.type &&
+ a->offset == b->offset &&
+ a->mlen == b->mlen &&
+ a->base_mrf == b->base_mrf &&
+ a->header_size == b->header_size &&
+ a->shadow_compare == b->shadow_compare &&
+ a->dst.writemask == b->dst.writemask &&
+ a->force_writemask_all == b->force_writemask_all &&
+ a->size_written == b->size_written &&
+ a->exec_size == b->exec_size &&
+ a->group == b->group &&
+ operands_match(a, b);
+}
+
+bool
+vec4_visitor::opt_cse_local(bblock_t *block)
+{
+ bool progress = false;
+ exec_list aeb;
+
+ void *cse_ctx = ralloc_context(NULL);
+
+ int ip = block->start_ip;
+ foreach_inst_in_block (vec4_instruction, inst, block) {
+ /* Skip some cases. */
+ if (is_expression(inst) && !inst->predicate && inst->mlen == 0 &&
+ ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
+ inst->dst.is_null()))
+ {
+ bool found = false;
+
+ foreach_in_list_use_after(aeb_entry, entry, &aeb) {
+ /* Match current instruction's expression against those in AEB. */
+ if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) &&
+ instructions_match(inst, entry->generator)) {
+ found = true;
+ progress = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ if (inst->opcode != BRW_OPCODE_MOV ||
+ (inst->opcode == BRW_OPCODE_MOV &&
+ inst->src[0].file == IMM &&
+ inst->src[0].type == BRW_REGISTER_TYPE_VF)) {
+ /* Our first sighting of this expression. Create an entry. */
+ aeb_entry *entry = ralloc(cse_ctx, aeb_entry);
+ entry->tmp = src_reg(); /* file will be BAD_FILE */
+ entry->generator = inst;
+ aeb.push_tail(entry);
+ }
+ } else {
+ /* This is at least our second sighting of this expression.
+ * If we don't have a temporary already, make one.
+ */
+ bool no_existing_temp = entry->tmp.file == BAD_FILE;
+ if (no_existing_temp && !entry->generator->dst.is_null()) {
+ entry->tmp = retype(src_reg(VGRF, alloc.allocate(
+ regs_written(entry->generator)),
+ NULL), inst->dst.type);
+
+ const unsigned width = entry->generator->exec_size;
+ unsigned component_size = width * type_sz(entry->tmp.type);
+ unsigned num_copy_movs =
+ DIV_ROUND_UP(entry->generator->size_written, component_size);
+ for (unsigned i = 0; i < num_copy_movs; ++i) {
+ vec4_instruction *copy =
+ MOV(offset(entry->generator->dst, width, i),
+ offset(entry->tmp, width, i));
+ copy->exec_size = width;
+ copy->group = entry->generator->group;
+ copy->force_writemask_all =
+ entry->generator->force_writemask_all;
+ entry->generator->insert_after(block, copy);
+ }
+
+ entry->generator->dst = dst_reg(entry->tmp);
+ }
+
+ /* dest <- temp */
+ if (!inst->dst.is_null()) {
+ assert(inst->dst.type == entry->tmp.type);
+ const unsigned width = inst->exec_size;
+ unsigned component_size = width * type_sz(inst->dst.type);
+ unsigned num_copy_movs =
+ DIV_ROUND_UP(inst->size_written, component_size);
+ for (unsigned i = 0; i < num_copy_movs; ++i) {
+ vec4_instruction *copy =
+ MOV(offset(inst->dst, width, i),
+ offset(entry->tmp, width, i));
+ copy->exec_size = inst->exec_size;
+ copy->group = inst->group;
+ copy->force_writemask_all = inst->force_writemask_all;
+ inst->insert_before(block, copy);
+ }
+ }
+
+ /* Set our iterator so that next time through the loop inst->next
+ * will get the instruction in the basic block after the one we've
+ * removed.
+ */
+ vec4_instruction *prev = (vec4_instruction *)inst->prev;
+
+ inst->remove(block);
+ inst = prev;
+ }
+ }
+
+ foreach_in_list_safe(aeb_entry, entry, &aeb) {
+ /* Kill all AEB entries that write a different value to or read from
+ * the flag register if we just wrote it.
+ */
+ if (inst->writes_flag()) {
+ if (entry->generator->reads_flag() ||
+ (entry->generator->writes_flag() &&
+ !instructions_match(inst, entry->generator))) {
+ entry->remove();
+ ralloc_free(entry);
+ continue;
+ }
+ }
+
+ for (int i = 0; i < 3; i++) {
+ src_reg *src = &entry->generator->src[i];
+
+ /* Kill all AEB entries that use the destination we just
+ * overwrote.
+ */
+ if (inst->dst.file == entry->generator->src[i].file &&
+ inst->dst.nr == entry->generator->src[i].nr) {
+ entry->remove();
+ ralloc_free(entry);
+ break;
+ }
+
+ /* Kill any AEB entries using registers that don't get reused any
+ * more -- a sure sign they'll fail operands_match().
+ */
+ if (src->file == VGRF) {
+ if (var_range_end(var_from_reg(alloc, dst_reg(*src)), 8) < ip) {
+ entry->remove();
+ ralloc_free(entry);
+ break;
+ }
+ }
+ }
+ }
+
+ ip++;
+ }
+
+ ralloc_free(cse_ctx);
+
+ return progress;
+}
+
+bool
+vec4_visitor::opt_cse()
+{
+ bool progress = false;
+
+ calculate_live_intervals();
+
+ foreach_block (block, cfg) {
+ progress = opt_cse_local(block) || progress;
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
diff --git a/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp b/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp
new file mode 100644
index 00000000000..5b22a096dd1
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+#include "brw_vec4_live_variables.h"
+#include "brw_cfg.h"
+
+/** @file brw_vec4_dead_code_eliminate.cpp
+ *
+ * Dataflow-aware dead code elimination.
+ *
+ * Walks the instruction list from the bottom, removing instructions that
+ * have results that both aren't used in later blocks and haven't been read
+ * yet in the tail end of this block.
+ */
+
+using namespace brw;
+
+bool
+vec4_visitor::dead_code_eliminate()
+{
+ bool progress = false;
+
+ calculate_live_intervals();
+
+ int num_vars = live_intervals->num_vars;
+ BITSET_WORD *live = rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(num_vars));
+ BITSET_WORD *flag_live = rzalloc_array(NULL, BITSET_WORD, 1);
+
+ foreach_block_reverse_safe(block, cfg) {
+ memcpy(live, live_intervals->block_data[block->num].liveout,
+ sizeof(BITSET_WORD) * BITSET_WORDS(num_vars));
+ memcpy(flag_live, live_intervals->block_data[block->num].flag_liveout,
+ sizeof(BITSET_WORD));
+
+ foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) {
+ if ((inst->dst.file == VGRF && !inst->has_side_effects()) ||
+ (inst->dst.is_null() && inst->writes_flag())){
+ bool result_live[4] = { false };
+ if (inst->dst.file == VGRF) {
+ for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) {
+ for (int c = 0; c < 4; c++) {
+ const unsigned v = var_from_reg(alloc, inst->dst, c, i);
+ result_live[c] |= BITSET_TEST(live, v);
+ }
+ }
+ } else {
+ for (unsigned c = 0; c < 4; c++)
+ result_live[c] = BITSET_TEST(flag_live, c);
+ }
+
+ /* If the instruction can't do writemasking, then it's all or
+ * nothing.
+ */
+ if (!inst->can_do_writemask(devinfo)) {
+ bool result = result_live[0] | result_live[1] |
+ result_live[2] | result_live[3];
+ result_live[0] = result;
+ result_live[1] = result;
+ result_live[2] = result;
+ result_live[3] = result;
+ }
+
+ for (int c = 0; c < 4; c++) {
+ if (!result_live[c] && inst->dst.writemask & (1 << c)) {
+ inst->dst.writemask &= ~(1 << c);
+ progress = true;
+
+ if (inst->dst.writemask == 0) {
+ if (inst->writes_accumulator || inst->writes_flag()) {
+ inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type));
+ } else {
+ inst->opcode = BRW_OPCODE_NOP;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ if (inst->dst.is_null() && inst->writes_flag()) {
+ bool combined_live = false;
+ for (unsigned c = 0; c < 4; c++)
+ combined_live |= BITSET_TEST(flag_live, c);
+
+ if (!combined_live) {
+ inst->opcode = BRW_OPCODE_NOP;
+ progress = true;
+ }
+ }
+
+ if (inst->dst.file == VGRF && !inst->predicate &&
+ !inst->is_align1_partial_write()) {
+ for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) {
+ for (int c = 0; c < 4; c++) {
+ if (inst->dst.writemask & (1 << c)) {
+ const unsigned v = var_from_reg(alloc, inst->dst, c, i);
+ BITSET_CLEAR(live, v);
+ }
+ }
+ }
+ }
+
+ if (inst->writes_flag() && !inst->predicate) {
+ for (unsigned c = 0; c < 4; c++)
+ BITSET_CLEAR(flag_live, c);
+ }
+
+ if (inst->opcode == BRW_OPCODE_NOP) {
+ inst->remove(block);
+ continue;
+ }
+
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].file == VGRF) {
+ for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_read(i), 16); j++) {
+ for (int c = 0; c < 4; c++) {
+ const unsigned v = var_from_reg(alloc, inst->src[i], c, j);
+ BITSET_SET(live, v);
+ }
+ }
+ }
+ }
+
+ for (unsigned c = 0; c < 4; c++) {
+ if (inst->reads_flag(c)) {
+ BITSET_SET(flag_live, c);
+ }
+ }
+ }
+ }
+
+ ralloc_free(live);
+ ralloc_free(flag_live);
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
new file mode 100644
index 00000000000..2ac287f17fa
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -0,0 +1,2217 @@
+/* Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+#include "brw_cfg.h"
+#include "brw_eu.h"
+#include "common/gen_debug.h"
+
+using namespace brw;
+
+static void
+generate_math1_gen4(struct brw_codegen *p,
+ vec4_instruction *inst,
+ struct brw_reg dst,
+ struct brw_reg src)
+{
+ gen4_math(p,
+ dst,
+ brw_math_function(inst->opcode),
+ inst->base_mrf,
+ src,
+ BRW_MATH_PRECISION_FULL);
+}
+
+static void
+check_gen6_math_src_arg(struct brw_reg src)
+{
+ /* Source modifiers and swizzles are ignored, so none may be set. */
+ assert(!src.abs);
+ assert(!src.negate);
+ assert(src.swizzle == BRW_SWIZZLE_XYZW);
+}
+
+static void
+generate_math_gen6(struct brw_codegen *p,
+ vec4_instruction *inst,
+ struct brw_reg dst,
+ struct brw_reg src0,
+ struct brw_reg src1)
+{
+ /* Can't do writemask because math can't be align16. */
+ assert(dst.writemask == WRITEMASK_XYZW);
+ /* Source swizzles are ignored. */
+ check_gen6_math_src_arg(src0);
+ if (src1.file == BRW_GENERAL_REGISTER_FILE)
+ check_gen6_math_src_arg(src1);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ gen6_math(p, dst, brw_math_function(inst->opcode), src0, src1);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+}
+
+static void
+generate_math2_gen4(struct brw_codegen *p,
+ vec4_instruction *inst,
+ struct brw_reg dst,
+ struct brw_reg src0,
+ struct brw_reg src1)
+{
+ /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
+ * "Message Payload":
+ *
+ * "Operand0[7]. For the INT DIV functions, this operand is the
+ * denominator."
+ * ...
+ * "Operand1[7]. For the INT DIV functions, this operand is the
+ * numerator."
+ */
+ bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
+ struct brw_reg &op0 = is_int_div ? src1 : src0;
+ struct brw_reg &op1 = is_int_div ? src0 : src1;
+
+ brw_push_insn_state(p);
+ brw_set_default_saturate(p, false);
+ brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1);
+ brw_pop_insn_state(p);
+
+ gen4_math(p,
+ dst,
+ brw_math_function(inst->opcode),
+ inst->base_mrf,
+ op0,
+ BRW_MATH_PRECISION_FULL);
+}
+
+static void
+generate_tex(struct brw_codegen *p,
+ struct brw_vue_prog_data *prog_data,
+ gl_shader_stage stage,
+ vec4_instruction *inst,
+ struct brw_reg dst,
+ struct brw_reg src,
+ struct brw_reg surface_index,
+ struct brw_reg sampler_index)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ int msg_type = -1;
+
+ if (devinfo->gen >= 5) {
+ switch (inst->opcode) {
+ case SHADER_OPCODE_TEX:
+ case SHADER_OPCODE_TXL:
+ if (inst->shadow_compare) {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
+ } else {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
+ }
+ break;
+ case SHADER_OPCODE_TXD:
+ if (inst->shadow_compare) {
+ /* Gen7.5+. Otherwise, lowered by brw_lower_texture_gradients(). */
+ assert(devinfo->gen >= 8 || devinfo->is_haswell);
+ msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
+ } else {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
+ }
+ break;
+ case SHADER_OPCODE_TXF:
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
+ break;
+ case SHADER_OPCODE_TXF_CMS_W:
+ assert(devinfo->gen >= 9);
+ msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
+ break;
+ case SHADER_OPCODE_TXF_CMS:
+ if (devinfo->gen >= 7)
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
+ else
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
+ break;
+ case SHADER_OPCODE_TXF_MCS:
+ assert(devinfo->gen >= 7);
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
+ break;
+ case SHADER_OPCODE_TXS:
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
+ break;
+ case SHADER_OPCODE_TG4:
+ if (inst->shadow_compare) {
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
+ } else {
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
+ }
+ break;
+ case SHADER_OPCODE_TG4_OFFSET:
+ if (inst->shadow_compare) {
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
+ } else {
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
+ }
+ break;
+ case SHADER_OPCODE_SAMPLEINFO:
+ msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
+ break;
+ default:
+ unreachable("should not get here: invalid vec4 texture opcode");
+ }
+ } else {
+ switch (inst->opcode) {
+ case SHADER_OPCODE_TEX:
+ case SHADER_OPCODE_TXL:
+ if (inst->shadow_compare) {
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
+ assert(inst->mlen == 3);
+ } else {
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
+ assert(inst->mlen == 2);
+ }
+ break;
+ case SHADER_OPCODE_TXD:
+ /* There is no sample_d_c message; comparisons are done manually. */
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
+ assert(inst->mlen == 4);
+ break;
+ case SHADER_OPCODE_TXF:
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
+ assert(inst->mlen == 2);
+ break;
+ case SHADER_OPCODE_TXS:
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
+ assert(inst->mlen == 2);
+ break;
+ default:
+ unreachable("should not get here: invalid vec4 texture opcode");
+ }
+ }
+
+ assert(msg_type != -1);
+
+ assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
+
+ /* Load the message header if present. If there's a texture offset, we need
+ * to set it up explicitly and load the offset bitfield. Otherwise, we can
+ * use an implied move from g0 to the first message register.
+ */
+ if (inst->header_size != 0) {
+ if (devinfo->gen < 6 && !inst->offset) {
+ /* Set up an implied move from g0 to the MRF. */
+ src = brw_vec8_grf(0, 0);
+ } else {
+ struct brw_reg header =
+ retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);
+ uint32_t dw2 = 0;
+
+ /* Explicitly set up the message header by copying g0 to the MRF. */
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ if (inst->offset)
+ /* Set the texel offset bits in DWord 2. */
+ dw2 = inst->offset;
+
+ if (devinfo->gen >= 9)
+ /* SKL+ overloads BRW_SAMPLER_SIMD_MODE_SIMD4X2 to also do SIMD8D,
+ * based on bit 22 in the header.
+ */
+ dw2 |= GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2;
+
+ /* The VS, DS, and FS stages have the g0.2 payload delivered as 0,
+ * so header0.2 is 0 when g0 is copied. The HS and GS stages do
+ * not, so we must set it to 0 to avoid setting undesirable bits
+ * in the message header.
+ */
+ if (dw2 ||
+ stage == MESA_SHADER_TESS_CTRL ||
+ stage == MESA_SHADER_GEOMETRY) {
+ brw_MOV(p, get_element_ud(header, 2), brw_imm_ud(dw2));
+ }
+
+ brw_adjust_sampler_state_pointer(p, header, sampler_index);
+ brw_pop_insn_state(p);
+ }
+ }
+
+ uint32_t return_format;
+
+ switch (dst.type) {
+ case BRW_REGISTER_TYPE_D:
+ return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
+ break;
+ case BRW_REGISTER_TYPE_UD:
+ return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
+ break;
+ default:
+ return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
+ break;
+ }
+
+ uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
+ inst->opcode == SHADER_OPCODE_TG4_OFFSET)
+ ? prog_data->base.binding_table.gather_texture_start
+ : prog_data->base.binding_table.texture_start;
+
+ if (surface_index.file == BRW_IMMEDIATE_VALUE &&
+ sampler_index.file == BRW_IMMEDIATE_VALUE) {
+ uint32_t surface = surface_index.ud;
+ uint32_t sampler = sampler_index.ud;
+
+ brw_SAMPLE(p,
+ dst,
+ inst->base_mrf,
+ src,
+ surface + base_binding_table_index,
+ sampler % 16,
+ msg_type,
+ 1, /* response length */
+ inst->mlen,
+ inst->header_size != 0,
+ BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+ return_format);
+ /* Mark the binding table entry that the SAMPLE message references. */
+ brw_mark_surface_used(&prog_data->base, surface + base_binding_table_index);
+ } else {
+ /* Non-constant surface and/or sampler index. */
+
+ struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+ struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
+ struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
+
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ if (brw_regs_equal(&surface_reg, &sampler_reg)) {
+ brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+ } else {
+ if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
+ brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
+ } else {
+ brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
+ brw_OR(p, addr, addr, surface_reg);
+ }
+ }
+ if (base_binding_table_index)
+ brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
+ brw_AND(p, addr, addr, brw_imm_ud(0xfff));
+
+ brw_pop_insn_state(p);
+
+ if (inst->base_mrf != -1)
+ gen6_resolve_implied_move(p, &src, inst->base_mrf);
+
+ /* dst = send(offset, a0.0 | <descriptor>) */
+ brw_inst *insn = brw_send_indirect_message(
+ p, BRW_SFID_SAMPLER, dst, src, addr);
+ brw_set_sampler_message(p, insn,
+ 0 /* surface */,
+ 0 /* sampler */,
+ msg_type,
+ 1 /* rlen */,
+ inst->mlen /* mlen */,
+ inst->header_size != 0 /* header */,
+ BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+ return_format);
+
+ /* visitor knows more than we do about the surface limit required,
+ * so has already done marking.
+ */
+ }
+}
+
+static void
+generate_vs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
+{
+ brw_urb_WRITE(p,
+ brw_null_reg(), /* dest */
+ inst->base_mrf, /* starting mrf reg nr */
+ brw_vec8_grf(0, 0), /* src */
+ inst->urb_write_flags,
+ inst->mlen,
+ 0, /* response len */
+ inst->offset, /* urb destination offset */
+ BRW_URB_SWIZZLE_INTERLEAVE);
+}
+
+static void
+generate_gs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
+{
+ struct brw_reg src = brw_message_reg(inst->base_mrf);
+ brw_urb_WRITE(p,
+ brw_null_reg(), /* dest */
+ inst->base_mrf, /* starting mrf reg nr */
+ src,
+ inst->urb_write_flags,
+ inst->mlen,
+ 0, /* response len */
+ inst->offset, /* urb destination offset */
+ BRW_URB_SWIZZLE_INTERLEAVE);
+}
+
+static void
+generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst)
+{
+ struct brw_reg src = brw_message_reg(inst->base_mrf);
+
+ /* We pass the temporary passed in src0 as the writeback register */
+ brw_urb_WRITE(p,
+ inst->src[0].as_brw_reg(), /* dest */
+ inst->base_mrf, /* starting mrf reg nr */
+ src,
+ BRW_URB_WRITE_ALLOCATE_COMPLETE,
+ inst->mlen,
+ 1, /* response len */
+ inst->offset, /* urb destination offset */
+ BRW_URB_SWIZZLE_INTERLEAVE);
+
+ /* Now put allocated urb handle in dst.0 */
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV(p, get_element_ud(inst->dst.as_brw_reg(), 0),
+ get_element_ud(inst->src[0].as_brw_reg(), 0));
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
+{
+ struct brw_reg src = brw_message_reg(inst->base_mrf);
+ brw_urb_WRITE(p,
+ brw_null_reg(), /* dest */
+ inst->base_mrf, /* starting mrf reg nr */
+ src,
+ BRW_URB_WRITE_EOT | inst->urb_write_flags,
+ inst->mlen,
+ 0, /* response len */
+ 0, /* urb destination offset */
+ BRW_URB_SWIZZLE_INTERLEAVE);
+}
+
+static void
+generate_gs_set_write_offset(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg src0,
+ struct brw_reg src1)
+{ /* Writes src0 * src1 into DWORDs 3 and 4 of dst (the per-slot URB offsets).
+ * src1 must be a UD immediate no larger than USHRT_MAX; src0 may be a
+ * register or an immediate.
+ */
+ /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
+ * Header: M0.3):
+ *
+ * Slot 0 Offset. This field, after adding to the Global Offset field
+ * in the message descriptor, specifies the offset (in 256-bit units)
+ * from the start of the URB entry, as referenced by URB Handle 0, at
+ * which the data will be accessed.
+ *
+ * Similar text describes DWORD M0.4, which is slot 1 offset.
+ *
+ * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components
+ * of the register for geometry shader invocations 0 and 1) by the
+ * immediate value in src1, and store the result in DWORDs 3 and 4 of dst.
+ *
+ * We can do this with the following EU instruction:
+ *
+ * mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW { Align1 WE_all }
+ */
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ assert(p->devinfo->gen >= 7 &&
+ src1.file == BRW_IMMEDIATE_VALUE &&
+ src1.type == BRW_REGISTER_TYPE_UD &&
+ src1.ud <= USHRT_MAX); /* src1 is retyped to UW for the MUL below */
+ if (src0.file == BRW_IMMEDIATE_VALUE) { /* both operands are constants: fold the product here */
+ brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3),
+ brw_imm_ud(src0.ud * src1.ud));
+ } else {
+ brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
+ retype(src1, BRW_REGISTER_TYPE_UW));
+ }
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_set_vertex_count(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg src)
+{ /* Stores the vertex count from src into the message being built in dst.
+ * On gen >= 8 the whole of src is moved to the message register after
+ * dst; on earlier gens two WORDs are packed into DWORD 2 of dst.
+ */
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+ if (p->devinfo->gen >= 8) {
+ /* Move the vertex count into the second MRF for the EOT write. */
+ brw_MOV(p, retype(brw_message_reg(dst.nr + 1), BRW_REGISTER_TYPE_UD),
+ src);
+ } else {
+ /* If we think of the src and dst registers as composed of 8 DWORDs each,
+ * we want to pick up the contents of DWORDs 0 and 4 from src, truncate
+ * them to WORDs, and then pack them into DWORD 2 of dst.
+ *
+ * It's easier to get the EU to do this if we think of the src and dst
+ * registers as composed of 16 WORDS each; then, we want to pick up the
+ * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5
+ * of dst.
+ *
+ * We can do that by the following EU instruction:
+ *
+ * mov (2) dst.4<1>:uw src<8;1,0>:uw { Align1, Q1, NoMask }
+ */
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_MOV(p,
+ suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4),
+ stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0));
+ }
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_svb_write(struct brw_codegen *p,
+ struct brw_vue_prog_data *prog_data,
+ vec4_instruction *inst,
+ struct brw_reg dst,
+ struct brw_reg src0,
+ struct brw_reg src1)
+{ /* Copies four DWORDs of src0 into dst and sends them with brw_svb_write to
+ * binding table entry BRW_GEN6_SOL_BINDING_START + inst->sol_binding.
+ * When inst->sol_final_write is set, a commit is requested into src1 and
+ * a dummy MOV on src1 waits for it. prog_data is not referenced here.
+ */
+ int binding = inst->sol_binding;
+ bool final_write = inst->sol_final_write;
+
+ brw_push_insn_state(p);
+ brw_set_default_exec_size(p, BRW_EXECUTE_4);
+ /* Copy Vertex data into M0.x */
+ brw_MOV(p, stride(dst, 4, 4, 1),
+ stride(retype(src0, BRW_REGISTER_TYPE_UD), 4, 4, 1));
+ brw_pop_insn_state(p);
+
+ brw_push_insn_state(p);
+ /* Send SVB Write */
+ brw_svb_write(p,
+ final_write ? src1 : brw_null_reg(), /* dest == src1 */
+ 1, /* msg_reg_nr */
+ dst, /* src0 == previous dst */
+ BRW_GEN6_SOL_BINDING_START + binding, /* binding_table_index */
+ final_write); /* send_commit_msg */
+
+ /* Finally, wait for the write commit to occur so that we can proceed to
+ * other things safely.
+ *
+ * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
+ *
+ * The write commit does not modify the destination register, but
+ * merely clears the dependency associated with the destination
+ * register. Thus, a simple “mov” instruction using the register as a
+ * source is sufficient to wait for the write commit to occur.
+ */
+ if (final_write) {
+ brw_MOV(p, src1, src1);
+ }
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_svb_set_destination_index(struct brw_codegen *p,
+ vec4_instruction *inst,
+ struct brw_reg dst,
+ struct brw_reg src)
+{ /* Copies DWORD number inst->sol_vertex of src into DWORD 5 of dst, in
+ * Align1 mode with execution masking disabled.
+ */
+ int vertex = inst->sol_vertex;
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex));
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_set_dword_2(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg src)
+{ /* Copies element 0 of src into element 2 of dst with a single-channel
+ * Align1 MOV that ignores the execution mask.
+ */
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0));
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_prepare_channel_masks(struct brw_codegen *p,
+ struct brw_reg dst)
+{ /* In-place update of dst: element 4 is shifted left by 4 bits, all other
+ * elements are left untouched.
+ */
+ /* We want to left shift just DWORD 4 (the x component belonging to the
+ * second geometry shader invocation) by 4 bits. So generate the
+ * instruction:
+ *
+ * shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all }
+ */
+ dst = suboffset(vec1(dst), 4);
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_SHL(p, dst, dst, brw_imm_ud(4));
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_set_channel_masks(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg src)
+{ /* ORs byte 0 and byte 16 of src together and stores the result in byte 21
+ * of dst, i.e. bits 15:8 of DWORD 5 (the channel mask field quoted below).
+ */
+ /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
+ * Header: M0.5):
+ *
+ * 15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
+ *
+ * When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1
+ * DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls
+ * Vertex 0 DATA[7]. This bit is ANDed with the corresponding
+ * channel enable to determine the final channel enable. For the
+ * URB_READ_OWORD & URB_READ_HWORD messages, when final channel
+ * enable is 1 it indicates that Vertex 1 DATA [3] will be included
+ * in the writeback message. For the URB_WRITE_OWORD &
+ * URB_WRITE_HWORD messages, when final channel enable is 1 it
+ * indicates that Vertex 1 DATA [3] will be written to the surface.
+ *
+ * 0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
+ * 1: Vertex DATA [3] / Vertex 0 DATA[7] channel included
+ *
+ * 14 Vertex 1 DATA [2] Channel Mask
+ * 13 Vertex 1 DATA [1] Channel Mask
+ * 12 Vertex 1 DATA [0] Channel Mask
+ * 11 Vertex 0 DATA [3] Channel Mask
+ * 10 Vertex 0 DATA [2] Channel Mask
+ * 9 Vertex 0 DATA [1] Channel Mask
+ * 8 Vertex 0 DATA [0] Channel Mask
+ *
+ * (This is from a section of the PRM that is agnostic to the particular
+ * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to
+ * geometry shader invocations 0 and 1, respectively). Since we have the
+ * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0,
+ * and the enable flags for geometry shader invocation 1 in bits 7:4 of
+ * DWORD 4, we just need to OR them together and store the result in bits
+ * 15:8 of DWORD 5.
+ *
+ * It's easier to get the EU to do this if we think of the src and dst
+ * registers as composed of 32 bytes each; then, we want to pick up the
+ * contents of bytes 0 and 16 from src, OR them together, and store them in
+ * byte 21.
+ *
+ * We can do that by the following EU instruction:
+ *
+ * or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all }
+ *
+ * Note: this relies on the source register having zeros in (a) bits 7:4 of
+ * DWORD 0 and (b) bits 3:0 of DWORD 4. We can rely on (b) because the
+ * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which
+ * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to
+ * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to
+ * contain valid channel mask values (which are in the range 0x0-0xf).
+ */
+ dst = retype(dst, BRW_REGISTER_TYPE_UB);
+ src = retype(src, BRW_REGISTER_TYPE_UB);
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16));
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_get_instance_id(struct brw_codegen *p,
+ struct brw_reg dst)
+{ /* Emits one Align1 SHR that reads g0 through a <1,4,0> region and shifts
+ * it right by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT into dst (retyped to UD).
+ * Execution masking is left at its current default.
+ */
+ /* We want to right shift R0.0 & R0.1 by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT
+ * and store into dst.0 & dst.4. So generate the instruction:
+ *
+ * shr(8) dst<1> R0<1,4,0> GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q }
+ */
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ dst = retype(dst, BRW_REGISTER_TYPE_UD);
+ struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+ brw_SHR(p, dst, stride(r0, 1, 4, 0),
+ brw_imm_ud(GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT));
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_ff_sync_set_primitives(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg src0,
+ struct brw_reg src1,
+ struct brw_reg src2)
+{ /* Packs two 16-bit values into element 0 of dst:
+ * dst.0 = ((src0.0 & 0xffff) << 16) | (src1.0 & 0xffff).
+ * Element 0 of src2 is used as scratch and is overwritten.
+ */
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ /* Save src0 data in 16:31 bits of dst.0 */
+ brw_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0),
+ brw_imm_ud(0xffffu));
+ brw_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), brw_imm_ud(16));
+ /* Save src1 data in 0:15 bits of dst.0 */
+ brw_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0),
+ brw_imm_ud(0xffffu));
+ brw_OR(p, suboffset(vec1(dst), 0),
+ suboffset(vec1(dst), 0),
+ suboffset(vec1(src2), 0));
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_ff_sync(struct brw_codegen *p,
+ vec4_instruction *inst,
+ struct brw_reg dst,
+ struct brw_reg src0,
+ struct brw_reg src1)
+{ /* Fills DWORDs 0 and 1 of the header in MRF inst->base_mrf from src1 and
+ * src0, issues brw_ff_sync with a one-register response in dst, and then
+ * copies dst.0 back into header.0. When src1 is not an immediate, four
+ * DWORDs of the response are also copied into src1's register.
+ */
+ /* This opcode uses an implied MRF register for:
+ * - the header of the ff_sync message. And as such it is expected to be
+ * initialized to r0 before calling here.
+ * - the destination where we will write the allocated URB handle.
+ */
+ struct brw_reg header =
+ retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);
+
+ /* Overwrite dword 0 of the header (SO vertices to write) and
+ * dword 1 (number of primitives written).
+ */
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0));
+ brw_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0));
+ brw_pop_insn_state(p);
+
+ /* Allocate URB handle in dst */
+ brw_ff_sync(p,
+ dst,
+ 0,
+ header,
+ 1, /* allocate */
+ 1, /* response length */
+ 0 /* eot */);
+
+ /* Now put allocated urb handle in header.0 */
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0));
+
+ /* src1 is not an immediate when we use transform feedback */
+ if (src1.file != BRW_IMMEDIATE_VALUE) {
+ brw_set_default_exec_size(p, BRW_EXECUTE_4);
+ brw_MOV(p, brw_vec4_grf(src1.nr, 0), brw_vec4_grf(dst.nr, 1)); /* only the .nr fields are used; assumes both are GRFs - TODO confirm */
+ }
+
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst)
+{ /* Copies DWORD 1 of g0 into DWORD 0 of dst, unmasked and in Align1 mode. */
+ /* In gen6, PrimitiveID is delivered in R0.1 of the payload */
+ struct brw_reg src = brw_vec8_grf(0, 0);
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1));
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst)
+{
+   /* The instance count field of r0.2 sits at bits 23:17, except on
+    * Ivy Bridge and Bay Trail where it is read from bits 22:16.
+    */
+   int field_mask;
+   int field_lsb;
+   if (p->devinfo->is_ivybridge || p->devinfo->is_baytrail) {
+      field_mask = INTEL_MASK(22, 16);
+      field_lsb = 16;
+   } else {
+      field_mask = INTEL_MASK(23, 17);
+      field_lsb = 17;
+   }
+
+   const struct brw_reg dst_ud = retype(dst, BRW_REGISTER_TYPE_UD);
+   const struct brw_reg payload(retype(brw_vec8_grf(0, 0),
+                                       BRW_REGISTER_TYPE_UD));
+   const struct brw_reg first_half = get_element_ud(dst_ud, 0);
+   const struct brw_reg second_half = get_element_ud(dst_ud, 4);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+   /* SIMD4x2 runs two instances per thread, so the thread's field value i
+    * becomes the pair (2i, 2i + 1).  Shifting right by one bit less than
+    * the field position performs the multiplication by two.
+    */
+   brw_AND(p, first_half, get_element_ud(payload, 2), brw_imm_ud(field_mask));
+   brw_SHR(p, first_half, first_half, brw_imm_ud(field_lsb - 1));
+   brw_ADD(p, second_half, first_half, brw_imm_ud(1));
+
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_tcs_urb_write(struct brw_codegen *p,
+ vec4_instruction *inst,
+ struct brw_reg urb_header)
+{ /* Builds a SEND to BRW_SFID_URB by hand: URB_WRITE_OWORD with urb_header as
+ * the payload, inst->mlen message registers, no response, and
+ * inst->offset as the global offset.
+ */
+ const struct gen_device_info *devinfo = p->devinfo;
+
+ brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+ brw_set_dest(p, send, brw_null_reg());
+ brw_set_src0(p, send, urb_header);
+
+ brw_set_message_descriptor(p, send, BRW_SFID_URB,
+ inst->mlen /* mlen */, 0 /* rlen */,
+ true /* header */, false /* eot */);
+ brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD);
+ brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
+ if (inst->urb_write_flags & BRW_URB_WRITE_EOT) { /* EOT writes skip per-slot offsets and interleaving */
+ brw_inst_set_eot(devinfo, send, 1);
+ } else {
+ brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
+ brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
+ }
+
+ /* what happens to swizzles? */
+}
+
+
+static void
+generate_tcs_input_urb_offsets(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg vertex,
+ struct brw_reg offset)
+{
+ /* Generates an URB read/write message header for HS/DS operation.
+ * Inputs are a vertex index, and a byte offset from the beginning of
+ * the vertex.
+ *
+ * dst (a GRF) is zeroed, m0.5 gets all eight channel enables, m0.0-0.1
+ * get the URB handles selected by `vertex`, and m0.3-0.4 get `offset`
+ * unless its file is ARF. */
+
+ /* If `vertex` is not an immediate, we clobber a0.0 */
+
+ assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE);
+ assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D);
+
+ assert(dst.file == BRW_GENERAL_REGISTER_FILE);
+
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV(p, dst, brw_imm_ud(0));
+
+ /* m0.5 bits 8-15 are channel enables */
+ brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));
+
+ /* m0.0-0.1: URB handles */
+ if (vertex.file == BRW_IMMEDIATE_VALUE) {
+ uint32_t vertex_index = vertex.ud;
+ struct brw_reg index_reg = brw_vec1_grf(
+ 1 + (vertex_index >> 3), vertex_index & 7); /* eight handles per register, starting at g1 */
+
+ brw_MOV(p, vec2(get_element_ud(dst, 0)),
+ retype(index_reg, BRW_REGISTER_TYPE_UD));
+ } else {
+ /* Use indirect addressing. ICP Handles are DWords (single channels
+ * of a register) and start at g1.0.
+ *
+ * In order to start our region at g1.0, we add 8 to the vertex index,
+ * effectively skipping over the 8 channels in g0.0. This gives us a
+ * DWord offset to the ICP Handle.
+ *
+ * Indirect addressing works in terms of bytes, so we then multiply
+ * the DWord offset by 4 (by shifting left by 2).
+ */
+ struct brw_reg addr = brw_address_reg(0);
+
+ /* bottom half: m0.0 = g[1.0 + vertex.0]UD */
+ brw_ADD(p, addr, retype(get_element_ud(vertex, 0), BRW_REGISTER_TYPE_UW),
+ brw_imm_uw(0x8));
+ brw_SHL(p, addr, addr, brw_imm_uw(2));
+ brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0));
+
+ /* top half: m0.1 = g[1.0 + vertex.4]UD */
+ brw_ADD(p, addr, retype(get_element_ud(vertex, 4), BRW_REGISTER_TYPE_UW),
+ brw_imm_uw(0x8));
+ brw_SHL(p, addr, addr, brw_imm_uw(2));
+ brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0));
+ }
+
+ /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
+ if (offset.file != ARF)
+ brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
+
+ brw_pop_insn_state(p);
+}
+
+
+static void
+generate_tcs_output_urb_offsets(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg write_mask,
+ struct brw_reg offset)
+{
+ /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry.
+ *
+ * dst (a GRF or MRF) is zeroed, the immediate write_mask is replicated
+ * into both channel-enable nibbles of m0.5, g0.0 is copied to m0.0-0.1,
+ * and `offset` goes to m0.3-0.4 unless its file is ARF. */
+ assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE);
+
+ assert(write_mask.file == BRW_IMMEDIATE_VALUE);
+ assert(write_mask.type == BRW_REGISTER_TYPE_UD);
+
+ brw_push_insn_state(p);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV(p, dst, brw_imm_ud(0));
+
+ unsigned mask = write_mask.ud;
+
+ /* m0.5 bits 15:12 and 11:8 are channel enables */
+ brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12)));
+
+ /* HS patch URB handle is delivered in r0.0 */
+ struct brw_reg urb_handle = brw_vec1_grf(0, 0);
+
+ /* m0.0-0.1: URB handles */
+ brw_MOV(p, vec2(get_element_ud(dst, 0)),
+ retype(urb_handle, BRW_REGISTER_TYPE_UD));
+
+ /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
+ if (offset.file != ARF)
+ brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
+
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_tes_create_input_read_header(struct brw_codegen *p,
+ struct brw_reg dst)
+{ /* Builds a URB read header in dst: everything zero except the channel
+ * enables in m0.5 and the masked patch URB handle in m0.0 and m0.1.
+ */
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+ /* Initialize the register to 0 */
+ brw_MOV(p, dst, brw_imm_ud(0));
+
+ /* Enable all the channels in m0.5 bits 15:8 */
+ brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));
+
+ /* Copy g1.3 (the patch URB handle) to m0.0 and m0.1. For safety,
+ * mask out irrelevant "Reserved" bits, as they're not marked MBZ.
+ */
+ brw_AND(p, vec2(get_element_ud(dst, 0)),
+ retype(brw_vec1_grf(1, 3), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(0x1fff));
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_tes_add_indirect_urb_offset(struct brw_codegen *p,
+ struct brw_reg dst,
+ struct brw_reg header,
+ struct brw_reg offset)
+{ /* Copies header into dst and then overwrites DWORDs 3 and 4 of dst with
+ * values read from offset through a <4;1,0> region.
+ */
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+ brw_MOV(p, dst, header);
+ /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */
+ brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
+
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_vec4_urb_read(struct brw_codegen *p,
+ vec4_instruction *inst,
+ struct brw_reg dst,
+ struct brw_reg header)
+{ /* Builds a SEND to BRW_SFID_URB that performs an interleaved
+ * URB_READ_OWORD with per-slot offsets enabled: one header register in
+ * (a UD GRF), one response register out to dst, global offset taken from
+ * inst->offset.
+ */
+ const struct gen_device_info *devinfo = p->devinfo;
+
+ assert(header.file == BRW_GENERAL_REGISTER_FILE);
+ assert(header.type == BRW_REGISTER_TYPE_UD);
+
+ brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+ brw_set_dest(p, send, dst);
+ brw_set_src0(p, send, header);
+
+ brw_set_message_descriptor(p, send, BRW_SFID_URB,
+ 1 /* mlen */, 1 /* rlen */,
+ true /* header */, false /* eot */);
+ brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
+ brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
+ brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
+
+ brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
+}
+
+static void
+generate_tcs_release_input(struct brw_codegen *p,
+                           struct brw_reg header,
+                           struct brw_reg vertex,
+                           struct brw_reg is_unpaired)
+{
+   /* Builds a two-handle URB header for the input vertex pair starting at
+    * the immediate index `vertex` and sends a URB_READ_OWORD with the
+    * "complete" bit set and no response.
+    *
+    * `is_unpaired` is a compile-time flag: when non-zero the message uses
+    * BRW_URB_SWIZZLE_NONE instead of BRW_URB_SWIZZLE_INTERLEAVE.
+    */
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   assert(vertex.file == BRW_IMMEDIATE_VALUE);
+   assert(vertex.type == BRW_REGISTER_TYPE_UD);
+
+   /* is_unpaired.ud is read below, which is only meaningful for an
+    * immediate; check it the same way `vertex` is checked.
+    */
+   assert(is_unpaired.file == BRW_IMMEDIATE_VALUE);
+
+   /* m0.0-0.1: URB handles */
+   struct brw_reg urb_handles =
+      retype(brw_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7),
+             BRW_REGISTER_TYPE_UD);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, header, brw_imm_ud(0));
+   brw_MOV(p, vec2(get_element_ud(header, 0)), urb_handles);
+   brw_pop_insn_state(p);
+
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, brw_null_reg());
+   brw_set_src0(p, send, header);
+   brw_set_message_descriptor(p, send, BRW_SFID_URB,
+                              1 /* mlen */, 0 /* rlen */,
+                              true /* header */, false /* eot */);
+   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
+   brw_inst_set_urb_complete(devinfo, send, 1);
+   brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ?
+                                    BRW_URB_SWIZZLE_NONE :
+                                    BRW_URB_SWIZZLE_INTERLEAVE);
+}
+
+static void
+generate_tcs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
+{ /* Builds a header in MRF inst->base_mrf (zeroed, WRITEMASK_X << 8 in m0.5,
+ * g0.0 in m0.0), zeroes the following MRF, and emits a URB write with the
+ * EOT, OWORD and USE_CHANNEL_MASKS flags.
+ */
+ struct brw_reg header = brw_message_reg(inst->base_mrf);
+
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV(p, header, brw_imm_ud(0));
+ brw_MOV(p, get_element_ud(header, 5), brw_imm_ud(WRITEMASK_X << 8));
+ brw_MOV(p, get_element_ud(header, 0),
+ retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
+ brw_MOV(p, brw_message_reg(inst->base_mrf + 1), brw_imm_ud(0u));
+ brw_pop_insn_state(p);
+
+ brw_urb_WRITE(p,
+ brw_null_reg(), /* dest */
+ inst->base_mrf, /* starting mrf reg nr */
+ header,
+ BRW_URB_WRITE_EOT | BRW_URB_WRITE_OWORD |
+ BRW_URB_WRITE_USE_CHANNEL_MASKS,
+ inst->mlen,
+ 0, /* response len */
+ 0, /* urb destination offset */
+ 0);
+}
+
+static void
+generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
+{ /* Align1 MOV of the scalar g1.7, read as type D, into dst. */
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_MOV(p, dst, retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_D));
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
+{ /* Align1 MOV of the scalar g0.1, read as type UD, into dst. */
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_tcs_create_barrier_header(struct brw_codegen *p,
+                                   struct brw_vue_prog_data *prog_data,
+                                   struct brw_reg dst)
+{
+   /* "Barrier ID" lives in r0.2 at bits 16:13 (Gen7.5+) or bits 15:12
+    * (Gen7: Ivy Bridge / Bay Trail), so the mask and the shift that moves
+    * it up to bits 27:24 both depend on the platform.
+    */
+   const bool ivb = p->devinfo->is_ivybridge || p->devinfo->is_baytrail;
+   const unsigned instances =
+      ((struct brw_tcs_prog_data *) prog_data)->instances;
+   const struct brw_reg m0_2 = get_element_ud(dst, 2);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   /* Start from an all-zero header. */
+   brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
+
+   /* m0.2 = Barrier ID field of r0.2 ... */
+   brw_AND(p, m0_2,
+           retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
+           brw_imm_ud(ivb ? INTEL_MASK(15, 12) : INTEL_MASK(16, 13)));
+
+   /* ... moved up to bits 27:24 ... */
+   brw_SHL(p, m0_2, m0_2, brw_imm_ud(ivb ? 12 : 11));
+
+   /* ... combined with the Barrier Count and the enable bit. */
+   brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15)));
+
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_oword_dual_block_offsets(struct brw_codegen *p,
+                                  struct brw_reg m1,
+                                  struct brw_reg index)
+{
+   /* Distance added to the second block offset: 1 on gen6+, 16 before. */
+   const int second_vertex_offset = p->devinfo->gen >= 6 ? 1 : 16;
+
+   /* Set up M1 (message payload).  Only the block offsets in M1.0 and
+    * M1.4 are used, and the rest are ignored.
+    */
+   const struct brw_reg payload = retype(m1, BRW_REGISTER_TYPE_D);
+   const struct brw_reg payload_lo = suboffset(vec1(payload), 0);
+   const struct brw_reg payload_hi = suboffset(vec1(payload), 4);
+   const struct brw_reg index_lo = suboffset(vec1(index), 0);
+   struct brw_reg index_hi = suboffset(vec1(index), 4);
+
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+   brw_MOV(p, payload_lo, index_lo);
+
+   if (index.file != BRW_IMMEDIATE_VALUE) {
+      brw_ADD(p, payload_hi, index_hi, brw_imm_d(second_vertex_offset));
+   } else {
+      /* Constant index: fold the addition into the immediate. */
+      index_hi.ud += second_vertex_offset;
+      brw_MOV(p, payload_hi, index_hi);
+   }
+
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_unpack_flags(struct brw_codegen *p,
+ struct brw_reg dst)
+{ /* Splits the low byte of the flag register returned by brw_flag_reg(0, 0)
+ * into two nibbles: dst.0 = flags & 0x0f and dst.4 = (flags & 0xf0) >> 4.
+ */
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ struct brw_reg flags = brw_flag_reg(0, 0);
+ struct brw_reg dst_0 = suboffset(vec1(dst), 0);
+ struct brw_reg dst_4 = suboffset(vec1(dst), 4);
+
+ brw_AND(p, dst_0, flags, brw_imm_ud(0x0f));
+ brw_AND(p, dst_4, flags, brw_imm_ud(0xf0));
+ brw_SHR(p, dst_4, dst_4, brw_imm_ud(4));
+
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_scratch_read(struct brw_codegen *p,
+                      vec4_instruction *inst,
+                      struct brw_reg dst,
+                      struct brw_reg index)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   /* The OWORD dual block read message type is encoded differently on
+    * gen6+, on gen5/G4X, and on original gen4.
+    */
+   const uint32_t msg_type =
+      devinfo->gen >= 6 ? GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ :
+      (devinfo->gen == 5 || devinfo->is_g4x) ?
+         G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ :
+         BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+
+   unsigned target_cache;
+   if (devinfo->gen >= 7)
+      target_cache = GEN7_SFID_DATAPORT_DATA_CACHE;
+   else if (devinfo->gen >= 6)
+      target_cache = GEN6_SFID_DATAPORT_RENDER_CACHE;
+   else
+      target_cache = BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
+
+   /* Message payload: the header (starting from g0) followed by the two
+    * block offsets in the next MRF.
+    */
+   struct brw_reg header = brw_vec8_grf(0, 0);
+   gen6_resolve_implied_move(p, &header, inst->base_mrf);
+   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
+                                     index);
+
+   /* Each of the 8 channel enables is considered for whether each
+    * dword is written.
+    */
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, dst);
+   brw_set_src0(p, send, header);
+   /* Before gen6 the base MRF number is stored through the
+    * cond-modifier field setter.
+    */
+   if (devinfo->gen < 6)
+      brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf);
+   brw_set_dp_read_message(p, send,
+                           brw_scratch_surface_idx(p),
+                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+                           msg_type, target_cache,
+                           2, /* mlen */
+                           true, /* header_present */
+                           1 /* rlen */);
+}
+
+static void
+generate_scratch_write(struct brw_codegen *p,
+                       vec4_instruction *inst,
+                       struct brw_reg dst,
+                       struct brw_reg src,
+                       struct brw_reg index)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   unsigned target_cache;
+   uint32_t msg_type;
+   if (devinfo->gen >= 7) {
+      target_cache = GEN7_SFID_DATAPORT_DATA_CACHE;
+      msg_type = GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE;
+   } else if (devinfo->gen == 6) {
+      target_cache = GEN6_SFID_DATAPORT_RENDER_CACHE;
+      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
+   } else {
+      target_cache = BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
+      msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
+   }
+
+   /* Pre-gen6, we have to specify write commits to ensure ordering
+    * between reads and writes within a thread.  Afterwards, that's
+    * guaranteed and write commits only matter for inter-thread
+    * synchronization.
+    *
+    * In the pre-gen6 case the visitor set up our destination register to
+    * be g0.  This means that when the next read comes along, we will end
+    * up reading from g0 and causing a block on the write commit.  For
+    * write-after-read, we are relying on the value of the previous read
+    * being used (and thus blocking on completion) before our write is
+    * executed.  This means we have to be careful in instruction
+    * scheduling to not violate this assumption.
+    */
+   const bool write_commit = devinfo->gen < 6;
+
+   /* A predicated instruction predicates only the send; the payload
+    * setup below always runs unpredicated.
+    */
+   brw_set_default_predicate_control(p, false);
+
+   struct brw_reg header = brw_vec8_grf(0, 0);
+   gen6_resolve_implied_move(p, &header, inst->base_mrf);
+
+   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
+                                     index);
+
+   brw_MOV(p,
+           retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D),
+           retype(src, BRW_REGISTER_TYPE_D));
+
+   brw_set_default_predicate_control(p, inst->predicate);
+
+   /* Each of the 8 channel enables is considered for whether each
+    * dword is written.
+    */
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, dst);
+   brw_set_src0(p, send, header);
+   if (devinfo->gen < 6)
+      brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf);
+   brw_set_dp_write_message(p, send,
+                            brw_scratch_surface_idx(p),
+                            BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+                            msg_type,
+                            target_cache,
+                            3, /* mlen */
+                            true, /* header present */
+                            false, /* not a render target write */
+                            write_commit, /* rlen */
+                            false, /* eot */
+                            write_commit);
+}
+
+static void
+generate_pull_constant_load(struct brw_codegen *p,
+                            struct brw_vue_prog_data *prog_data,
+                            vec4_instruction *inst,
+                            struct brw_reg dst,
+                            struct brw_reg index,
+                            struct brw_reg offset)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   /* The surface index must be a compile-time constant. */
+   assert(index.file == BRW_IMMEDIATE_VALUE &&
+          index.type == BRW_REGISTER_TYPE_UD);
+   const uint32_t surf_index = index.ud;
+
+   unsigned target_cache;
+   uint32_t msg_type;
+   if (devinfo->gen >= 6) {
+      target_cache = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
+      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+   } else {
+      target_cache = BRW_DATAPORT_READ_TARGET_DATA_CACHE;
+      msg_type = (devinfo->gen == 5 || devinfo->is_g4x) ?
+         G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ :
+         BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+   }
+
+   struct brw_reg header = brw_vec8_grf(0, 0);
+   gen6_resolve_implied_move(p, &header, inst->base_mrf);
+
+   /* The second message register carries the offset.  On gen6+ it is
+    * written as offset >> 4; earlier gens take the offset unchanged.
+    */
+   const struct brw_reg offset_mrf =
+      retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_D);
+   if (devinfo->gen < 6) {
+      brw_MOV(p, offset_mrf, offset);
+   } else if (offset.file == BRW_IMMEDIATE_VALUE) {
+      brw_MOV(p, offset_mrf, brw_imm_d(offset.ud >> 4));
+   } else {
+      brw_SHR(p, offset_mrf, offset, brw_imm_d(4));
+   }
+
+   /* Each of the 8 channel enables is considered for whether each
+    * dword is written.
+    */
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, dst);
+   brw_set_src0(p, send, header);
+   if (devinfo->gen < 6)
+      brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf);
+   brw_set_dp_read_message(p, send,
+                           surf_index,
+                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+                           msg_type,
+                           target_cache,
+                           2, /* mlen */
+                           true, /* header_present */
+                           1 /* rlen */);
+}
+
+static void
+generate_get_buffer_size(struct brw_codegen *p,
+ struct brw_vue_prog_data *prog_data,
+ vec4_instruction *inst,
+ struct brw_reg dst,
+ struct brw_reg src,
+ struct brw_reg surf_index)
+{ /* Emits a SIMD4x2 sampler RESINFO message against the immediate surface
+ * index surf_index (gen7+ only), with a one-register response in dst, and
+ * records the surface as used in prog_data.
+ */
+ assert(p->devinfo->gen >= 7);
+ assert(surf_index.type == BRW_REGISTER_TYPE_UD &&
+ surf_index.file == BRW_IMMEDIATE_VALUE);
+
+ brw_SAMPLE(p,
+ dst,
+ inst->base_mrf,
+ src,
+ surf_index.ud,
+ 0,
+ GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
+ 1, /* response length */
+ inst->mlen,
+ inst->header_size > 0,
+ BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+ BRW_SAMPLER_RETURN_FORMAT_SINT32);
+
+ brw_mark_surface_used(&prog_data->base, surf_index.ud);
+}
+
+static void
+generate_pull_constant_load_gen7(struct brw_codegen *p,
+ struct brw_vue_prog_data *prog_data,
+ vec4_instruction *inst,
+ struct brw_reg dst,
+ struct brw_reg surf_index,
+ struct brw_reg offset)
+{ /* Loads one register into dst with a SIMD4x2 sampler LD message whose
+ * payload is `offset`. An immediate surf_index is encoded directly in the
+ * descriptor and marked as used; otherwise its low 8 bits are placed in
+ * a0.0 and an indirect send is emitted (no surface is marked as used on
+ * that path).
+ */
+ assert(surf_index.type == BRW_REGISTER_TYPE_UD);
+
+ if (surf_index.file == BRW_IMMEDIATE_VALUE) {
+
+ brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
+ brw_set_dest(p, insn, dst);
+ brw_set_src0(p, insn, offset);
+ brw_set_sampler_message(p, insn,
+ surf_index.ud,
+ 0, /* LD message ignores sampler unit */
+ GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+ 1, /* rlen */
+ inst->mlen,
+ inst->header_size != 0,
+ BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+ 0);
+
+ brw_mark_surface_used(&prog_data->base, surf_index.ud);
+
+ } else {
+
+ struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ /* a0.0 = surf_index & 0xff */
+ brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
+ brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
+ brw_set_dest(p, insn_and, addr);
+ brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD)));
+ brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
+
+ brw_pop_insn_state(p);
+
+ /* dst = send(offset, a0.0 | <descriptor>) */
+ brw_inst *insn = brw_send_indirect_message(
+ p, BRW_SFID_SAMPLER, dst, offset, addr);
+ brw_set_sampler_message(p, insn,
+ 0 /* surface */,
+ 0 /* sampler */,
+ GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+ 1 /* rlen */,
+ inst->mlen,
+ inst->header_size != 0,
+ BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+ 0);
+ }
+}
+
+static void
+generate_set_simd4x2_header_gen9(struct brw_codegen *p,
+ vec4_instruction *inst,
+ struct brw_reg dst)
+{ /* Copies all eight DWORDs of g0 into dst and then overwrites DWORD 2 with
+ * GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2. inst is not referenced.
+ */
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+ brw_set_default_exec_size(p, BRW_EXECUTE_8);
+ brw_MOV(p, vec8(dst), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_MOV(p, get_element_ud(dst, 2),
+ brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
+
+ brw_pop_insn_state(p);
+}
+
+static void
+generate_mov_indirect(struct brw_codegen *p,
+ vec4_instruction *inst,
+ struct brw_reg dst, struct brw_reg reg,
+ struct brw_reg indirect, struct brw_reg length)
+{ /* Moves into dst the data found `indirect` bytes past `reg` (gen6+ only).
+ * An immediate `indirect` is folded into reg's nr/subnr/swizzle and
+ * becomes a plain MOV; otherwise per-channel byte addresses are built in
+ * a0 and a VxH indirect MOV is emitted. inst and length are not
+ * referenced.
+ */
+ assert(indirect.type == BRW_REGISTER_TYPE_UD);
+ assert(p->devinfo->gen >= 6);
+
+ unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2);
+
+ /* This instruction acts in align1 mode */
+ assert(dst.writemask == WRITEMASK_XYZW);
+
+ if (indirect.file == BRW_IMMEDIATE_VALUE) {
+ imm_byte_offset += indirect.ud;
+
+ reg.nr = imm_byte_offset / REG_SIZE;
+ reg.subnr = (imm_byte_offset / (REG_SIZE / 2)) % 2;
+ unsigned shift = (imm_byte_offset / 4) % 4; /* DWORD position within the 4-DWORD half register */
+ reg.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);
+
+ brw_MOV(p, dst, reg);
+ } else {
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+ struct brw_reg addr = vec8(brw_address_reg(0));
+
+ /* We need to move the indirect value into the address register. In
+ * order to make things make some sense, we want to respect at least the
+ * X component of the swizzle. In order to do that, we need to convert
+ * the subnr (probably 0) to an align1 subnr and add in the swizzle.
+ */
+ assert(brw_is_single_value_swizzle(indirect.swizzle));
+ indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0));
+
+ /* We then use a region of <8,4,0>:uw to pick off the first 2 bytes of
+ * the indirect and splat it out to all four channels of the given half
+ * of a0.
+ */
+ indirect.subnr *= 2;
+ indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0);
+ brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset));
+
+ /* Now we need to incorporate the swizzle from the source register */
+ if (reg.swizzle != BRW_SWIZZLE_XXXX) {
+ uint32_t uv_swiz = BRW_GET_SWZ(reg.swizzle, 0) << 2 |
+ BRW_GET_SWZ(reg.swizzle, 1) << 6 |
+ BRW_GET_SWZ(reg.swizzle, 2) << 10 |
+ BRW_GET_SWZ(reg.swizzle, 3) << 14; /* each 4-bit field holds a swizzle index times 4 */
+ uv_swiz |= uv_swiz << 16; /* repeat the four fields in the upper 16 bits */
+
+ brw_ADD(p, addr, addr, brw_imm_uv(uv_swiz));
+ }
+
+ brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), reg.type));
+
+ brw_pop_insn_state(p);
+ }
+}
+/**
+ * Translate every vec4_instruction in \p cfg into native instructions
+ * appended to \p p.
+ *
+ * For each IR instruction the predicate, flag register, saturate, mask
+ * control, accumulator-write and execution-size defaults are programmed
+ * from the instruction, and the opcode is then dispatched to the matching
+ * brw_* emitter or generate_* helper.  Afterwards the conditional modifier
+ * and the no_dd_clear/no_dd_check hints are patched into the instruction
+ * that was just emitted.
+ *
+ * Once everything is emitted, brw_set_uip_jip() is run, the annotation list
+ * is finalized, and the program is validated (always when assertions are
+ * compiled in, otherwise only when the INTEL_DEBUG flag for this stage is
+ * set) and compacted.  Instruction, loop, cycle, spill/fill and size
+ * statistics are passed to compiler->shader_debug_log(); with the debug
+ * flag set they are also printed to stderr together with a disassembly.
+ */
+static void
+generate_code(struct brw_codegen *p,
+ const struct brw_compiler *compiler,
+ void *log_data,
+ const nir_shader *nir,
+ struct brw_vue_prog_data *prog_data,
+ const struct cfg_t *cfg)
+{
+ const struct gen_device_info *devinfo = p->devinfo;
+ const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->stage);
+ bool debug_flag = INTEL_DEBUG &
+ intel_debug_flag_for_shader_stage(nir->stage);
+ struct annotation_info annotation;
+ memset(&annotation, 0, sizeof(annotation));
+ int spill_count = 0, fill_count = 0;
+ int loop_count = 0;
+ /* Visit each vec4_instruction of the CFG in turn. */
+ foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
+ struct brw_reg src[3], dst;
+
+ if (unlikely(debug_flag))
+ annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
+ /* Convert the IR operands into struct brw_reg form. */
+ for (unsigned int i = 0; i < 3; i++) {
+ src[i] = inst->src[i].as_brw_reg();
+ }
+ dst = inst->dst.as_brw_reg();
+ /* Program the codegen defaults from this instruction's modifiers. */
+ brw_set_default_predicate_control(p, inst->predicate);
+ brw_set_default_predicate_inverse(p, inst->predicate_inverse);
+ brw_set_default_flag_reg(p, 0, inst->flag_subreg);
+ brw_set_default_saturate(p, inst->saturate);
+ brw_set_default_mask_control(p, inst->force_writemask_all);
+ brw_set_default_acc_write_control(p, inst->writes_accumulator);
+ brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
+
+ assert(inst->group % inst->exec_size == 0);
+ assert(inst->group % 8 == 0 ||
+ inst->dst.type == BRW_REGISTER_TYPE_DF ||
+ inst->src[0].type == BRW_REGISTER_TYPE_DF ||
+ inst->src[1].type == BRW_REGISTER_TYPE_DF ||
+ inst->src[2].type == BRW_REGISTER_TYPE_DF);
+ if (!inst->force_writemask_all)
+ brw_set_default_group(p, inst->group);
+
+ assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
+ assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
+ /* Remembered so the instruction emitted below can be found again. */
+ unsigned pre_emit_nr_insn = p->nr_insn;
+ /* Dispatch on the IR opcode. */
+ switch (inst->opcode) {
+ case VEC4_OPCODE_UNPACK_UNIFORM:
+ case BRW_OPCODE_MOV:
+ brw_MOV(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_ADD:
+ brw_ADD(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_MUL:
+ brw_MUL(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_MACH:
+ brw_MACH(p, dst, src[0], src[1]);
+ break;
+
+ case BRW_OPCODE_MAD:
+ assert(devinfo->gen >= 6);
+ brw_MAD(p, dst, src[0], src[1], src[2]);
+ break;
+
+ case BRW_OPCODE_FRC:
+ brw_FRC(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_RNDD:
+ brw_RNDD(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_RNDE:
+ brw_RNDE(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_RNDZ:
+ brw_RNDZ(p, dst, src[0]);
+ break;
+
+ case BRW_OPCODE_AND:
+ brw_AND(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_OR:
+ brw_OR(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_XOR:
+ brw_XOR(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_NOT:
+ brw_NOT(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_ASR:
+ brw_ASR(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_SHR:
+ brw_SHR(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_SHL:
+ brw_SHL(p, dst, src[0], src[1]);
+ break;
+
+ case BRW_OPCODE_CMP:
+ brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
+ break;
+ case BRW_OPCODE_SEL:
+ brw_SEL(p, dst, src[0], src[1]);
+ break;
+
+ case BRW_OPCODE_DPH:
+ brw_DPH(p, dst, src[0], src[1]);
+ break;
+
+ case BRW_OPCODE_DP4:
+ brw_DP4(p, dst, src[0], src[1]);
+ break;
+
+ case BRW_OPCODE_DP3:
+ brw_DP3(p, dst, src[0], src[1]);
+ break;
+
+ case BRW_OPCODE_DP2:
+ brw_DP2(p, dst, src[0], src[1]);
+ break;
+
+ case BRW_OPCODE_F32TO16:
+ assert(devinfo->gen >= 7);
+ brw_F32TO16(p, dst, src[0]);
+ break;
+
+ case BRW_OPCODE_F16TO32:
+ assert(devinfo->gen >= 7);
+ brw_F16TO32(p, dst, src[0]);
+ break;
+
+ case BRW_OPCODE_LRP:
+ assert(devinfo->gen >= 6);
+ brw_LRP(p, dst, src[0], src[1], src[2]);
+ break;
+
+ case BRW_OPCODE_BFREV:
+ assert(devinfo->gen >= 7);
+ /* BFREV only supports UD type for src and dst. */
+ brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
+ retype(src[0], BRW_REGISTER_TYPE_UD));
+ break;
+ case BRW_OPCODE_FBH:
+ assert(devinfo->gen >= 7);
+ /* FBH only supports UD type for dst. */
+ brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+ break;
+ case BRW_OPCODE_FBL:
+ assert(devinfo->gen >= 7);
+ /* FBL only supports UD type for dst. */
+ brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+ break;
+ case BRW_OPCODE_LZD:
+ brw_LZD(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_CBIT:
+ assert(devinfo->gen >= 7);
+ /* CBIT only supports UD type for dst. */
+ brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+ break;
+ case BRW_OPCODE_ADDC:
+ assert(devinfo->gen >= 7);
+ brw_ADDC(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_SUBB:
+ assert(devinfo->gen >= 7);
+ brw_SUBB(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_MAC:
+ brw_MAC(p, dst, src[0], src[1]);
+ break;
+
+ case BRW_OPCODE_BFE:
+ assert(devinfo->gen >= 7);
+ brw_BFE(p, dst, src[0], src[1], src[2]);
+ break;
+
+ case BRW_OPCODE_BFI1:
+ assert(devinfo->gen >= 7);
+ brw_BFI1(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_BFI2:
+ assert(devinfo->gen >= 7);
+ brw_BFI2(p, dst, src[0], src[1], src[2]);
+ break;
+
+ case BRW_OPCODE_IF:
+ if (!inst->src[0].is_null()) {
+ /* The instruction has an embedded compare (only allowed on gen6) */
+ assert(devinfo->gen == 6);
+ gen6_IF(p, inst->conditional_mod, src[0], src[1]);
+ } else {
+ brw_inst *if_inst = brw_IF(p, BRW_EXECUTE_8);
+ brw_inst_set_pred_control(p->devinfo, if_inst, inst->predicate);
+ }
+ break;
+
+ case BRW_OPCODE_ELSE:
+ brw_ELSE(p);
+ break;
+ case BRW_OPCODE_ENDIF:
+ brw_ENDIF(p);
+ break;
+
+ case BRW_OPCODE_DO:
+ brw_DO(p, BRW_EXECUTE_8);
+ break;
+
+ case BRW_OPCODE_BREAK:
+ brw_BREAK(p);
+ brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+ break;
+ case BRW_OPCODE_CONTINUE:
+ brw_CONT(p);
+ brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+ break;
+
+ case BRW_OPCODE_WHILE:
+ brw_WHILE(p);
+ loop_count++;
+ break;
+
+ case SHADER_OPCODE_RCP:
+ case SHADER_OPCODE_RSQ:
+ case SHADER_OPCODE_SQRT:
+ case SHADER_OPCODE_EXP2:
+ case SHADER_OPCODE_LOG2:
+ case SHADER_OPCODE_SIN:
+ case SHADER_OPCODE_COS:
+ assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
+ if (devinfo->gen >= 7) {
+ gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
+ brw_null_reg());
+ } else if (devinfo->gen == 6) {
+ generate_math_gen6(p, inst, dst, src[0], brw_null_reg());
+ } else {
+ generate_math1_gen4(p, inst, dst, src[0]);
+ }
+ break;
+
+ case SHADER_OPCODE_POW:
+ case SHADER_OPCODE_INT_QUOTIENT:
+ case SHADER_OPCODE_INT_REMAINDER:
+ assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
+ if (devinfo->gen >= 7) {
+ gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
+ } else if (devinfo->gen == 6) {
+ generate_math_gen6(p, inst, dst, src[0], src[1]);
+ } else {
+ generate_math2_gen4(p, inst, dst, src[0], src[1]);
+ }
+ break;
+
+ case SHADER_OPCODE_TEX:
+ case SHADER_OPCODE_TXD:
+ case SHADER_OPCODE_TXF:
+ case SHADER_OPCODE_TXF_CMS:
+ case SHADER_OPCODE_TXF_CMS_W:
+ case SHADER_OPCODE_TXF_MCS:
+ case SHADER_OPCODE_TXL:
+ case SHADER_OPCODE_TXS:
+ case SHADER_OPCODE_TG4:
+ case SHADER_OPCODE_TG4_OFFSET:
+ case SHADER_OPCODE_SAMPLEINFO:
+ generate_tex(p, prog_data, nir->stage,
+ inst, dst, src[0], src[1], src[2]);
+ break;
+
+ case VS_OPCODE_URB_WRITE:
+ generate_vs_urb_write(p, inst);
+ break;
+
+ case SHADER_OPCODE_GEN4_SCRATCH_READ:
+ generate_scratch_read(p, inst, dst, src[0]);
+ fill_count++;
+ break;
+
+ case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+ generate_scratch_write(p, inst, dst, src[0], src[1]);
+ spill_count++;
+ break;
+
+ case VS_OPCODE_PULL_CONSTANT_LOAD:
+ generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]);
+ break;
+
+ case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
+ generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]);
+ break;
+
+ case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
+ generate_set_simd4x2_header_gen9(p, inst, dst);
+ break;
+
+
+ case VS_OPCODE_GET_BUFFER_SIZE:
+ generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
+ break;
+
+ case GS_OPCODE_URB_WRITE:
+ generate_gs_urb_write(p, inst);
+ break;
+
+ case GS_OPCODE_URB_WRITE_ALLOCATE:
+ generate_gs_urb_write_allocate(p, inst);
+ break;
+
+ case GS_OPCODE_SVB_WRITE:
+ generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]);
+ break;
+
+ case GS_OPCODE_SVB_SET_DST_INDEX:
+ generate_gs_svb_set_destination_index(p, inst, dst, src[0]);
+ break;
+
+ case GS_OPCODE_THREAD_END:
+ generate_gs_thread_end(p, inst);
+ break;
+
+ case GS_OPCODE_SET_WRITE_OFFSET:
+ generate_gs_set_write_offset(p, dst, src[0], src[1]);
+ break;
+
+ case GS_OPCODE_SET_VERTEX_COUNT:
+ generate_gs_set_vertex_count(p, dst, src[0]);
+ break;
+
+ case GS_OPCODE_FF_SYNC:
+ generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
+ break;
+
+ case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
+ generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]);
+ break;
+
+ case GS_OPCODE_SET_PRIMITIVE_ID:
+ generate_gs_set_primitive_id(p, dst);
+ break;
+
+ case GS_OPCODE_SET_DWORD_2:
+ generate_gs_set_dword_2(p, dst, src[0]);
+ break;
+
+ case GS_OPCODE_PREPARE_CHANNEL_MASKS:
+ generate_gs_prepare_channel_masks(p, dst);
+ break;
+
+ case GS_OPCODE_SET_CHANNEL_MASKS:
+ generate_gs_set_channel_masks(p, dst, src[0]);
+ break;
+
+ case GS_OPCODE_GET_INSTANCE_ID:
+ generate_gs_get_instance_id(p, dst);
+ break;
+
+ case SHADER_OPCODE_SHADER_TIME_ADD:
+ brw_shader_time_add(p, src[0],
+ prog_data->base.binding_table.shader_time_start);
+ brw_mark_surface_used(&prog_data->base,
+ prog_data->base.binding_table.shader_time_start);
+ break;
+
+ case SHADER_OPCODE_UNTYPED_ATOMIC:
+ assert(src[2].file == BRW_IMMEDIATE_VALUE);
+ brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
+ !inst->dst.is_null());
+ break;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+ assert(src[2].file == BRW_IMMEDIATE_VALUE);
+ brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
+ src[2].ud);
+ break;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+ assert(src[2].file == BRW_IMMEDIATE_VALUE);
+ brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
+ src[2].ud);
+ break;
+
+ case SHADER_OPCODE_TYPED_ATOMIC:
+ assert(src[2].file == BRW_IMMEDIATE_VALUE);
+ brw_typed_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
+ !inst->dst.is_null());
+ break;
+
+ case SHADER_OPCODE_TYPED_SURFACE_READ:
+ assert(src[2].file == BRW_IMMEDIATE_VALUE);
+ brw_typed_surface_read(p, dst, src[0], src[1], inst->mlen,
+ src[2].ud);
+ break;
+
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+ assert(src[2].file == BRW_IMMEDIATE_VALUE);
+ brw_typed_surface_write(p, src[0], src[1], inst->mlen,
+ src[2].ud);
+ break;
+
+ case SHADER_OPCODE_MEMORY_FENCE:
+ brw_memory_fence(p, dst);
+ break;
+
+ case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
+ const struct brw_reg mask =
+ brw_stage_has_packed_dispatch(devinfo, nir->stage,
+ &prog_data->base) ? brw_imm_ud(~0u) :
+ brw_dmask_reg();
+ brw_find_live_channel(p, dst, mask);
+ break;
+ }
+
+ case SHADER_OPCODE_BROADCAST:
+ assert(inst->force_writemask_all);
+ brw_broadcast(p, dst, src[0], src[1]);
+ break;
+
+ case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
+ generate_unpack_flags(p, dst);
+ break;
+
+ case VEC4_OPCODE_MOV_BYTES: {
+ /* Moves the low byte from each channel, using an Align1 access mode
+ * and a <4,1,0> source region.
+ */
+ assert(src[0].type == BRW_REGISTER_TYPE_UB ||
+ src[0].type == BRW_REGISTER_TYPE_B);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ src[0].vstride = BRW_VERTICAL_STRIDE_4;
+ src[0].width = BRW_WIDTH_1;
+ src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
+ brw_MOV(p, dst, src[0]);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ break;
+ }
+
+ case VEC4_OPCODE_FROM_DOUBLE: {
+ assert(type_sz(src[0].type) == 8);
+ assert(type_sz(dst.type) == 4);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ dst.hstride = BRW_HORIZONTAL_STRIDE_2;
+ dst.width = BRW_WIDTH_4;
+ src[0].vstride = BRW_VERTICAL_STRIDE_4;
+ src[0].width = BRW_WIDTH_4;
+ brw_MOV(p, dst, src[0]);
+
+ struct brw_reg dst_as_src = dst;
+ dst.hstride = BRW_HORIZONTAL_STRIDE_1;
+ dst.width = BRW_WIDTH_8;
+ brw_MOV(p, dst, dst_as_src);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ break;
+ }
+
+ case VEC4_OPCODE_TO_DOUBLE: {
+ assert(type_sz(src[0].type) == 4);
+ assert(type_sz(dst.type) == 8);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ struct brw_reg tmp = retype(dst, src[0].type);
+ tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
+ tmp.width = BRW_WIDTH_4;
+ src[0].vstride = BRW_VERTICAL_STRIDE_4;
+ src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
+ src[0].width = BRW_WIDTH_4;
+ brw_MOV(p, tmp, src[0]);
+
+ tmp.vstride = BRW_VERTICAL_STRIDE_8;
+ tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
+ tmp.width = BRW_WIDTH_4;
+ brw_MOV(p, dst, tmp);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ break;
+ }
+
+ case VEC4_OPCODE_PICK_LOW_32BIT:
+ case VEC4_OPCODE_PICK_HIGH_32BIT: {
+ /* Stores the low/high 32-bit of each 64-bit element in src[0] into
+ * dst using ALIGN1 mode and a <8,4,2>:UD region on the source.
+ */
+ assert(type_sz(src[0].type) == 8);
+ assert(type_sz(dst.type) == 4);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ dst = retype(dst, BRW_REGISTER_TYPE_UD);
+ dst.hstride = BRW_HORIZONTAL_STRIDE_1;
+
+ src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
+ if (inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT)
+ src[0] = suboffset(src[0], 1);
+ src[0].vstride = BRW_VERTICAL_STRIDE_8;
+ src[0].width = BRW_WIDTH_4;
+ src[0].hstride = BRW_HORIZONTAL_STRIDE_2;
+ brw_MOV(p, dst, src[0]);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ break;
+ }
+
+ case VEC4_OPCODE_SET_LOW_32BIT:
+ case VEC4_OPCODE_SET_HIGH_32BIT: {
+ /* Reads consecutive 32-bit elements from src[0] and writes
+ * them to the low/high 32-bit of each 64-bit element in dst.
+ */
+ assert(type_sz(src[0].type) == 4);
+ assert(type_sz(dst.type) == 8);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ dst = retype(dst, BRW_REGISTER_TYPE_UD);
+ if (inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT)
+ dst = suboffset(dst, 1);
+ dst.hstride = BRW_HORIZONTAL_STRIDE_2;
+
+ src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
+ src[0].vstride = BRW_VERTICAL_STRIDE_4;
+ src[0].width = BRW_WIDTH_4;
+ src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
+ brw_MOV(p, dst, src[0]);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ break;
+ }
+
+ case VEC4_OPCODE_PACK_BYTES: {
+ /* Is effectively:
+ *
+ * mov(8) dst<16,4,1>:UB src<4,1,0>:UB
+ *
+ * but destinations' only regioning is horizontal stride, so instead we
+ * have to use two instructions:
+ *
+ * mov(4) dst<1>:UB src<4,1,0>:UB
+ * mov(4) dst.16<1>:UB src.16<4,1,0>:UB
+ *
+ * where they pack the four bytes from the low and high four DW.
+ */
+ assert(_mesa_is_pow_two(dst.writemask) &&
+ dst.writemask != 0);
+ unsigned offset = __builtin_ctz(dst.writemask);
+
+ dst.type = BRW_REGISTER_TYPE_UB;
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ src[0].type = BRW_REGISTER_TYPE_UB;
+ src[0].vstride = BRW_VERTICAL_STRIDE_4;
+ src[0].width = BRW_WIDTH_1;
+ src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
+ dst.subnr = offset * 4;
+ struct brw_inst *insn = brw_MOV(p, dst, src[0]);
+ brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
+ brw_inst_set_no_dd_clear(p->devinfo, insn, true);
+ brw_inst_set_no_dd_check(p->devinfo, insn, inst->no_dd_check);
+
+ src[0].subnr = 16;
+ dst.subnr = 16 + offset * 4;
+ insn = brw_MOV(p, dst, src[0]);
+ brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
+ brw_inst_set_no_dd_clear(p->devinfo, insn, inst->no_dd_clear);
+ brw_inst_set_no_dd_check(p->devinfo, insn, true);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ break;
+ }
+
+ case TCS_OPCODE_URB_WRITE:
+ generate_tcs_urb_write(p, inst, src[0]);
+ break;
+
+ case VEC4_OPCODE_URB_READ:
+ generate_vec4_urb_read(p, inst, dst, src[0]);
+ break;
+
+ case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+ generate_tcs_input_urb_offsets(p, dst, src[0], src[1]);
+ break;
+
+ case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+ generate_tcs_output_urb_offsets(p, dst, src[0], src[1]);
+ break;
+
+ case TCS_OPCODE_GET_INSTANCE_ID:
+ generate_tcs_get_instance_id(p, dst);
+ break;
+
+ case TCS_OPCODE_GET_PRIMITIVE_ID:
+ generate_tcs_get_primitive_id(p, dst);
+ break;
+
+ case TCS_OPCODE_CREATE_BARRIER_HEADER:
+ generate_tcs_create_barrier_header(p, prog_data, dst);
+ break;
+
+ case TES_OPCODE_CREATE_INPUT_READ_HEADER:
+ generate_tes_create_input_read_header(p, dst);
+ break;
+
+ case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
+ generate_tes_add_indirect_urb_offset(p, dst, src[0], src[1]);
+ break;
+
+ case TES_OPCODE_GET_PRIMITIVE_ID:
+ generate_tes_get_primitive_id(p, dst);
+ break;
+
+ case TCS_OPCODE_SRC0_010_IS_ZERO:
+ /* If src_reg had stride like fs_reg, we wouldn't need this. */
+ brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0));
+ break;
+
+ case TCS_OPCODE_RELEASE_INPUT:
+ generate_tcs_release_input(p, dst, src[0], src[1]);
+ break;
+
+ case TCS_OPCODE_THREAD_END:
+ generate_tcs_thread_end(p, inst);
+ break;
+
+ case SHADER_OPCODE_BARRIER:
+ brw_barrier(p, src[0]);
+ brw_WAIT(p);
+ break;
+
+ case SHADER_OPCODE_MOV_INDIRECT:
+ generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]);
+ break;
+
+ case BRW_OPCODE_DIM:
+ assert(devinfo->is_haswell);
+ assert(src[0].type == BRW_REGISTER_TYPE_DF);
+ assert(dst.type == BRW_REGISTER_TYPE_DF);
+ brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
+ break;
+
+ default:
+ unreachable("Unsupported opcode");
+ }
+ /* Patch the conditional modifier and dependency hints into the
+ * instruction emitted above; this only works when the IR instruction
+ * expanded to exactly one native instruction (asserted below).
+ */
+ if (inst->opcode == VEC4_OPCODE_PACK_BYTES) {
+ /* Handled dependency hints in the generator. */
+
+ assert(!inst->conditional_mod);
+ } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
+ assert(p->nr_insn == pre_emit_nr_insn + 1 ||
+ !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
+ "emitting more than 1 instruction");
+
+ brw_inst *last = &p->store[pre_emit_nr_insn];
+
+ if (inst->conditional_mod)
+ brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
+ brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
+ brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
+ }
+ }
+ /* Whole-program fixups once every instruction has been emitted. */
+ brw_set_uip_jip(p, 0);
+ annotation_finalize(&annotation, p->next_insn_offset);
+ /* With assertions enabled the result is kept for the assert further down;
+ * otherwise validation only runs when debugging this stage.
+ */
+#ifndef NDEBUG
+ bool validated = brw_validate_instructions(p, 0, &annotation);
+#else
+ if (unlikely(debug_flag))
+ brw_validate_instructions(p, 0, &annotation);
+#endif
+ /* Record the program size on both sides of compaction for the stats. */
+ int before_size = p->next_insn_offset;
+ brw_compact_instructions(p, 0, annotation.ann_count, annotation.ann);
+ int after_size = p->next_insn_offset;
+
+ if (unlikely(debug_flag)) {
+ fprintf(stderr, "Native code for %s %s shader %s:\n",
+ nir->info->label ? nir->info->label : "unnamed",
+ _mesa_shader_stage_to_string(nir->stage), nir->info->name);
+
+ fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
+ "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
+ stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
+ spill_count, fill_count, before_size, after_size,
+ 100.0f * (before_size - after_size) / before_size);
+
+ dump_assembly(p->store, annotation.ann_count, annotation.ann,
+ p->devinfo);
+ ralloc_free(annotation.mem_ctx);
+ }
+ assert(validated);
+ /* Send the same statistics to the compiler's debug-log callback. */
+ compiler->shader_debug_log(log_data,
+ "%s vec4 shader: %d inst, %d loops, %u cycles, "
+ "%d:%d spills:fills, compacted %d to %d bytes.",
+ stage_abbrev, before_size / 16,
+ loop_count, cfg->cycle_count, spill_count,
+ fill_count, before_size, after_size);
+
+}
+
+/**
+ * C-linkage entry point of the vec4 generator.
+ *
+ * Allocates a zeroed brw_codegen out of \p mem_ctx, initializes it for
+ * compiler->devinfo with Align16 as the default access mode, runs
+ * generate_code() over \p cfg and returns whatever brw_get_program() hands
+ * back, which also fills in \p out_assembly_size.
+ */
+extern "C" const unsigned *
+brw_vec4_generate_assembly(const struct brw_compiler *compiler,
+                           void *log_data,
+                           void *mem_ctx,
+                           const nir_shader *nir,
+                           struct brw_vue_prog_data *prog_data,
+                           const struct cfg_t *cfg,
+                           unsigned *out_assembly_size)
+{
+   struct brw_codegen *codegen = rzalloc(mem_ctx, struct brw_codegen);
+
+   /* Set up the code generator before any instruction is emitted. */
+   brw_init_codegen(compiler->devinfo, codegen, mem_ctx);
+   brw_set_default_access_mode(codegen, BRW_ALIGN_16);
+
+   generate_code(codegen, compiler, log_data, nir, prog_data, cfg);
+
+   const unsigned *assembly = brw_get_program(codegen, out_assembly_size);
+   return assembly;
+}
diff --git a/src/intel/compiler/brw_vec4_gs_nir.cpp b/src/intel/compiler/brw_vec4_gs_nir.cpp
new file mode 100644
index 00000000000..ed8c03b0594
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_gs_nir.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4_gs_visitor.h"
+
+namespace brw {
+
+/**
+ * Input-setup hook; the geometry shader visitor leaves it empty.
+ */
+void
+vec4_gs_visitor::nir_setup_inputs()
+{
+}
+
+/**
+ * Make sure storage exists for the system value read by \p instr.
+ *
+ * load_primitive_id needs nothing here, load_invocation_id gets a register
+ * the first time it is seen, and every other intrinsic is forwarded to
+ * vec4_visitor.
+ */
+void
+vec4_gs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{
+   if (instr->intrinsic == nir_intrinsic_load_primitive_id) {
+      /* We'll just read g1 directly; don't create a temporary. */
+      return;
+   }
+
+   if (instr->intrinsic == nir_intrinsic_load_invocation_id) {
+      dst_reg *slot = &this->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
+
+      /* Only create the register while the slot is still unassigned. */
+      if (slot->file == BAD_FILE)
+         *slot = *this->make_reg_for_system_value(SYSTEM_VALUE_INVOCATION_ID);
+      return;
+   }
+
+   vec4_visitor::nir_setup_system_value_intrinsic(instr);
+}
+
+/**
+ * Emit vec4 IR for the intrinsics this visitor handles itself: per-vertex
+ * input loads, the vertex-count carrying emit/end/set intrinsics, and the
+ * primitive and invocation ID loads.  Anything else is passed on to
+ * vec4_visitor::nir_emit_intrinsic().
+ */
+void
+vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{
+   dst_reg result;
+   src_reg value;
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_per_vertex_input: {
+      /* The EmitNoIndirectInput flag guarantees our vertex index will
+       * be constant.  We should handle indirects someday.
+       */
+      nir_const_value *vertex_idx = nir_src_as_const_value(instr->src[0]);
+      nir_const_value *slot_offset = nir_src_as_const_value(instr->src[1]);
+
+      /* ATTR register number, shared by the 64-bit and 32-bit paths. */
+      const unsigned attr_nr = BRW_VARYING_SLOT_COUNT * vertex_idx->u32[0] +
+                               instr->const_index[0] + slot_offset->u32[0];
+
+      if (nir_dest_bit_size(instr->dest) == 64) {
+         value = src_reg(ATTR, attr_nr, glsl_type::dvec4_type);
+
+         dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
+         shuffle_64bit_data(shuffled, value, false);
+
+         value = src_reg(shuffled);
+         value.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr) / 2);
+
+         result = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF);
+      } else {
+         /* Make up a type...we have no way of knowing... */
+         value = src_reg(ATTR, attr_nr, glsl_type::ivec(instr->num_components));
+         value.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr));
+
+         /* gl_PointSize is passed in the .w component of the VUE header */
+         if (instr->const_index[0] == VARYING_SLOT_PSIZ)
+            value.swizzle = BRW_SWIZZLE_WWWW;
+
+         result = get_nir_dest(instr->dest, value.type);
+      }
+
+      /* Write to dst reg taking into account original writemask */
+      result.writemask = brw_writemask_for_size(instr->num_components);
+      emit(MOV(result, value));
+      break;
+   }
+
+   case nir_intrinsic_load_input:
+      unreachable("nir_lower_io should have produced per_vertex intrinsics");
+
+   case nir_intrinsic_emit_vertex_with_counter:
+   case nir_intrinsic_end_primitive_with_counter:
+   case nir_intrinsic_set_vertex_count:
+      /* All three take the vertex count from their first source. */
+      vertex_count =
+         retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
+
+      if (instr->intrinsic == nir_intrinsic_emit_vertex_with_counter) {
+         /* const_index[0] holds the stream id. */
+         gs_emit_vertex(instr->const_index[0]);
+      } else if (instr->intrinsic == nir_intrinsic_end_primitive_with_counter) {
+         gs_end_primitive();
+      }
+      break;
+
+   case nir_intrinsic_load_primitive_id:
+      assert(gs_prog_data->include_primitive_id);
+      result = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+      emit(MOV(result, retype(brw_vec4_grf(1, 0), BRW_REGISTER_TYPE_D)));
+      break;
+
+   case nir_intrinsic_load_invocation_id: {
+      src_reg invoc = src_reg(nir_system_values[SYSTEM_VALUE_INVOCATION_ID]);
+      assert(invoc.file != BAD_FILE);
+      result = get_nir_dest(instr->dest, invoc.type);
+      emit(MOV(result, invoc));
+      break;
+   }
+
+   default:
+      vec4_visitor::nir_emit_intrinsic(instr);
+   }
+}
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_gs_visitor.cpp b/src/intel/compiler/brw_vec4_gs_visitor.cpp
new file mode 100644
index 00000000000..4a8b5be30e1
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_gs_visitor.cpp
@@ -0,0 +1,933 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_gs_visitor.cpp
+ *
+ * Geometry-shader-specific code derived from the vec4_visitor class.
+ */
+
+#include "brw_vec4_gs_visitor.h"
+#include "gen6_gs_visitor.h"
+#include "brw_fs.h"
+#include "brw_nir.h"
+#include "common/gen_debug.h"
+
+namespace brw {
+
+vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler,
+ void *log_data,
+ struct brw_gs_compile *c,
+ struct brw_gs_prog_data *prog_data,
+ const nir_shader *shader,
+ void *mem_ctx,
+ bool no_spills,
+ int shader_time_index)
+ : vec4_visitor(compiler, log_data, &c->key.tex,
+ &prog_data->base, shader, mem_ctx,
+ no_spills, shader_time_index),
+ c(c),
+ gs_prog_data(prog_data)
+{
+}
+
+
+dst_reg *
+vec4_gs_visitor::make_reg_for_system_value(int location)
+{
+ dst_reg *reg = new(mem_ctx) dst_reg(this, glsl_type::int_type);
+
+ switch (location) {
+ case SYSTEM_VALUE_INVOCATION_ID:
+ this->current_annotation = "initialize gl_InvocationID";
+ if (gs_prog_data->invocations > 1)
+ emit(GS_OPCODE_GET_INSTANCE_ID, *reg);
+ else
+ emit(MOV(*reg, brw_imm_ud(0)));
+ break;
+ default:
+ unreachable("not reached");
+ }
+
+ return reg;
+}
+
+
+int
+vec4_gs_visitor::setup_varying_inputs(int payload_reg, int *attribute_map,
+ int attributes_per_reg)
+{
+ /* For geometry shaders there are N copies of the input attributes, where N
+ * is the number of input vertices. attribute_map[BRW_VARYING_SLOT_COUNT *
+ * i + j] represents attribute j for vertex i.
+ *
+ * Note that GS inputs are read from the VUE 256 bits (2 vec4's) at a time,
+ * so the total number of input slots that will be delivered to the GS (and
+ * thus the stride of the input arrays) is urb_read_length * 2.
+ */
+ const unsigned num_input_vertices = nir->info->gs.vertices_in;
+ assert(num_input_vertices <= MAX_GS_INPUT_VERTICES);
+ unsigned input_array_stride = prog_data->urb_read_length * 2;
+
+ for (int slot = 0; slot < c->input_vue_map.num_slots; slot++) {
+ int varying = c->input_vue_map.slot_to_varying[slot];
+ for (unsigned vertex = 0; vertex < num_input_vertices; vertex++) {
+ attribute_map[BRW_VARYING_SLOT_COUNT * vertex + varying] =
+ attributes_per_reg * payload_reg + input_array_stride * vertex +
+ slot;
+ }
+ }
+
+ int regs_used = ALIGN(input_array_stride * num_input_vertices,
+ attributes_per_reg) / attributes_per_reg;
+ return payload_reg + regs_used;
+}
+
+
+void
+vec4_gs_visitor::setup_payload()
+{
+ int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
+
+ /* If we are in dual instanced or single mode, then attributes are going
+ * to be interleaved, so one register contains two attribute slots.
+ */
+ int attributes_per_reg =
+ prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
+
+ /* If a geometry shader tries to read from an input that wasn't written by
+ * the vertex shader, that produces undefined results, but it shouldn't
+ * crash anything. So initialize attribute_map to zeros--that ensures that
+ * these undefined results are read from r0.
+ */
+ memset(attribute_map, 0, sizeof(attribute_map));
+
+ int reg = 0;
+
+ /* The payload always contains important data in r0, which contains
+ * the URB handles that are passed on to the URB write at the end
+ * of the thread.
+ */
+ reg++;
+
+ /* If the shader uses gl_PrimitiveIDIn, that goes in r1. */
+ if (gs_prog_data->include_primitive_id)
+ attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg++;
+
+ reg = setup_uniforms(reg);
+
+ reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
+
+ lower_attributes_to_hw_regs(attribute_map, attributes_per_reg > 1);
+
+ this->first_non_payload_grf = reg;
+}
+
+
+void
+vec4_gs_visitor::emit_prolog()
+{
+ /* In vertex shaders, r0.2 is guaranteed to be initialized to zero. In
+ * geometry shaders, it isn't (it contains a bunch of information we don't
+ * need, like the input primitive type). We need r0.2 to be zero in order
+ * to build scratch read/write messages correctly (otherwise this value
+ * will be interpreted as a global offset, causing us to do our scratch
+ * reads/writes to garbage memory). So just set it to zero at the top of
+ * the shader.
+ */
+ this->current_annotation = "clear r0.2";
+ dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
+ vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, brw_imm_ud(0u));
+ inst->force_writemask_all = true;
+
+ /* Create a virtual register to hold the vertex count */
+ this->vertex_count = src_reg(this, glsl_type::uint_type);
+
+ /* Initialize the vertex_count register to 0 */
+ this->current_annotation = "initialize vertex_count";
+ inst = emit(MOV(dst_reg(this->vertex_count), brw_imm_ud(0u)));
+ inst->force_writemask_all = true;
+
+ if (c->control_data_header_size_bits > 0) {
+ /* Create a virtual register to hold the current set of control data
+ * bits.
+ */
+ this->control_data_bits = src_reg(this, glsl_type::uint_type);
+
+ /* If we're outputting more than 32 control data bits, then EmitVertex()
+ * will set control_data_bits to 0 after emitting the first vertex.
+ * Otherwise, we need to initialize it to 0 here.
+ */
+ if (c->control_data_header_size_bits <= 32) {
+ this->current_annotation = "initialize control data bits";
+ inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u)));
+ inst->force_writemask_all = true;
+ }
+ }
+
+ this->current_annotation = NULL;
+}
+
+void
+vec4_gs_visitor::emit_thread_end()
+{
+ if (c->control_data_header_size_bits > 0) {
+ /* During shader execution, we only ever call emit_control_data_bits()
+ * just prior to outputting a vertex. Therefore, the control data bits
+ * corresponding to the most recently output vertex still need to be
+ * emitted.
+ */
+ current_annotation = "thread end: emit control data bits";
+ emit_control_data_bits();
+ }
+
+ /* MRF 0 is reserved for the debugger, so start with message header
+ * in MRF 1.
+ */
+ int base_mrf = 1;
+
+ bool static_vertex_count = gs_prog_data->static_vertex_count != -1;
+
+ /* If the previous instruction was a URB write, we don't need to issue
+ * a second one - we can just set the EOT bit on the previous write.
+ *
+ * Skip this on Gen8+ unless there's a static vertex count, as we also
+ * need to write the vertex count out, and combining the two may not be
+ * possible (or at least not straightforward).
+ */
+ vec4_instruction *last = (vec4_instruction *) instructions.get_tail();
+ if (last && last->opcode == GS_OPCODE_URB_WRITE &&
+ !(INTEL_DEBUG & DEBUG_SHADER_TIME) &&
+ devinfo->gen >= 8 && static_vertex_count) {
+ last->urb_write_flags = BRW_URB_WRITE_EOT | last->urb_write_flags;
+ return;
+ }
+
+ current_annotation = "thread end";
+ dst_reg mrf_reg(MRF, base_mrf);
+ src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+ vec4_instruction *inst = emit(MOV(mrf_reg, r0));
+ inst->force_writemask_all = true;
+ if (devinfo->gen < 8 || !static_vertex_count)
+ emit(GS_OPCODE_SET_VERTEX_COUNT, mrf_reg, this->vertex_count);
+ if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ emit_shader_time_end();
+ inst = emit(GS_OPCODE_THREAD_END);
+ inst->base_mrf = base_mrf;
+ inst->mlen = devinfo->gen >= 8 && !static_vertex_count ? 2 : 1;
+}
+
+
+void
+vec4_gs_visitor::emit_urb_write_header(int mrf)
+{
+ /* The SEND instruction that writes the vertex data to the VUE will use
+ * per_slot_offset=true, which means that DWORDs 3 and 4 of the message
+ * header specify an offset (in multiples of 256 bits) into the URB entry
+ * at which the write should take place.
+ *
+ * So we have to prepare a message header with the appropriate offset
+ * values.
+ */
+ dst_reg mrf_reg(MRF, mrf);
+ src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+ this->current_annotation = "URB write header";
+ vec4_instruction *inst = emit(MOV(mrf_reg, r0));
+ inst->force_writemask_all = true;
+ emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count,
+ brw_imm_ud(gs_prog_data->output_vertex_size_hwords));
+}
+
+
+vec4_instruction *
+vec4_gs_visitor::emit_urb_write_opcode(bool complete)
+{
+ /* We don't care whether the vertex is complete, because in general
+ * geometry shaders output multiple vertices, and we don't terminate the
+ * thread until all vertices are complete.
+ */
+ (void) complete;
+
+ vec4_instruction *inst = emit(GS_OPCODE_URB_WRITE);
+ inst->offset = gs_prog_data->control_data_header_size_hwords;
+
+ /* We need to increment Global Offset by 1 to make room for Broadwell's
+ * extra "Vertex Count" payload at the beginning of the URB entry.
+ */
+ if (devinfo->gen >= 8 && gs_prog_data->static_vertex_count == -1)
+ inst->offset++;
+
+ inst->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
+ return inst;
+}
+
+
+/**
+ * Write out a batch of 32 control data bits from the control_data_bits
+ * register to the URB.
+ *
+ * The current value of the vertex_count register determines which DWORD in
+ * the URB receives the control data bits. The control_data_bits register is
+ * assumed to contain the correct data for the vertex that was most recently
+ * output, and all previous vertices that share the same DWORD.
+ *
+ * This function takes care of ensuring that if no vertices have been output
+ * yet, no control bits are emitted.
+ */
+void
+vec4_gs_visitor::emit_control_data_bits()
+{
+ assert(c->control_data_bits_per_vertex != 0);
+
+ /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized)
+ * granularity, we need to use two tricks to ensure that the batch of 32
+ * control data bits is written to the appropriate DWORD in the URB. To
+ * select which vec4 we are writing to, we use the "slot {0,1} offset"
+ * fields of the message header. To select which DWORD in the vec4 we are
+ * writing to, we use the channel mask fields of the message header. To
+ * avoid penalizing geometry shaders that emit a small number of vertices
+ * with extra bookkeeping, we only do each of these tricks when
+ * c->control_data_header_size_bits is large enough to make it
+ * necessary.
+ *
+ * Note: this means that if we're outputting just a single DWORD of control
+ * data bits, we'll actually replicate it four times since we won't do any
+ * channel masking. But that's not a problem since in this case the
+ * hardware only pays attention to the first DWORD.
+ */
+ enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD;
+ if (c->control_data_header_size_bits > 32)
+ urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS;
+ if (c->control_data_header_size_bits > 128)
+ urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;
+
+ /* If we are using either channel masks or a per-slot offset, then we
+ * need to figure out which DWORD we are trying to write to, using the
+ * formula:
+ *
+ * dword_index = (vertex_count - 1) * bits_per_vertex / 32
+ *
+ * Since bits_per_vertex is a power of two, and is known at compile
+ * time, this can be optimized to (util_last_bit() below is log2 + 1):
+ *
+ * dword_index = (vertex_count - 1) >> (5 - log2(bits_per_vertex))
+ */
+ src_reg dword_index(this, glsl_type::uint_type);
+ if (urb_write_flags) {
+ src_reg prev_count(this, glsl_type::uint_type);
+ emit(ADD(dst_reg(prev_count), this->vertex_count,
+ brw_imm_ud(0xffffffffu)));
+ unsigned log2_bits_per_vertex =
+ util_last_bit(c->control_data_bits_per_vertex);
+ emit(SHR(dst_reg(dword_index), prev_count,
+ brw_imm_ud(6 - log2_bits_per_vertex)));
+ }
+
+ /* Start building the URB write message. The first MRF gets a copy of
+ * R0.
+ */
+ int base_mrf = 1;
+ dst_reg mrf_reg(MRF, base_mrf);
+ src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+ vec4_instruction *inst = emit(MOV(mrf_reg, r0));
+ inst->force_writemask_all = true;
+
+ if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
+ /* Set the per-slot offset to dword_index / 4, so that we'll write to
+ * the appropriate OWORD within the control data header.
+ */
+ src_reg per_slot_offset(this, glsl_type::uint_type);
+ emit(SHR(dst_reg(per_slot_offset), dword_index, brw_imm_ud(2u)));
+ emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset,
+ brw_imm_ud(1u));
+ }
+
+ if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
+ /* Set the channel masks to 1 << (dword_index % 4), so that we'll
+ * write to the appropriate DWORD within the OWORD. We need to do
+ * this computation with force_writemask_all, otherwise garbage data
+ * from invocation 0 might clobber the mask for invocation 1 when
+ * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
+ * together.
+ */
+ src_reg channel(this, glsl_type::uint_type);
+ inst = emit(AND(dst_reg(channel), dword_index, brw_imm_ud(3u)));
+ inst->force_writemask_all = true;
+ src_reg one(this, glsl_type::uint_type);
+ inst = emit(MOV(dst_reg(one), brw_imm_ud(1u)));
+ inst->force_writemask_all = true;
+ src_reg channel_mask(this, glsl_type::uint_type);
+ inst = emit(SHL(dst_reg(channel_mask), one, channel));
+ inst->force_writemask_all = true;
+ emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
+ channel_mask);
+ emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
+ }
+
+ /* Store the control data bits in the message payload and send it. */
+ dst_reg mrf_reg2(MRF, base_mrf + 1);
+ inst = emit(MOV(mrf_reg2, this->control_data_bits));
+ inst->force_writemask_all = true;
+ inst = emit(GS_OPCODE_URB_WRITE);
+ inst->urb_write_flags = urb_write_flags;
+ /* We need to increment Global Offset by 256-bits to make room for
+ * Broadwell's extra "Vertex Count" payload at the beginning of the
+ * URB entry. Since this is an OWord message, Global Offset is counted
+ * in 128-bit units, so we must set it to 2.
+ */
+ if (devinfo->gen >= 8 && gs_prog_data->static_vertex_count == -1)
+ inst->offset = 2;
+ inst->base_mrf = base_mrf;
+ inst->mlen = 2;
+}
+
+void
+vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id)
+{
+ /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
+
+ /* Note: we are calling this *before* increasing vertex_count, so
+ * this->vertex_count == vertex_count - 1 in the formula above.
+ */
+
+ /* Stream mode uses 2 bits per vertex */
+ assert(c->control_data_bits_per_vertex == 2);
+
+ /* Must be a valid stream */
+ assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
+
+ /* Control data bits are initialized to 0 so we don't have to set any
+ * bits when sending vertices to stream 0.
+ */
+ if (stream_id == 0)
+ return;
+
+ /* reg::sid = stream_id */
+ src_reg sid(this, glsl_type::uint_type);
+ emit(MOV(dst_reg(sid), brw_imm_ud(stream_id)));
+
+ /* reg:shift_count = 2 * (vertex_count - 1) */
+ src_reg shift_count(this, glsl_type::uint_type);
+ emit(SHL(dst_reg(shift_count), this->vertex_count, brw_imm_ud(1u)));
+
+ /* Note: we're relying on the fact that the GEN SHL instruction only pays
+ * attention to the lower 5 bits of its second source argument, so on this
+ * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
+ * stream_id << ((2 * (vertex_count - 1)) % 32).
+ */
+ src_reg mask(this, glsl_type::uint_type);
+ emit(SHL(dst_reg(mask), sid, shift_count));
+ emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
+}
+
+void
+vec4_gs_visitor::gs_emit_vertex(int stream_id)
+{
+ this->current_annotation = "emit vertex: safety check";
+
+ /* Haswell and later hardware ignores the "Render Stream Select" bits
+ * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
+ * and instead sends all primitives down the pipeline for rasterization.
+ * If the SOL stage is enabled, "Render Stream Select" is honored and
+ * primitives bound to non-zero streams are discarded after stream output.
+ *
+ * Since the only purpose of primitives sent to non-zero streams is to
+ * be recorded by transform feedback, we can simply discard all geometry
+ * bound to these streams when transform feedback is disabled.
+ */
+ if (stream_id > 0 && !nir->info->has_transform_feedback_varyings)
+ return;
+
+ /* If we're outputting 32 control data bits or less, then we can wait
+ * until the shader is over to output them all. Otherwise we need to
+ * output them as we go. Now is the time to do it, since we're about to
+ * output the vertex_count'th vertex, so it's guaranteed that the
+ * control data bits associated with the (vertex_count - 1)th vertex are
+ * correct.
+ */
+ if (c->control_data_header_size_bits > 32) {
+ this->current_annotation = "emit vertex: emit control data bits";
+ /* Only emit control data bits if we've finished accumulating a batch
+ * of 32 bits. This is the case when:
+ *
+ * (vertex_count * bits_per_vertex) % 32 == 0
+ *
+ * (in other words, when the last 5 bits of vertex_count *
+ * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
+ * integer n (which is always the case, since bits_per_vertex is
+ * always 1 or 2), this is equivalent to requiring that the last 5-n
+ * bits of vertex_count are 0:
+ *
+ * vertex_count & (2^(5-n) - 1) == 0
+ *
+ * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
+ * equivalent to:
+ *
+ * vertex_count & (32 / bits_per_vertex - 1) == 0
+ */
+ vec4_instruction *inst =
+ emit(AND(dst_null_ud(), this->vertex_count,
+ brw_imm_ud(32 / c->control_data_bits_per_vertex - 1)));
+ inst->conditional_mod = BRW_CONDITIONAL_Z;
+
+ emit(IF(BRW_PREDICATE_NORMAL));
+ {
+ /* If vertex_count is 0, then no control data bits have been
+ * accumulated yet, so we skip emitting them.
+ */
+ emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u),
+ BRW_CONDITIONAL_NEQ));
+ emit(IF(BRW_PREDICATE_NORMAL));
+ emit_control_data_bits();
+ emit(BRW_OPCODE_ENDIF);
+
+ /* Reset control_data_bits to 0 so we can start accumulating a new
+ * batch.
+ *
+ * Note: in the case where vertex_count == 0, this neutralizes the
+ * effect of any call to EndPrimitive() that the shader may have
+ * made before outputting its first vertex.
+ */
+ inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u)));
+ inst->force_writemask_all = true;
+ }
+ emit(BRW_OPCODE_ENDIF);
+ }
+
+ this->current_annotation = "emit vertex: vertex data";
+ emit_vertex();
+
+ /* In stream mode we have to set control data bits for all vertices
+ * unless we have disabled control data bits completely (which we do
+ * for GL_POINTS outputs that don't use streams).
+ */
+ if (c->control_data_header_size_bits > 0 &&
+ gs_prog_data->control_data_format ==
+ GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
+ this->current_annotation = "emit vertex: Stream control data bits";
+ set_stream_control_data_bits(stream_id);
+ }
+
+ this->current_annotation = NULL;
+}
+
+void
+vec4_gs_visitor::gs_end_primitive()
+{
+ /* We can only do EndPrimitive() functionality when the control data
+ * consists of cut bits. Fortunately, the only time it isn't is when the
+ * output type is points, in which case EndPrimitive() is a no-op.
+ */
+ if (gs_prog_data->control_data_format !=
+ GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
+ return;
+ }
+
+ if (c->control_data_header_size_bits == 0)
+ return;
+
+ /* Cut bits use one bit per vertex. */
+ assert(c->control_data_bits_per_vertex == 1);
+
+ /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
+ * vertex n, 0 otherwise. So all we need to do here is mark bit
+ * (vertex_count - 1) % 32 in the cut_bits register to indicate that
+ * EndPrimitive() was called after emitting vertex (vertex_count - 1);
+ * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
+ *
+ * Note that if EndPrimitive() is called before emitting any vertices, this
+ * will cause us to set bit 31 of the control_data_bits register to 1.
+ * That's fine because:
+ *
+ * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
+ * output, so the hardware will ignore cut bit 31.
+ *
+ * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
+ * last vertex, so setting cut bit 31 has no effect (since the primitive
+ * is automatically ended when the GS terminates).
+ *
+ * - If max_vertices > 32, then gs_emit_vertex() will reset the
+ * control_data_bits register to 0 when the first vertex is emitted.
+ */
+
+ /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
+ src_reg one(this, glsl_type::uint_type);
+ emit(MOV(dst_reg(one), brw_imm_ud(1u)));
+ src_reg prev_count(this, glsl_type::uint_type);
+ emit(ADD(dst_reg(prev_count), this->vertex_count, brw_imm_ud(0xffffffffu)));
+ src_reg mask(this, glsl_type::uint_type);
+ /* Note: we're relying on the fact that the GEN SHL instruction only pays
+ * attention to the lower 5 bits of its second source argument, so on this
+ * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
+ * ((vertex_count - 1) % 32).
+ */
+ emit(SHL(dst_reg(mask), one, prev_count));
+ emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
+}
+
+static const GLuint gl_prim_to_hw_prim[GL_TRIANGLE_STRIP_ADJACENCY+1] = {
+ [GL_POINTS] =_3DPRIM_POINTLIST,
+ [GL_LINES] = _3DPRIM_LINELIST,
+ [GL_LINE_LOOP] = _3DPRIM_LINELOOP,
+ [GL_LINE_STRIP] = _3DPRIM_LINESTRIP,
+ [GL_TRIANGLES] = _3DPRIM_TRILIST,
+ [GL_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
+ [GL_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
+ [GL_QUADS] = _3DPRIM_QUADLIST,
+ [GL_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
+ [GL_POLYGON] = _3DPRIM_POLYGON,
+ [GL_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
+ [GL_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
+ [GL_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
+ [GL_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
+};
+
+extern "C" const unsigned *
+brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
+ void *mem_ctx,
+ const struct brw_gs_prog_key *key,
+ struct brw_gs_prog_data *prog_data,
+ const nir_shader *src_shader,
+ struct gl_program *prog,
+ int shader_time_index,
+ unsigned *final_assembly_size,
+ char **error_str)
+{
+ struct brw_gs_compile c;
+ memset(&c, 0, sizeof(c));
+ c.key = *key;
+
+ const bool is_scalar = compiler->scalar_stage[MESA_SHADER_GEOMETRY];
+ nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+
+ /* The GLSL linker will have already matched up GS inputs and the outputs
+ * of prior stages. The driver does extend VS outputs in some cases, but
+ * only for legacy OpenGL or Gen4-5 hardware, neither of which offer
+ * geometry shader support. So we can safely ignore that.
+ *
+ * For SSO pipelines, we use a fixed VUE map layout based on variable
+ * locations, so we can rely on rendezvous-by-location making this work.
+ */
+ GLbitfield64 inputs_read = shader->info->inputs_read;
+ brw_compute_vue_map(compiler->devinfo,
+ &c.input_vue_map, inputs_read,
+ shader->info->separate_shader);
+
+ shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, is_scalar);
+ brw_nir_lower_vue_inputs(shader, is_scalar, &c.input_vue_map);
+ brw_nir_lower_vue_outputs(shader, is_scalar);
+ shader = brw_postprocess_nir(shader, compiler, is_scalar);
+
+ prog_data->base.clip_distance_mask =
+ ((1 << shader->info->clip_distance_array_size) - 1);
+ prog_data->base.cull_distance_mask =
+ ((1 << shader->info->cull_distance_array_size) - 1) <<
+ shader->info->clip_distance_array_size;
+
+ prog_data->include_primitive_id =
+ (shader->info->system_values_read & (1 << SYSTEM_VALUE_PRIMITIVE_ID)) != 0;
+
+ prog_data->invocations = shader->info->gs.invocations;
+
+ if (compiler->devinfo->gen >= 8)
+ prog_data->static_vertex_count = nir_gs_count_vertices(shader);
+
+ if (compiler->devinfo->gen >= 7) {
+ if (shader->info->gs.output_primitive == GL_POINTS) {
+ /* When the output type is points, the geometry shader may output data
+ * to multiple streams, and EndPrimitive() has no effect. So we
+ * configure the hardware to interpret the control data as stream ID.
+ */
+ prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
+
+ /* We only have to emit control bits if we are using streams */
+ if (prog && prog->info.gs.uses_streams)
+ c.control_data_bits_per_vertex = 2;
+ else
+ c.control_data_bits_per_vertex = 0;
+ } else {
+ /* When the output type is triangle_strip or line_strip, EndPrimitive()
+ * may be used to terminate the current strip and start a new one
+ * (similar to primitive restart), and outputting data to multiple
+ * streams is not supported. So we configure the hardware to interpret
+ * the control data as EndPrimitive information (a.k.a. "cut bits").
+ */
+ prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
+
+ /* We only need to output control data if the shader actually calls
+ * EndPrimitive().
+ */
+ c.control_data_bits_per_vertex =
+ shader->info->gs.uses_end_primitive ? 1 : 0;
+ }
+ } else {
+ /* There are no control data bits in gen6. */
+ c.control_data_bits_per_vertex = 0;
+ }
+ c.control_data_header_size_bits =
+ shader->info->gs.vertices_out * c.control_data_bits_per_vertex;
+
+ /* 1 HWORD = 32 bytes = 256 bits */
+ prog_data->control_data_header_size_hwords =
+ ALIGN(c.control_data_header_size_bits, 256) / 256;
+
+ /* Compute the output vertex size.
+ *
+ * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
+ * Size (p168):
+ *
+ * [0,62] indicating [1,63] 16B units
+ *
+ * Specifies the size of each vertex stored in the GS output entry
+ * (following any Control Header data) as a number of 128-bit units
+ * (minus one).
+ *
+ * Programming Restrictions: The vertex size must be programmed as a
+ * multiple of 32B units with the following exception: Rendering is
+ * disabled (as per SOL stage state) and the vertex size output by the
+ * GS thread is 16B.
+ *
+ * If rendering is enabled (as per SOL state) the vertex size must be
+ * programmed as a multiple of 32B units. In other words, the only time
+ * software can program a vertex size with an odd number of 16B units
+ * is when rendering is disabled.
+ *
+ * Note: B=bytes in the above text.
+ *
+ * It doesn't seem worth the extra trouble to optimize the case where the
+ * vertex size is 16B (especially since this would require special-casing
+ * the GEN assembly that writes to the URB). So we just set the vertex
+ * size to a multiple of 32B (2 vec4's) in all cases.
+ *
+ * The maximum output vertex size is 62*16 = 992 bytes (31 hwords). We
+ * budget that as follows:
+ *
+ * 512 bytes for varyings (a varying component is 4 bytes and
+ * gl_MaxGeometryOutputComponents = 128)
+ * 16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
+ * bytes)
+ * 16 bytes overhead for gl_Position (we allocate it a slot in the VUE
+ * even if it's not used)
+ * 32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
+ * whenever clip planes are enabled, even if the shader doesn't
+ * write to gl_ClipDistance)
+ * 16 bytes overhead since the VUE size must be a multiple of 32 bytes
+ * (see below)--this causes up to 1 VUE slot to be wasted
+ * 400 bytes available for varying packing overhead
+ *
+ * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
+ * per interpolation type, so this is plenty.
+ *
+ */
+ unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16;
+ assert(compiler->devinfo->gen == 6 ||
+ output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
+ prog_data->output_vertex_size_hwords =
+ ALIGN(output_vertex_size_bytes, 32) / 32;
+
+ /* Compute URB entry size. The maximum allowed URB entry size is 32k.
+ * That divides up as follows:
+ *
+ * 64 bytes for the control data header (cut indices or StreamID bits)
+ * 4096 bytes for varyings (a varying component is 4 bytes and
+ * gl_MaxGeometryTotalOutputComponents = 1024)
+ * 4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
+ * bytes/vertex and gl_MaxGeometryOutputVertices is 256)
+ * 4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
+ * even if it's not used)
+ * 8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
+ * whenever clip planes are enabled, even if the shader doesn't
+ * write to gl_ClipDistance)
+ * 4096 bytes overhead since the VUE size must be a multiple of 32
+ * bytes (see above)--this causes up to 1 VUE slot to be wasted
+ * 8128 bytes available for varying packing overhead
+ *
+ * Worst-case varying packing overhead is 3/4 of a varying slot per
+ * interpolation type, which works out to 3072 bytes, so this would allow
+ * us to accommodate 2 interpolation types without any danger of running
+ * out of URB space.
+ *
+ * In practice, the risk of running out of URB space is very small, since
+ * the above figures are all worst-case, and most of them scale with the
+ * number of output vertices. So we'll just calculate the amount of space
+ * we need, and if it's too large, fail to compile.
+ *
+ * The above is for gen7+ where we have a single URB entry that will hold
+ * all the output. In gen6, we will have to allocate URB entries for every
+ * vertex we emit, so our URB entries only need to be large enough to hold
+ * a single vertex. Also, gen6 does not have a control data header.
+ */
+ unsigned output_size_bytes;
+ if (compiler->devinfo->gen >= 7) {
+ output_size_bytes =
+ prog_data->output_vertex_size_hwords * 32 * shader->info->gs.vertices_out;
+ output_size_bytes += 32 * prog_data->control_data_header_size_hwords;
+ } else {
+ output_size_bytes = prog_data->output_vertex_size_hwords * 32;
+ }
+
+ /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
+ * which comes before the control header.
+ */
+ if (compiler->devinfo->gen >= 8)
+ output_size_bytes += 32;
+
+ /* Shaders can technically set max_vertices = 0, at which point we
+ * may have a URB size of 0 bytes. Nothing good can come from that,
+ * so enforce a minimum size.
+ */
+ if (output_size_bytes == 0)
+ output_size_bytes = 1;
+
+ unsigned max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES;
+ if (compiler->devinfo->gen == 6)
+ max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES;
+ if (output_size_bytes > max_output_size_bytes)
+ return NULL;
+
+
+ /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
+ * a multiple of 128 bytes in gen6.
+ */
+ if (compiler->devinfo->gen >= 7)
+ prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+ else
+ prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
+
+ assert(shader->info->gs.output_primitive < ARRAY_SIZE(gl_prim_to_hw_prim));
+ prog_data->output_topology =
+ gl_prim_to_hw_prim[shader->info->gs.output_primitive];
+
+ prog_data->vertices_in = shader->info->gs.vertices_in;
+
+ /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
+ * need to program a URB read length of ceiling(num_slots / 2).
+ */
+ prog_data->base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
+
+ /* Now that prog_data setup is done, we are ready to actually compile the
+ * program.
+ */
+ if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
+ fprintf(stderr, "GS Input ");
+ brw_print_vue_map(stderr, &c.input_vue_map);
+ fprintf(stderr, "GS Output ");
+ brw_print_vue_map(stderr, &prog_data->base.vue_map);
+ }
+
+ if (is_scalar) {
+ fs_visitor v(compiler, log_data, mem_ctx, &c, prog_data, shader,
+ shader_time_index);
+ if (v.run_gs()) {
+ prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+ prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
+
+ fs_generator g(compiler, log_data, mem_ctx, &c.key,
+ &prog_data->base.base, v.promoted_constants,
+ false, MESA_SHADER_GEOMETRY);
+ if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
+ const char *label =
+ shader->info->label ? shader->info->label : "unnamed";
+ char *name = ralloc_asprintf(mem_ctx, "%s geometry shader %s",
+ label, shader->info->name);
+ g.enable_debug(name);
+ }
+ g.generate_code(v.cfg, 8);
+ return g.get_assembly(final_assembly_size);
+ }
+ }
+
+ if (compiler->devinfo->gen >= 7) {
+ /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
+ * so without spilling. If the GS invocations count > 1, then we can't use
+ * dual object mode.
+ */
+ if (prog_data->invocations <= 1 &&
+ likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
+ prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
+
+ vec4_gs_visitor v(compiler, log_data, &c, prog_data, shader,
+ mem_ctx, true /* no_spills */, shader_time_index);
+ if (v.run()) {
+ return brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
+ shader, &prog_data->base, v.cfg,
+ final_assembly_size);
+ }
+ }
+ }
+
+ /* Either we failed to compile in DUAL_OBJECT mode (probably because it
+ * would have required spilling) or DUAL_OBJECT mode is disabled. So fall
+ * back to DUAL_INSTANCE or SINGLE mode, which consumes fewer registers.
+ *
+ * FIXME: Single dispatch mode requires that the driver can handle
+ * interleaving of input registers, but this is already supported (dual
+ * instance mode has the same requirement). However, to take full advantage
+ * of single dispatch mode to reduce register pressure we would also need to
+ * do interleaved outputs, but currently, the vec4 visitor and generator
+ * classes do not support this, so at the moment register pressure in
+ * single and dual instance modes is the same.
+ *
+ * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
+ * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
+ * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
+ * is also supported. When InstanceCount=1 (one instance per object) software
+ * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
+ * the best choice for performance, followed by SINGLE mode."
+ *
+ * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
+ * mode is more performant when invocations > 1. Gen6 only supports
+ * SINGLE mode.
+ */
+ if (prog_data->invocations <= 1 || compiler->devinfo->gen < 7)
+ prog_data->base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE;
+ else
+ prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE;
+
+ vec4_gs_visitor *gs = NULL;
+ const unsigned *ret = NULL;
+
+ if (compiler->devinfo->gen >= 7)
+ gs = new vec4_gs_visitor(compiler, log_data, &c, prog_data,
+ shader, mem_ctx, false /* no_spills */,
+ shader_time_index);
+ else
+ gs = new gen6_gs_visitor(compiler, log_data, &c, prog_data, prog,
+ shader, mem_ctx, false /* no_spills */,
+ shader_time_index);
+
+ if (!gs->run()) {
+ if (error_str)
+ *error_str = ralloc_strdup(mem_ctx, gs->fail_msg);
+ } else {
+ ret = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, shader,
+ &prog_data->base, gs->cfg,
+ final_assembly_size);
+ }
+
+ delete gs;
+ return ret;
+}
+
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_gs_visitor.h b/src/intel/compiler/brw_vec4_gs_visitor.h
new file mode 100644
index 00000000000..09221f928d1
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_gs_visitor.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_gs_visitor.h
+ *
+ * Geometry-shader-specific code derived from the vec4_visitor class.
+ */
+
+#ifndef BRW_VEC4_GS_VISITOR_H
+#define BRW_VEC4_GS_VISITOR_H
+
+#include "brw_vec4.h"
+
+#define MAX_GS_INPUT_VERTICES 6
+
+#ifdef __cplusplus
+namespace brw {
+
+/**
+ * Geometry-shader flavour of vec4_visitor.
+ *
+ * Declares the virtual hooks listed below (input and system-value setup,
+ * payload setup, prolog/thread end, URB write helpers, and the
+ * gs_emit_vertex()/gs_end_primitive() entry points) plus a few GS-only
+ * helpers and pieces of state.
+ */
+class vec4_gs_visitor : public vec4_visitor
+{
+public:
+ vec4_gs_visitor(const struct brw_compiler *compiler,
+ void *log_data,
+ struct brw_gs_compile *c,
+ struct brw_gs_prog_data *prog_data,
+ const nir_shader *shader,
+ void *mem_ctx,
+ bool no_spills,
+ int shader_time_index);
+
+ virtual void nir_setup_inputs();
+ virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
+
+protected:
+ /* Virtual hooks; all of them are declared virtual so that further
+ * subclasses can replace them.
+ */
+ virtual dst_reg *make_reg_for_system_value(int location);
+ virtual void setup_payload();
+ virtual void emit_prolog();
+ virtual void emit_thread_end();
+ virtual void emit_urb_write_header(int mrf);
+ virtual vec4_instruction *emit_urb_write_opcode(bool complete);
+ virtual void gs_emit_vertex(int stream_id);
+ virtual void gs_end_primitive();
+ virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
+
+protected:
+ /* Non-virtual helpers and state shared with subclasses. */
+ int setup_varying_inputs(int payload_reg, int *attribute_map,
+ int attributes_per_reg);
+ void emit_control_data_bits();
+ void set_stream_control_data_bits(unsigned stream_id);
+
+ /* NOTE(review): presumably the running count of emitted vertices and the
+ * accumulated control data bits; confirm against the .cpp.
+ */
+ src_reg vertex_count;
+ src_reg control_data_bits;
+ /* Both pointers are themselves const, so they must be bound at
+ * construction time and cannot be re-seated afterwards.
+ */
+ const struct brw_gs_compile * const c;
+ struct brw_gs_prog_data * const gs_prog_data;
+};
+
+} /* namespace brw */
+#endif /* __cplusplus */
+
+#endif /* BRW_VEC4_GS_VISITOR_H */
diff --git a/src/intel/compiler/brw_vec4_live_variables.cpp b/src/intel/compiler/brw_vec4_live_variables.cpp
new file mode 100644
index 00000000000..73f658cd8fa
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_live_variables.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Eric Anholt <[email protected]>
+ *
+ */
+
+#include "brw_cfg.h"
+#include "brw_vec4_live_variables.h"
+
+using namespace brw;
+
+/** @file brw_vec4_live_variables.cpp
+ *
+ * Support for computing at the basic block level which variables
+ * (virtual GRFs in our case) are live at entry and exit.
+ *
+ * See Muchnick's Advanced Compiler Design and Implementation, section
+ * 14.1 (p444).
+ */
+
+/**
+ * Sets up the use[] and def[] arrays.
+ *
+ * The basic-block-level live variable analysis needs to know which
+ * variables get used before they're completely defined, and which
+ * variables are completely defined before they're used.
+ *
+ * We independently track each channel of a vec4. This is because we need to
+ * be able to recognize a sequence like:
+ *
+ * ...
+ * DP4 tmp.x a b;
+ * DP4 tmp.y c d;
+ * MUL result.xy tmp.xy e.xy
+ * ...
+ *
+ * as having tmp live only across that sequence (assuming it's used nowhere
+ * else), because it's a common pattern. A more conservative approach that
+ * doesn't get tmp marked as deffed in this block will tend to result in
+ * spilling.
+ */
+void
+vec4_live_variables::setup_def_use()
+{
+ /* Running instruction index; only used to sanity-check the block
+ * start_ip/end_ip bookkeeping in the assertions below.
+ */
+ int ip = 0;
+
+ foreach_block (block, cfg) {
+ assert(ip == block->start_ip);
+ if (block->num > 0)
+ assert(cfg->blocks[block->num - 1]->end_ip == ip - 1);
+
+ foreach_inst_in_block(vec4_instruction, inst, block) {
+ struct block_data *bd = &block_data[block->num];
+
+ /* Set use[] for this instruction */
+ /* Each of the three sources is walked in 16-byte chunks (j) and, within
+ * a chunk, per channel (c). A variable only counts as "used" if the
+ * block has not already fully defined it.
+ */
+ for (unsigned int i = 0; i < 3; i++) {
+ if (inst->src[i].file == VGRF) {
+ for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_read(i), 16); j++) {
+ for (int c = 0; c < 4; c++) {
+ const unsigned v = var_from_reg(alloc, inst->src[i], c, j);
+ if (!BITSET_TEST(bd->def, v))
+ BITSET_SET(bd->use, v);
+ }
+ }
+ }
+ }
+ /* Same rule for the four flag channels. */
+ for (unsigned c = 0; c < 4; c++) {
+ if (inst->reads_flag(c) &&
+ !BITSET_TEST(bd->flag_def, c)) {
+ BITSET_SET(bd->flag_use, c);
+ }
+ }
+
+ /* Check for unconditional writes to whole registers. These
+ * are the things that screen off preceding definitions of a
+ * variable, and thus qualify for being in def[].
+ */
+ /* A write is accepted when the instruction is unpredicated or is a SEL;
+ * only channels enabled in the destination writemask are marked, and
+ * only if they were not already recorded as used-before-defined.
+ */
+ if (inst->dst.file == VGRF &&
+ (!inst->predicate || inst->opcode == BRW_OPCODE_SEL)) {
+ for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) {
+ for (int c = 0; c < 4; c++) {
+ if (inst->dst.writemask & (1 << c)) {
+ const unsigned v = var_from_reg(alloc, inst->dst, c, i);
+ if (!BITSET_TEST(bd->use, v))
+ BITSET_SET(bd->def, v);
+ }
+ }
+ }
+ }
+ /* Flag definitions are tracked per channel using the destination
+ * writemask of the flag-writing instruction.
+ */
+ if (inst->writes_flag()) {
+ for (unsigned c = 0; c < 4; c++) {
+ if ((inst->dst.writemask & (1 << c)) &&
+ !BITSET_TEST(bd->flag_use, c)) {
+ BITSET_SET(bd->flag_def, c);
+ }
+ }
+ }
+
+ ip++;
+ }
+ }
+}
+
+/**
+ * The algorithm incrementally sets bits in liveout and livein,
+ * propagating it through control flow. It will eventually terminate
+ * because it only ever adds bits, and stops when no bits are added in
+ * a pass.
+ */
+void
+vec4_live_variables::compute_live_variables()
+{
+ /* Set whenever a pass adds at least one bit; the loop ends after the
+ * first pass that changes nothing.
+ */
+ bool cont = true;
+
+ while (cont) {
+ cont = false;
+
+ /* Blocks are visited in reverse order on every pass. */
+ foreach_block_reverse (block, cfg) {
+ struct block_data *bd = &block_data[block->num];
+
+ /* Update liveout */
+ /* liveout(block) |= livein(child) for every successor of the block. */
+ foreach_list_typed(bblock_link, child_link, link, &block->children) {
+ struct block_data *child_bd = &block_data[child_link->block->num];
+
+ for (int i = 0; i < bitset_words; i++) {
+ BITSET_WORD new_liveout = (child_bd->livein[i] &
+ ~bd->liveout[i]);
+ if (new_liveout) {
+ bd->liveout[i] |= new_liveout;
+ cont = true;
+ }
+ }
+ /* The flag channels fit in a single bitset word. */
+ BITSET_WORD new_liveout = (child_bd->flag_livein[0] &
+ ~bd->flag_liveout[0]);
+ if (new_liveout) {
+ bd->flag_liveout[0] |= new_liveout;
+ cont = true;
+ }
+ }
+
+ /* Update livein */
+ /* livein(block) |= use(block) | (liveout(block) & ~def(block)) */
+ for (int i = 0; i < bitset_words; i++) {
+ BITSET_WORD new_livein = (bd->use[i] |
+ (bd->liveout[i] &
+ ~bd->def[i]));
+ if (new_livein & ~bd->livein[i]) {
+ bd->livein[i] |= new_livein;
+ cont = true;
+ }
+ }
+ BITSET_WORD new_livein = (bd->flag_use[0] |
+ (bd->flag_liveout[0] &
+ ~bd->flag_def[0]));
+ if (new_livein & ~bd->flag_livein[0]) {
+ bd->flag_livein[0] |= new_livein;
+ cont = true;
+ }
+ }
+ }
+}
+
+/**
+ * Builds the per-block liveness sets for \p cfg.
+ *
+ * All storage lives in a private ralloc context released by the destructor.
+ * The analysis itself (setup_def_use() followed by
+ * compute_live_variables()) runs as part of construction.
+ */
+vec4_live_variables::vec4_live_variables(const simple_allocator &alloc,
+                                         cfg_t *cfg)
+   : alloc(alloc), cfg(cfg)
+{
+   mem_ctx = ralloc_context(NULL);
+
+   /* Eight variables are tracked per allocated register unit. */
+   num_vars = alloc.total_size * 8;
+   bitset_words = BITSET_WORDS(num_vars);
+
+   block_data = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks);
+
+   for (int b = 0; b < cfg->num_blocks; b++) {
+      struct block_data *const bd = &block_data[b];
+
+      bd->def = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+      bd->use = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+      bd->livein = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+      bd->liveout = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+
+      /* The flag sets are embedded one-word arrays; clear them explicitly. */
+      bd->flag_def[0] = 0;
+      bd->flag_use[0] = 0;
+      bd->flag_livein[0] = 0;
+      bd->flag_liveout[0] = 0;
+   }
+
+   setup_def_use();
+   compute_live_variables();
+}
+
+/** Releases the private ralloc context created by the constructor. */
+vec4_live_variables::~vec4_live_variables()
+{
+   ralloc_free(this->mem_ctx);
+}
+
+#define MAX_INSTRUCTION (1 << 30)
+
+/**
+ * Computes a conservative start/end of the live intervals for each virtual GRF.
+ *
+ * We could expose per-channel live intervals to the consumer based on the
+ * information we computed in vec4_live_variables, except that our only
+ * current user is virtual_grf_interferes(). So we instead union the
+ * per-channel ranges into a per-vgrf range for virtual_grf_start[] and
+ * virtual_grf_end[].
+ *
+ * We could potentially have virtual_grf_interferes() do the test per-channel,
+ * which would let some interesting register allocation occur (particularly on
+ * code-generated GLSL sequences from the Cg compiler which does register
+ * allocation at the GLSL level and thus reuses components of the variable
+ * with distinct lifetimes). But right now the complexity of doing so doesn't
+ * seem worth it, since having virtual_grf_interferes() be cheap is important
+ * for register allocation performance.
+ */
+void
+vec4_visitor::calculate_live_intervals()
+{
+ /* Cached result still valid: nothing to do until
+ * invalidate_live_intervals() clears live_intervals.
+ */
+ if (this->live_intervals)
+ return;
+
+ /* Allocate the new arrays before freeing the previous ones, then publish
+ * them. One start/end slot per variable (8 per allocated register unit).
+ */
+ int *start = ralloc_array(mem_ctx, int, this->alloc.total_size * 8);
+ int *end = ralloc_array(mem_ctx, int, this->alloc.total_size * 8);
+ ralloc_free(this->virtual_grf_start);
+ ralloc_free(this->virtual_grf_end);
+ this->virtual_grf_start = start;
+ this->virtual_grf_end = end;
+
+ /* Empty interval sentinel: start past any real ip, end before the first. */
+ for (unsigned i = 0; i < this->alloc.total_size * 8; i++) {
+ start[i] = MAX_INSTRUCTION;
+ end[i] = -1;
+ }
+
+ /* Start by setting up the intervals with no knowledge of control
+ * flow.
+ */
+ int ip = 0;
+ foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+ /* Every channel of every 16-byte chunk read from a VGRF source extends
+ * the interval to this ip (the swizzle is resolved by var_from_reg()).
+ */
+ for (unsigned int i = 0; i < 3; i++) {
+ if (inst->src[i].file == VGRF) {
+ for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_read(i), 16); j++) {
+ for (int c = 0; c < 4; c++) {
+ const unsigned v = var_from_reg(alloc, inst->src[i], c, j);
+ start[v] = MIN2(start[v], ip);
+ end[v] = ip;
+ }
+ }
+ }
+ }
+
+ /* Writes only touch the channels enabled in the destination writemask. */
+ if (inst->dst.file == VGRF) {
+ for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) {
+ for (int c = 0; c < 4; c++) {
+ if (inst->dst.writemask & (1 << c)) {
+ const unsigned v = var_from_reg(alloc, inst->dst, c, i);
+ start[v] = MIN2(start[v], ip);
+ end[v] = ip;
+ }
+ }
+ }
+ }
+
+ ip++;
+ }
+
+ /* Now, extend those intervals using our analysis of control flow.
+ *
+ * The control flow-aware analysis was done at a channel level, while at
+ * this point we're distilling it down to vgrfs.
+ */
+ this->live_intervals = new(mem_ctx) vec4_live_variables(alloc, cfg);
+
+ foreach_block (block, cfg) {
+ struct block_data *bd = &live_intervals->block_data[block->num];
+
+ /* A variable live into a block must cover the block's first ip; one
+ * live out of a block must cover its last ip.
+ */
+ for (int i = 0; i < live_intervals->num_vars; i++) {
+ if (BITSET_TEST(bd->livein, i)) {
+ start[i] = MIN2(start[i], block->start_ip);
+ end[i] = MAX2(end[i], block->start_ip);
+ }
+
+ if (BITSET_TEST(bd->liveout, i)) {
+ start[i] = MIN2(start[i], block->end_ip);
+ end[i] = MAX2(end[i], block->end_ip);
+ }
+ }
+ }
+}
+
+/**
+ * Drops the cached liveness analysis so that the next call to
+ * calculate_live_intervals() recomputes it.
+ */
+void
+vec4_visitor::invalidate_live_intervals()
+{
+   ralloc_free(this->live_intervals);
+   this->live_intervals = NULL;
+}
+
+/**
+ * Returns the smallest virtual_grf_start[] entry among the \p n variables
+ * beginning at index \p v, or INT_MAX when \p n is zero.
+ */
+int
+vec4_visitor::var_range_start(unsigned v, unsigned n) const
+{
+   int earliest = INT_MAX;
+
+   for (unsigned k = 0; k < n; k++) {
+      const int candidate = virtual_grf_start[v + k];
+      if (candidate < earliest)
+         earliest = candidate;
+   }
+
+   return earliest;
+}
+
+/**
+ * Returns the largest virtual_grf_end[] entry among the \p n variables
+ * beginning at index \p v, or INT_MIN when \p n is zero.
+ */
+int
+vec4_visitor::var_range_end(unsigned v, unsigned n) const
+{
+   int latest = INT_MIN;
+
+   for (unsigned k = 0; k < n; k++) {
+      const int candidate = virtual_grf_end[v + k];
+      if (candidate > latest)
+         latest = candidate;
+   }
+
+   return latest;
+}
+
+/**
+ * Returns true when the live ranges of virtual GRFs \p a and \p b overlap.
+ *
+ * Each VGRF is expanded to its run of variables (8 per allocated register
+ * unit). The two do not interfere when one range ends at or before the
+ * point where the other one starts.
+ */
+bool
+vec4_visitor::virtual_grf_interferes(int a, int b)
+{
+   const unsigned a_first = 8 * alloc.offsets[a];
+   const unsigned a_count = 8 * alloc.sizes[a];
+   const unsigned b_first = 8 * alloc.offsets[b];
+   const unsigned b_count = 8 * alloc.sizes[b];
+
+   /* a is finished by the time b begins. */
+   if (var_range_end(a_first, a_count) <= var_range_start(b_first, b_count))
+      return false;
+
+   /* Otherwise they overlap unless b is finished by the time a begins. */
+   return var_range_end(b_first, b_count) > var_range_start(a_first, a_count);
+}
diff --git a/src/intel/compiler/brw_vec4_live_variables.h b/src/intel/compiler/brw_vec4_live_variables.h
new file mode 100644
index 00000000000..8807c453743
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_live_variables.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Eric Anholt <[email protected]>
+ *
+ */
+
+#include "util/bitset.h"
+#include "brw_vec4.h"
+
+namespace brw {
+
+/**
+ * Liveness sets for one basic block.
+ *
+ * The four pointer members are bitsets indexed by variable number (see
+ * var_from_reg()); the flag_* members are one-word bitsets indexed by flag
+ * channel.
+ */
+struct block_data {
+ /**
+ * Which variables are defined before being used in the block.
+ *
+ * Note that for our purposes, "defined" means unconditionally, completely
+ * defined.
+ */
+ BITSET_WORD *def;
+
+ /**
+ * Which variables are used before being defined in the block.
+ */
+ BITSET_WORD *use;
+
+ /** Which variables are live at the entry point of the block. */
+ BITSET_WORD *livein;
+
+ /** Which variables are live at the exit point of the block. */
+ BITSET_WORD *liveout;
+
+ /** Flag-channel counterparts of def, use, livein and liveout. */
+ BITSET_WORD flag_def[1];
+ BITSET_WORD flag_use[1];
+ BITSET_WORD flag_livein[1];
+ BITSET_WORD flag_liveout[1];
+};
+
+/**
+ * Basic-block-level live variable analysis for the vec4 backend.
+ *
+ * The whole analysis is performed by the constructor; afterwards the
+ * results can be read from block_data[].
+ */
+class vec4_live_variables {
+public:
+ DECLARE_RALLOC_CXX_OPERATORS(vec4_live_variables)
+
+ vec4_live_variables(const simple_allocator &alloc, cfg_t *cfg);
+ ~vec4_live_variables();
+
+ /** Number of tracked variables. */
+ int num_vars;
+ /** Number of BITSET_WORDs needed to hold num_vars bits. */
+ int bitset_words;
+
+ /** Per-basic-block information on live variables */
+ struct block_data *block_data;
+
+protected:
+ /** Fills in the per-block def/use sets. */
+ void setup_def_use();
+ /** Propagates livein/liveout through the CFG until nothing changes. */
+ void compute_live_variables();
+
+ const simple_allocator &alloc;
+ cfg_t *cfg;
+ /** ralloc context owning the analysis storage. */
+ void *mem_ctx;
+};
+
+/* Returns the variable index for the k-th dword of the c-th component of
+ * register reg.
+ *
+ * This is the source-register variant: the component actually read is
+ * looked up through the register's swizzle. csize is the number of dwords
+ * one component occupies (type size rounded up to a multiple of 4 bytes).
+ * Variables are numbered 8 per register unit, starting at the VGRF's
+ * allocator offset plus the register's own offset in whole registers.
+ */
+inline unsigned
+var_from_reg(const simple_allocator &alloc, const src_reg &reg,
+ unsigned c = 0, unsigned k = 0)
+{
+ assert(reg.file == VGRF && reg.nr < alloc.count && c < 4);
+ const unsigned csize = DIV_ROUND_UP(type_sz(reg.type), 4);
+ unsigned result =
+ 8 * (alloc.offsets[reg.nr] + reg.offset / REG_SIZE) +
+ (BRW_GET_SWZ(reg.swizzle, c) + k / csize * 4) * csize + k % csize;
+ /* Do not exceed the limit for this register */
+ assert(result < 8 * (alloc.offsets[reg.nr] + alloc.sizes[reg.nr]));
+ return result;
+}
+
+/* Destination-register variant of var_from_reg(): returns the variable
+ * index for the k-th dword of the c-th component of register reg.
+ *
+ * Unlike the source variant no swizzle is applied; c is used directly as
+ * the component index.
+ */
+inline unsigned
+var_from_reg(const simple_allocator &alloc, const dst_reg &reg,
+ unsigned c = 0, unsigned k = 0)
+{
+ assert(reg.file == VGRF && reg.nr < alloc.count && c < 4);
+ /* Dwords per component (type size rounded up to a multiple of 4 bytes). */
+ const unsigned csize = DIV_ROUND_UP(type_sz(reg.type), 4);
+ unsigned result =
+ 8 * (alloc.offsets[reg.nr] + reg.offset / REG_SIZE) +
+ (c + k / csize * 4) * csize + k % csize;
+ /* Do not exceed the limit for this register */
+ assert(result < 8 * (alloc.offsets[reg.nr] + alloc.sizes[reg.nr]));
+ return result;
+}
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp
new file mode 100644
index 00000000000..4e88b795049
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_nir.cpp
@@ -0,0 +1,2407 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir.h"
+#include "brw_vec4.h"
+#include "brw_vec4_builder.h"
+#include "brw_vec4_surface_builder.h"
+
+using namespace brw;
+using namespace brw::surface_access;
+
+namespace brw {
+
+/**
+ * Entry point of the NIR -> vec4 translation: sets up uniforms and system
+ * values, then emits every function implementation of the shader (each of
+ * which is asserted to be "main").
+ */
+void
+vec4_visitor::emit_nir_code()
+{
+   /* Uniform setup is skipped when the shader declares none. */
+   if (nir->num_uniforms > 0)
+      nir_setup_uniforms();
+
+   nir_setup_system_values();
+
+   nir_foreach_function(func, nir) {
+      assert(strcmp(func->name, "main") == 0);
+      assert(func->impl);
+      nir_emit_impl(func->impl);
+   }
+}
+
+/**
+ * Makes sure the system value read by \p instr has a register assigned in
+ * nir_system_values[]. Intrinsics that do not load one of the handled
+ * system values are ignored.
+ */
+void
+vec4_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{
+   int sysval;
+
+   /* Map the intrinsic to the system value it reads. */
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_vertex_id:
+      unreachable("should be lowered by lower_vertex_id().");
+
+   case nir_intrinsic_load_vertex_id_zero_base:
+      sysval = SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
+      break;
+
+   case nir_intrinsic_load_base_vertex:
+      sysval = SYSTEM_VALUE_BASE_VERTEX;
+      break;
+
+   case nir_intrinsic_load_instance_id:
+      sysval = SYSTEM_VALUE_INSTANCE_ID;
+      break;
+
+   case nir_intrinsic_load_base_instance:
+      sysval = SYSTEM_VALUE_BASE_INSTANCE;
+      break;
+
+   case nir_intrinsic_load_draw_id:
+      sysval = SYSTEM_VALUE_DRAW_ID;
+      break;
+
+   default:
+      return;
+   }
+
+   /* Only create the register the first time the value is seen. */
+   dst_reg &slot = nir_system_values[sysval];
+   if (slot.file == BAD_FILE)
+      slot = *make_reg_for_system_value(sysval);
+}
+
+/**
+ * Feeds every intrinsic instruction of \p block to
+ * vec4_visitor::nir_setup_system_value_intrinsic(). Always returns true.
+ */
+static bool
+setup_system_values_block(nir_block *block, vec4_visitor *v)
+{
+   nir_foreach_instr(instr, block) {
+      if (instr->type == nir_instr_type_intrinsic)
+         v->nir_setup_system_value_intrinsic(nir_instr_as_intrinsic(instr));
+   }
+
+   return true;
+}
+
+/**
+ * Allocates nir_system_values[], resets every slot to a default-constructed
+ * dst_reg, and then scans all blocks of the shader so that each system
+ * value that is actually read gets a register.
+ */
+void
+vec4_visitor::nir_setup_system_values()
+{
+   nir_system_values = ralloc_array(mem_ctx, dst_reg, SYSTEM_VALUE_MAX);
+   for (unsigned sv = 0; sv < SYSTEM_VALUE_MAX; sv++)
+      nir_system_values[sv] = dst_reg();
+
+   nir_foreach_function(func, nir) {
+      assert(strcmp(func->name, "main") == 0);
+      assert(func->impl);
+      nir_foreach_block(blk, func->impl) {
+         setup_system_values_block(blk, this);
+      }
+   }
+}
+
+/**
+ * Derives the visitor's uniform count from the NIR shader.
+ *
+ * NOTE(review): the division by 16 presumably converts a byte count into
+ * 16-byte vec4 slots; confirm the unit of nir->num_uniforms.
+ */
+void
+vec4_visitor::nir_setup_uniforms()
+{
+   this->uniforms = nir->num_uniforms / 16;
+}
+
+/**
+ * Emits one NIR function implementation: allocates a VGRF for every NIR
+ * register, allocates the SSA value table, and walks the body.
+ */
+void
+vec4_visitor::nir_emit_impl(nir_function_impl *impl)
+{
+   nir_locals = ralloc_array(mem_ctx, dst_reg, impl->reg_alloc);
+   for (unsigned r = 0; r < impl->reg_alloc; r++)
+      nir_locals[r] = dst_reg();
+
+   foreach_list_typed(nir_register, reg, node, &impl->registers) {
+      /* Non-array registers report zero elements; treat them as one. */
+      const unsigned array_elems =
+         reg->num_array_elems ? reg->num_array_elems : 1;
+      /* One VGRF per 32 bits of component size, per array element. */
+      const unsigned num_regs = array_elems * DIV_ROUND_UP(reg->bit_size, 32);
+
+      dst_reg &local = nir_locals[reg->index];
+      local = dst_reg(VGRF, alloc.allocate(num_regs));
+      if (reg->bit_size == 64)
+         local.type = BRW_REGISTER_TYPE_DF;
+   }
+
+   /* Entries are filled in as the defining instructions are emitted. */
+   nir_ssa_values = ralloc_array(mem_ctx, dst_reg, impl->ssa_alloc);
+
+   nir_emit_cf_list(&impl->body);
+}
+
+/**
+ * Emits every control-flow node of \p list in order, dispatching on the
+ * node kind (if, loop or block). Any other kind is unreachable.
+ */
+void
+vec4_visitor::nir_emit_cf_list(exec_list *list)
+{
+   exec_list_validate(list);
+
+   foreach_list_typed(nir_cf_node, node, node, list) {
+      if (node->type == nir_cf_node_if) {
+         nir_emit_if(nir_cf_node_as_if(node));
+      } else if (node->type == nir_cf_node_loop) {
+         nir_emit_loop(nir_cf_node_as_loop(node));
+      } else if (node->type == nir_cf_node_block) {
+         nir_emit_block(nir_cf_node_as_block(node));
+      } else {
+         unreachable("Invalid CFG node block");
+      }
+   }
+}
+
+/**
+ * Emits a NIR if statement as IF / then-list / ELSE / else-list / ENDIF,
+ * predicated on the statement's condition.
+ */
+void
+vec4_visitor::nir_emit_if(nir_if *if_stmt)
+{
+   /* Put the condition in f0: a MOV to the null register with an NZ
+    * conditional modifier.
+    */
+   src_reg cond = get_nir_src(if_stmt->condition, BRW_REGISTER_TYPE_D, 1);
+   vec4_instruction *const flag_mov = emit(MOV(dst_null_d(), cond));
+   flag_mov->conditional_mod = BRW_CONDITIONAL_NZ;
+
+   /* Only one component of the condition was fetched, so predicating on
+    * the replicated X channel is enough.
+    */
+   emit(IF(BRW_PREDICATE_ALIGN16_REPLICATE_X));
+   nir_emit_cf_list(&if_stmt->then_list);
+
+   /* The ELSE is always emitted; when the else list is empty, dead
+    * control-flow elimination will remove it.
+    */
+   emit(BRW_OPCODE_ELSE);
+   nir_emit_cf_list(&if_stmt->else_list);
+
+   emit(BRW_OPCODE_ENDIF);
+}
+
+/** Emits a NIR loop as DO, the loop body, then WHILE. */
+void
+vec4_visitor::nir_emit_loop(nir_loop *loop)
+{
+   emit(BRW_OPCODE_DO);
+   nir_emit_cf_list(&loop->body);
+   emit(BRW_OPCODE_WHILE);
+}
+
+/** Emits every instruction of \p block in program order. */
+void
+vec4_visitor::nir_emit_block(nir_block *block)
+{
+   nir_foreach_instr(cur, block) {
+      nir_emit_instr(cur);
+   }
+}
+
+/**
+ * Dispatches one NIR instruction to the matching nir_emit_*() handler.
+ *
+ * base_ir is updated first so that anything emitted is attributed to this
+ * instruction. Instruction types without a handler only produce a message
+ * on stderr.
+ */
+void
+vec4_visitor::nir_emit_instr(nir_instr *instr)
+{
+   base_ir = instr;
+
+   switch (instr->type) {
+   case nir_instr_type_load_const:
+      nir_emit_load_const(nir_instr_as_load_const(instr));
+      return;
+
+   case nir_instr_type_intrinsic:
+      nir_emit_intrinsic(nir_instr_as_intrinsic(instr));
+      return;
+
+   case nir_instr_type_alu:
+      nir_emit_alu(nir_instr_as_alu(instr));
+      return;
+
+   case nir_instr_type_jump:
+      nir_emit_jump(nir_instr_as_jump(instr));
+      return;
+
+   case nir_instr_type_tex:
+      nir_emit_texture(nir_instr_as_tex(instr));
+      return;
+
+   case nir_instr_type_ssa_undef:
+      nir_emit_undef(nir_instr_as_ssa_undef(instr));
+      return;
+
+   default:
+      fprintf(stderr, "VS instruction not yet implemented by NIR->vec4\n");
+      return;
+   }
+}
+
+/**
+ * Returns the backend register for NIR register \p nir_reg, advanced by
+ * \p base_offset and, when \p indirect is non-NULL, carrying a reladdr
+ * built from the indirect source.
+ */
+static dst_reg
+dst_reg_for_nir_reg(vec4_visitor *v, nir_register *nir_reg,
+                    unsigned base_offset, nir_src *indirect)
+{
+   dst_reg result = v->nir_locals[nir_reg->index];
+
+   if (nir_reg->bit_size == 64)
+      result.type = BRW_REGISTER_TYPE_DF;
+
+   result = offset(result, 8, base_offset);
+
+   if (!indirect)
+      return result;
+
+   /* The relative address is a single signed-integer component. */
+   result.reladdr =
+      new(v->mem_ctx) src_reg(v->get_nir_src(*indirect,
+                                             BRW_REGISTER_TYPE_D,
+                                             1));
+   return result;
+}
+
+/**
+ * Returns the register an instruction should write for \p dest.
+ *
+ * SSA destinations get a freshly allocated VGRF (one register per 32 bits
+ * of bit size, typed DF for 64-bit values) that is also recorded in
+ * nir_ssa_values[]. Register destinations resolve to the NIR register's
+ * backing storage.
+ */
+dst_reg
+vec4_visitor::get_nir_dest(const nir_dest &dest)
+{
+   if (!dest.is_ssa) {
+      return dst_reg_for_nir_reg(this, dest.reg.reg, dest.reg.base_offset,
+                                 dest.reg.indirect);
+   }
+
+   dst_reg ssa_reg =
+      dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(dest.ssa.bit_size, 32)));
+   if (dest.ssa.bit_size == 64)
+      ssa_reg.type = BRW_REGISTER_TYPE_DF;
+
+   nir_ssa_values[dest.ssa.index] = ssa_reg;
+   return ssa_reg;
+}
+
+/** Same as get_nir_dest(dest), with the result retyped to \p type. */
+dst_reg
+vec4_visitor::get_nir_dest(const nir_dest &dest, enum brw_reg_type type)
+{
+   const dst_reg untyped = get_nir_dest(dest);
+   return retype(untyped, type);
+}
+
+/**
+ * Same as get_nir_dest(dest, brw_reg_type), with the hardware type derived
+ * from the NIR ALU type \p type.
+ */
+dst_reg
+vec4_visitor::get_nir_dest(const nir_dest &dest, nir_alu_type type)
+{
+   const enum brw_reg_type hw_type = brw_type_for_nir_type(devinfo, type);
+   return get_nir_dest(dest, hw_type);
+}
+
+/**
+ * Returns a source register for \p src, retyped to \p type and swizzled to
+ * read \p num_components components.
+ */
+src_reg
+vec4_visitor::get_nir_src(const nir_src &src, enum brw_reg_type type,
+                          unsigned num_components)
+{
+   dst_reg storage;
+
+   if (!src.is_ssa) {
+      storage = dst_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset,
+                                    src.reg.indirect);
+   } else {
+      assert(src.ssa != NULL);
+      storage = nir_ssa_values[src.ssa->index];
+   }
+
+   storage = retype(storage, type);
+
+   src_reg result = src_reg(storage);
+   result.swizzle = brw_swizzle_for_size(num_components);
+   return result;
+}
+
+/**
+ * Same as get_nir_src(src, brw_reg_type, num_components), with the hardware
+ * type derived from the NIR ALU type \p type.
+ */
+src_reg
+vec4_visitor::get_nir_src(const nir_src &src, nir_alu_type type,
+                          unsigned num_components)
+{
+   const enum brw_reg_type hw_type = brw_type_for_nir_type(devinfo, type);
+   return get_nir_src(src, hw_type, num_components);
+}
+
+/** Untyped variant of get_nir_src(): the source is read as a signed int. */
+src_reg
+vec4_visitor::get_nir_src(const nir_src &src, unsigned num_components)
+{
+   const nir_alu_type default_type = nir_type_int32;
+   return get_nir_src(src, default_type, num_components);
+}
+
+/**
+ * Returns the indirect offset of an I/O intrinsic as an unsigned source
+ * register, or a default-constructed src_reg when the offset is constant.
+ */
+src_reg
+vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
+{
+   nir_src *offset_src = nir_get_io_offset_src(instr);
+   nir_const_value *const_value = nir_src_as_const_value(*offset_src);
+
+   if (!const_value)
+      return get_nir_src(*offset_src, BRW_REGISTER_TYPE_UD, 1);
+
+   /* The only constant offset we should find is 0.  brw_nir.c's
+    * add_const_offset_to_base() will fold other constant offsets
+    * into instr->const_index[0].
+    */
+   assert(const_value->u32[0] == 0);
+   return src_reg();
+}
+
+/**
+ * Materializes a NIR load_const into a freshly allocated VGRF and records
+ * that VGRF as the SSA value of the instruction.
+ *
+ * Components holding the same constant share a single MOV by combining
+ * their writemask bits. 64-bit constants are compared by bit pattern
+ * rather than as doubles: a floating-point comparison treats +0.0 and -0.0
+ * as equal and would replace one with the other, losing the sign of zero.
+ */
+void
+vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
+{
+   dst_reg reg;
+
+   /* 64-bit values need two registers and the DF type. */
+   if (instr->def.bit_size == 64) {
+      reg = dst_reg(VGRF, alloc.allocate(2));
+      reg.type = BRW_REGISTER_TYPE_DF;
+   } else {
+      reg = dst_reg(VGRF, alloc.allocate(1));
+      reg.type = BRW_REGISTER_TYPE_D;
+   }
+
+   /* Components that still need a MOV. */
+   unsigned remaining = brw_writemask_for_size(instr->def.num_components);
+
+   /* @FIXME: consider emitting vector operations to save some MOVs in
+    * cases where the components are representable in 8 bits.
+    * For now, we emit a MOV for each distinct value.
+    */
+   for (unsigned i = 0; i < instr->def.num_components; i++) {
+      unsigned writemask = 1 << i;
+
+      /* Already covered by the MOV of an earlier, identical component. */
+      if ((remaining & writemask) == 0)
+         continue;
+
+      /* Gather every later component with exactly the same bits. */
+      for (unsigned j = i; j < instr->def.num_components; j++) {
+         if ((instr->def.bit_size == 32 &&
+              instr->value.u32[i] == instr->value.u32[j]) ||
+             (instr->def.bit_size == 64 &&
+              instr->value.u64[i] == instr->value.u64[j])) {
+            writemask |= 1 << j;
+         }
+      }
+
+      reg.writemask = writemask;
+      if (instr->def.bit_size == 64) {
+         emit(MOV(reg, setup_imm_df(instr->value.f64[i])));
+      } else {
+         emit(MOV(reg, brw_imm_d(instr->value.i32[i])));
+      }
+
+      remaining &= ~writemask;
+   }
+
+   /* Set final writemask */
+   reg.writemask = brw_writemask_for_size(instr->def.num_components);
+
+   nir_ssa_values[instr->def.index] = reg;
+}
+
+void
+vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{
+ dst_reg dest;
+ src_reg src;
+
+ switch (instr->intrinsic) {
+
+ case nir_intrinsic_load_input: {
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+
+ /* We set EmitNoIndirectInput for VS */
+ assert(const_offset);
+
+ dest = get_nir_dest(instr->dest);
+ dest.writemask = brw_writemask_for_size(instr->num_components);
+
+ src = src_reg(ATTR, instr->const_index[0] + const_offset->u32[0],
+ glsl_type::uvec4_type);
+ src = retype(src, dest.type);
+
+ bool is_64bit = nir_dest_bit_size(instr->dest) == 64;
+ if (is_64bit) {
+ dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
+ src.swizzle = BRW_SWIZZLE_XYZW;
+ shuffle_64bit_data(tmp, src, false);
+ emit(MOV(dest, src_reg(tmp)));
+ } else {
+ /* Swizzle source based on component layout qualifier */
+ src.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr));
+ emit(MOV(dest, src));
+ }
+ break;
+ }
+
+ case nir_intrinsic_store_output: {
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+ assert(const_offset);
+
+ int varying = instr->const_index[0] + const_offset->u32[0];
+
+ bool is_64bit = nir_src_bit_size(instr->src[0]) == 64;
+ if (is_64bit) {
+ src_reg data;
+ src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_DF,
+ instr->num_components);
+ data = src_reg(this, glsl_type::dvec4_type);
+ shuffle_64bit_data(dst_reg(data), src, true);
+ src = retype(data, BRW_REGISTER_TYPE_F);
+ } else {
+ src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F,
+ instr->num_components);
+ }
+
+ unsigned c = nir_intrinsic_component(instr);
+ output_reg[varying][c] = dst_reg(src);
+ output_num_components[varying][c] = instr->num_components;
+
+ unsigned num_components = instr->num_components;
+ if (is_64bit)
+ num_components *= 2;
+
+ output_reg[varying][c] = dst_reg(src);
+ output_num_components[varying][c] = MIN2(4, num_components);
+
+ if (is_64bit && num_components > 4) {
+ assert(num_components <= 8);
+ output_reg[varying + 1][c] = byte_offset(dst_reg(src), REG_SIZE);
+ output_num_components[varying + 1][c] = num_components - 4;
+ }
+ break;
+ }
+
+ case nir_intrinsic_get_buffer_size: {
+ nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
+ unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
+
+ const unsigned index =
+ prog_data->base.binding_table.ssbo_start + ssbo_index;
+ dst_reg result_dst = get_nir_dest(instr->dest);
+ vec4_instruction *inst = new(mem_ctx)
+ vec4_instruction(VS_OPCODE_GET_BUFFER_SIZE, result_dst);
+
+ inst->base_mrf = 2;
+ inst->mlen = 1; /* always at least one */
+ inst->src[1] = brw_imm_ud(index);
+
+ /* MRF for the first parameter */
+ src_reg lod = brw_imm_d(0);
+ int param_base = inst->base_mrf;
+ int writemask = WRITEMASK_X;
+ emit(MOV(dst_reg(MRF, param_base, glsl_type::int_type, writemask), lod));
+
+ emit(inst);
+
+ brw_mark_surface_used(&prog_data->base, index);
+ break;
+ }
+
+ case nir_intrinsic_store_ssbo: {
+ assert(devinfo->gen >= 7);
+
+ /* Block index */
+ src_reg surf_index;
+ nir_const_value *const_uniform_block =
+ nir_src_as_const_value(instr->src[1]);
+ if (const_uniform_block) {
+ unsigned index = prog_data->base.binding_table.ssbo_start +
+ const_uniform_block->u32[0];
+ surf_index = brw_imm_ud(index);
+ brw_mark_surface_used(&prog_data->base, index);
+ } else {
+ surf_index = src_reg(this, glsl_type::uint_type);
+ emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[1], 1),
+ brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
+ surf_index = emit_uniformize(surf_index);
+
+ brw_mark_surface_used(&prog_data->base,
+ prog_data->base.binding_table.ssbo_start +
+ nir->info->num_ssbos - 1);
+ }
+
+ /* Offset */
+ src_reg offset_reg;
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
+ if (const_offset) {
+ offset_reg = brw_imm_ud(const_offset->u32[0]);
+ } else {
+ offset_reg = get_nir_src(instr->src[2], 1);
+ }
+
+ /* Value */
+ src_reg val_reg = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F, 4);
+
+ /* Writemask */
+ unsigned write_mask = instr->const_index[0];
+
+ /* IvyBridge does not have a native SIMD4x2 untyped write message so untyped
+ * writes will use SIMD8 mode. In order to hide this and keep symmetry across
+ * typed and untyped messages and across hardware platforms, the
+ * current implementation of the untyped messages will transparently convert
+ * the SIMD4x2 payload into an equivalent SIMD8 payload by transposing it
+ * and enabling only channel X on the SEND instruction.
+ *
+ * The above, works well for full vector writes, but not for partial writes
+ * where we want to write some channels and not others, like when we have
+ * code such as v.xyw = vec3(1,2,4). Because the untyped write messages are
+ * quite restrictive with regards to the channel enables we can configure in
+ * the message descriptor (not all combinations are allowed) we cannot simply
+ * implement these scenarios with a single message while keeping the
+ * aforementioned symmetry in the implementation. For now we decided that
+ * it is better to keep the symmetry to reduce complexity, so in situations
+ * such as the one described we end up emitting two untyped write messages
+ * (one for xy and another for w).
+ *
+ * The code below packs consecutive channels into a single write message,
+ * detects gaps in the vector write and if needed, sends a second message
+ * with the remaining channels. If in the future we decide that we want to
+ * emit a single message at the expense of losing the symmetry in the
+ * implementation we can:
+ *
+ * 1) For IvyBridge: Only use the red channel of the untyped write SIMD8
+ * message payload. In this mode we can write up to 8 offsets and dwords
+ * to the red channel only (for the two vec4s in the SIMD4x2 execution)
+ * and select which of the 8 channels carry data to write by setting the
+ * appropriate writemask in the dst register of the SEND instruction.
+ * It would require to write a new generator opcode specifically for
+ * IvyBridge since we would need to prepare a SIMD8 payload that could
+ * use any channel, not just X.
+ *
+ * 2) For Haswell+: Simply send a single write message but set the writemask
+ * on the dst of the SEND instruction to select the channels we want to
+ * write. It would require to modify the current messages to receive
+ * and honor the writemask provided.
+ */
+ const vec4_builder bld = vec4_builder(this).at_end()
+ .annotate(current_annotation, base_ir);
+
+ unsigned type_slots = nir_src_bit_size(instr->src[0]) / 32;
+ if (type_slots == 2) {
+ dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
+ shuffle_64bit_data(tmp, retype(val_reg, tmp.type), true);
+ val_reg = src_reg(retype(tmp, BRW_REGISTER_TYPE_F));
+ }
+
+ uint8_t swizzle[4] = { 0, 0, 0, 0};
+ int num_channels = 0;
+ unsigned skipped_channels = 0;
+ int num_components = instr->num_components;
+ for (int i = 0; i < num_components; i++) {
+ /* Read components Z/W of a dvec from the appropriate place. We will
+ * also have to adjust the swizzle (we do that with the '% 4' below)
+ */
+ if (i == 2 && type_slots == 2)
+ val_reg = byte_offset(val_reg, REG_SIZE);
+
+ /* Check if this channel needs to be written. If so, record the
+ * channel we need to take the data from in the swizzle array
+ */
+ int component_mask = 1 << i;
+ int write_test = write_mask & component_mask;
+ if (write_test) {
+ /* If we are writing doubles we have to write 2 channels worth of
+ * data (64 bits) for each double component.
+ */
+ swizzle[num_channels++] = (i * type_slots) % 4;
+ if (type_slots == 2)
+ swizzle[num_channels++] = (i * type_slots + 1) % 4;
+ }
+
+ /* If we don't have to write this channel it means we have a gap in the
+ * vector, so write the channels we accumulated until now, if any. Do
+ * the same if this was the last component in the vector, if we have
+ * enough channels for a full vec4 write or if we have processed
+ * components XY of a dvec (since components ZW are not in the same
+ * SIMD register)
+ */
+ if (!write_test || i == num_components - 1 || num_channels == 4 ||
+ (i == 1 && type_slots == 2)) {
+ if (num_channels > 0) {
+ /* We have channels to write, so update the offset we need to
+ * write at to skip the channels we skipped, if any.
+ */
+ if (skipped_channels > 0) {
+ if (offset_reg.file == IMM) {
+ offset_reg.ud += 4 * skipped_channels;
+ } else {
+ emit(ADD(dst_reg(offset_reg), offset_reg,
+ brw_imm_ud(4 * skipped_channels)));
+ }
+ }
+
+ /* Swizzle the data register so we take the data from the channels
+ * we need to write and send the write message. This will write
+ * num_channels consecutive dwords starting at offset.
+ */
+ val_reg.swizzle =
+ BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
+ emit_untyped_write(bld, surf_index, offset_reg, val_reg,
+ 1 /* dims */, num_channels /* size */,
+ BRW_PREDICATE_NONE);
+
+ /* If we have to do a second write we will have to update the
+ * offset so that we jump over the channels we have just written
+ * now.
+ */
+ skipped_channels = num_channels;
+
+ /* Restart the count for the next write message */
+ num_channels = 0;
+ }
+
+ /* If we didn't write the channel, increase skipped count */
+ if (!write_test)
+ skipped_channels += type_slots;
+ }
+ }
+
+ break;
+ }
+
+ case nir_intrinsic_load_ssbo: {
+ assert(devinfo->gen >= 7);
+
+ nir_const_value *const_uniform_block =
+ nir_src_as_const_value(instr->src[0]);
+
+ src_reg surf_index;
+ if (const_uniform_block) {
+ unsigned index = prog_data->base.binding_table.ssbo_start +
+ const_uniform_block->u32[0];
+ surf_index = brw_imm_ud(index);
+
+ brw_mark_surface_used(&prog_data->base, index);
+ } else {
+ surf_index = src_reg(this, glsl_type::uint_type);
+ emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], 1),
+ brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
+ surf_index = emit_uniformize(surf_index);
+
+ /* Assume this may touch any SSBO. It would be nice to provide
+ * a tighter bound, but the array information is already lowered away.
+ */
+ brw_mark_surface_used(&prog_data->base,
+ prog_data->base.binding_table.ssbo_start +
+ nir->info->num_ssbos - 1);
+ }
+
+ src_reg offset_reg;
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+ if (const_offset) {
+ offset_reg = brw_imm_ud(const_offset->u32[0]);
+ } else {
+ offset_reg = get_nir_src(instr->src[1], 1);
+ }
+
+ /* Read the vector */
+ const vec4_builder bld = vec4_builder(this).at_end()
+ .annotate(current_annotation, base_ir);
+
+ src_reg read_result;
+ dst_reg dest = get_nir_dest(instr->dest);
+ if (type_sz(dest.type) < 8) {
+ read_result = emit_untyped_read(bld, surf_index, offset_reg,
+ 1 /* dims */, 4 /* size*/,
+ BRW_PREDICATE_NONE);
+ } else {
+ src_reg shuffled = src_reg(this, glsl_type::dvec4_type);
+
+ src_reg temp;
+ temp = emit_untyped_read(bld, surf_index, offset_reg,
+ 1 /* dims */, 4 /* size*/,
+ BRW_PREDICATE_NONE);
+ emit(MOV(dst_reg(retype(shuffled, temp.type)), temp));
+
+ if (offset_reg.file == IMM)
+ offset_reg.ud += 16;
+ else
+ emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16)));
+
+ temp = emit_untyped_read(bld, surf_index, offset_reg,
+ 1 /* dims */, 4 /* size*/,
+ BRW_PREDICATE_NONE);
+ emit(MOV(dst_reg(retype(byte_offset(shuffled, REG_SIZE), temp.type)),
+ temp));
+
+ read_result = src_reg(this, glsl_type::dvec4_type);
+ shuffle_64bit_data(dst_reg(read_result), shuffled, false);
+ }
+
+ read_result.type = dest.type;
+ read_result.swizzle = brw_swizzle_for_size(instr->num_components);
+ emit(MOV(dest, read_result));
+ break;
+ }
+
+ case nir_intrinsic_ssbo_atomic_add:
+ nir_emit_ssbo_atomic(BRW_AOP_ADD, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_imin:
+ nir_emit_ssbo_atomic(BRW_AOP_IMIN, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_umin:
+ nir_emit_ssbo_atomic(BRW_AOP_UMIN, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_imax:
+ nir_emit_ssbo_atomic(BRW_AOP_IMAX, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_umax:
+ nir_emit_ssbo_atomic(BRW_AOP_UMAX, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_and:
+ nir_emit_ssbo_atomic(BRW_AOP_AND, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_or:
+ nir_emit_ssbo_atomic(BRW_AOP_OR, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_xor:
+ nir_emit_ssbo_atomic(BRW_AOP_XOR, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_exchange:
+ nir_emit_ssbo_atomic(BRW_AOP_MOV, instr);
+ break;
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ nir_emit_ssbo_atomic(BRW_AOP_CMPWR, instr);
+ break;
+
+ case nir_intrinsic_load_vertex_id:
+ unreachable("should be lowered by lower_vertex_id()");
+
+ case nir_intrinsic_load_vertex_id_zero_base:
+ case nir_intrinsic_load_base_vertex:
+ case nir_intrinsic_load_instance_id:
+ case nir_intrinsic_load_base_instance:
+ case nir_intrinsic_load_draw_id:
+ case nir_intrinsic_load_invocation_id: {
+ gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
+ src_reg val = src_reg(nir_system_values[sv]);
+ assert(val.file != BAD_FILE);
+ dest = get_nir_dest(instr->dest, val.type);
+ emit(MOV(dest, val));
+ break;
+ }
+
+ case nir_intrinsic_load_uniform: {
+ /* Offsets are in bytes but they should always be multiples of 4 */
+ assert(nir_intrinsic_base(instr) % 4 == 0);
+
+ dest = get_nir_dest(instr->dest);
+
+ src = src_reg(dst_reg(UNIFORM, nir_intrinsic_base(instr) / 16));
+ src.type = dest.type;
+
+ /* Uniforms don't actually have to be vec4 aligned. In the case that
+ * it isn't, we have to use a swizzle to shift things around. They
+ * do still have the std140 alignment requirement that vec2's have to
+ * be vec2-aligned and vec3's and vec4's have to be vec4-aligned.
+ *
+ * The swizzle also works in the indirect case as the generator adds
+ * the swizzle to the offset for us.
+ */
+ unsigned shift = (nir_intrinsic_base(instr) % 16) / 4;
+ assert(shift + instr->num_components <= 4);
+
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+ if (const_offset) {
+ /* Offsets are in bytes but they should always be multiples of 4 */
+ assert(const_offset->u32[0] % 4 == 0);
+
+ unsigned offset = const_offset->u32[0] + shift * 4;
+ src.offset = ROUND_DOWN_TO(offset, 16);
+ shift = (offset % 16) / 4;
+ src.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);
+
+ emit(MOV(dest, src));
+ } else {
+ src.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);
+
+ src_reg indirect = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
+
+ /* MOV_INDIRECT is going to stomp the whole thing anyway */
+ dest.writemask = WRITEMASK_XYZW;
+
+ emit(SHADER_OPCODE_MOV_INDIRECT, dest, src,
+ indirect, brw_imm_ud(instr->const_index[1]));
+ }
+ break;
+ }
+
+ case nir_intrinsic_atomic_counter_read:
+ case nir_intrinsic_atomic_counter_inc:
+ case nir_intrinsic_atomic_counter_dec: {
+ unsigned surf_index = prog_data->base.binding_table.abo_start +
+ (unsigned) instr->const_index[0];
+ const vec4_builder bld =
+ vec4_builder(this).at_end().annotate(current_annotation, base_ir);
+
+ /* Get some metadata from the atomic counter intrinsic. */
+ const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+
+ /* Get the arguments of the atomic intrinsic. */
+ src_reg offset = get_nir_src(instr->src[0], nir_type_int32,
+ instr->num_components);
+ const src_reg surface = brw_imm_ud(surf_index);
+ const src_reg src0 = (info->num_srcs >= 2
+ ? get_nir_src(instr->src[1]) : src_reg());
+ const src_reg src1 = (info->num_srcs >= 3
+ ? get_nir_src(instr->src[2]) : src_reg());
+
+ src_reg tmp;
+
+ dest = get_nir_dest(instr->dest);
+
+ if (instr->intrinsic == nir_intrinsic_atomic_counter_read) {
+ tmp = emit_untyped_read(bld, surface, offset, 1, 1);
+ } else {
+ tmp = emit_untyped_atomic(bld, surface, offset,
+ src0, src1,
+ 1, 1,
+ get_atomic_counter_op(instr->intrinsic));
+ }
+
+ bld.MOV(retype(dest, tmp.type), tmp);
+ brw_mark_surface_used(stage_prog_data, surf_index);
+ break;
+ }
+
+ case nir_intrinsic_load_ubo: {
+ nir_const_value *const_block_index = nir_src_as_const_value(instr->src[0]);
+ src_reg surf_index;
+
+ dest = get_nir_dest(instr->dest);
+
+ if (const_block_index) {
+ /* The block index is a constant, so just emit the binding table entry
+ * as an immediate.
+ */
+ const unsigned index = prog_data->base.binding_table.ubo_start +
+ const_block_index->u32[0];
+ surf_index = brw_imm_ud(index);
+ brw_mark_surface_used(&prog_data->base, index);
+ } else {
+ /* The block index is not a constant. Evaluate the index expression
+ * per-channel and add the base UBO index; we have to select a value
+ * from any live channel.
+ */
+ surf_index = src_reg(this, glsl_type::uint_type);
+ emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], nir_type_int32,
+ instr->num_components),
+ brw_imm_ud(prog_data->base.binding_table.ubo_start)));
+ surf_index = emit_uniformize(surf_index);
+
+ /* Assume this may touch any UBO. It would be nice to provide
+ * a tighter bound, but the array information is already lowered away.
+ */
+ brw_mark_surface_used(&prog_data->base,
+ prog_data->base.binding_table.ubo_start +
+ nir->info->num_ubos - 1);
+ }
+
+ src_reg offset_reg;
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+ if (const_offset) {
+ offset_reg = brw_imm_ud(const_offset->u32[0] & ~15);
+ } else {
+ offset_reg = get_nir_src(instr->src[1], nir_type_uint32, 1);
+ }
+
+ src_reg packed_consts;
+ if (nir_dest_bit_size(instr->dest) == 32) {
+ packed_consts = src_reg(this, glsl_type::vec4_type);
+ emit_pull_constant_load_reg(dst_reg(packed_consts),
+ surf_index,
+ offset_reg,
+ NULL, NULL /* before_block/inst */);
+ } else {
+ src_reg temp = src_reg(this, glsl_type::dvec4_type);
+ src_reg temp_float = retype(temp, BRW_REGISTER_TYPE_F);
+
+ emit_pull_constant_load_reg(dst_reg(temp_float),
+ surf_index, offset_reg, NULL, NULL);
+ if (offset_reg.file == IMM)
+ offset_reg.ud += 16;
+ else
+ emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16u)));
+ emit_pull_constant_load_reg(dst_reg(byte_offset(temp_float, REG_SIZE)),
+ surf_index, offset_reg, NULL, NULL);
+
+ packed_consts = src_reg(this, glsl_type::dvec4_type);
+ shuffle_64bit_data(dst_reg(packed_consts), temp, false);
+ }
+
+ packed_consts.swizzle = brw_swizzle_for_size(instr->num_components);
+ if (const_offset) {
+ unsigned type_size = type_sz(dest.type);
+ packed_consts.swizzle +=
+ BRW_SWIZZLE4(const_offset->u32[0] % 16 / type_size,
+ const_offset->u32[0] % 16 / type_size,
+ const_offset->u32[0] % 16 / type_size,
+ const_offset->u32[0] % 16 / type_size);
+ }
+
+ emit(MOV(dest, retype(packed_consts, dest.type)));
+
+ break;
+ }
+
+ case nir_intrinsic_memory_barrier: {
+ const vec4_builder bld =
+ vec4_builder(this).at_end().annotate(current_annotation, base_ir);
+ const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+ bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
+ ->size_written = 2 * REG_SIZE;
+ break;
+ }
+
+ case nir_intrinsic_shader_clock: {
+ /* We cannot do anything if there is an event, so ignore it for now */
+ const src_reg shader_clock = get_timestamp();
+ const enum brw_reg_type type = brw_type_for_base_type(glsl_type::uvec2_type);
+
+ dest = get_nir_dest(instr->dest, type);
+ emit(MOV(dest, shader_clock));
+ break;
+ }
+
+ default:
+ unreachable("Unknown intrinsic");
+ }
+}
+
+/**
+ * Emit an untyped atomic operation on an SSBO.
+ *
+ * \param op     atomic opcode, forwarded unchanged to emit_untyped_atomic().
+ *               BRW_AOP_CMPWR is the only opcode that consumes a second
+ *               data operand.
+ * \param instr  NIR intrinsic.  src[0] is the SSBO block index, src[1] the
+ *               offset, src[2] the first data operand and, for
+ *               BRW_AOP_CMPWR only, src[3] the second data operand.
+ *
+ * The value returned by the atomic is moved into the intrinsic's
+ * destination register.
+ */
+void
+vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr)
+{
+   dst_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   src_reg surface;
+   nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
+   if (const_surface) {
+      /* Constant block index: the binding table entry is an immediate. */
+      unsigned surf_index = prog_data->base.binding_table.ssbo_start +
+                            const_surface->u32[0];
+      surface = brw_imm_ud(surf_index);
+      brw_mark_surface_used(&prog_data->base, surf_index);
+   } else {
+      surface = src_reg(this, glsl_type::uint_type);
+      emit(ADD(dst_reg(surface), get_nir_src(instr->src[0]),
+               brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
+
+      /* Uniformize the dynamically computed surface index, exactly as the
+       * load_ssbo and store_ssbo paths do for a non-constant block index.
+       * NOTE(review): presumably the message uses a single surface index
+       * for all channels, so it must come from a live channel -- confirm.
+       */
+      surface = emit_uniformize(surface);
+
+      /* Assume this may touch any SSBO. This is the same we do for other
+       * UBO/SSBO accesses with non-constant surface.
+       */
+      brw_mark_surface_used(&prog_data->base,
+                            prog_data->base.binding_table.ssbo_start +
+                            nir->info->num_ssbos - 1);
+   }
+
+   src_reg offset = get_nir_src(instr->src[1], 1);
+   src_reg data1 = get_nir_src(instr->src[2], 1);
+   src_reg data2;
+   if (op == BRW_AOP_CMPWR)
+      data2 = get_nir_src(instr->src[3], 1);
+
+   /* Emit the actual atomic operation */
+   const vec4_builder bld =
+      vec4_builder(this).at_end().annotate(current_annotation, base_ir);
+
+   src_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
+                                               data1, data2,
+                                               1 /* dims */, 1 /* rsize */,
+                                               op,
+                                               BRW_PREDICATE_NONE);
+
+   /* The destination takes the type of the atomic's result register. */
+   dest.type = atomic_result.type;
+   bld.MOV(dest, atomic_result);
+}
+
+/**
+ * Pack the four per-component selectors of a NIR swizzle array into a
+ * single swizzle value with BRW_SWIZZLE4.
+ */
+static unsigned
+brw_swizzle_for_nir_swizzle(uint8_t swizzle[4])
+{
+   const uint8_t sel_x = swizzle[0];
+   const uint8_t sel_y = swizzle[1];
+   const uint8_t sel_z = swizzle[2];
+   const uint8_t sel_w = swizzle[3];
+
+   return BRW_SWIZZLE4(sel_x, sel_y, sel_z, sel_w);
+}
+
+/**
+ * Translate a NIR comparison opcode into the conditional modifier to use
+ * on the comparison instruction that implements it.
+ *
+ * The vector "ball_*equal" opcodes map to the same modifier as the scalar
+ * equality opcodes, and the "bany_*nequal" opcodes to the same modifier as
+ * the scalar inequality opcodes.  Passing any other opcode is a caller bug.
+ */
+static enum brw_conditional_mod
+brw_conditional_for_nir_comparison(nir_op op)
+{
+   switch (op) {
+   /* Less-than, for every operand type. */
+   case nir_op_ult:
+   case nir_op_ilt:
+   case nir_op_flt:
+      return BRW_CONDITIONAL_L;
+
+   /* Greater-than-or-equal, for every operand type. */
+   case nir_op_uge:
+   case nir_op_ige:
+   case nir_op_fge:
+      return BRW_CONDITIONAL_GE;
+
+   /* Equality, scalar and "all components equal". */
+   case nir_op_ieq:
+   case nir_op_feq:
+   case nir_op_ball_iequal2:
+   case nir_op_ball_iequal3:
+   case nir_op_ball_iequal4:
+   case nir_op_ball_fequal2:
+   case nir_op_ball_fequal3:
+   case nir_op_ball_fequal4:
+      return BRW_CONDITIONAL_Z;
+
+   /* Inequality, scalar and "any component not equal". */
+   case nir_op_ine:
+   case nir_op_fne:
+   case nir_op_bany_inequal2:
+   case nir_op_bany_inequal3:
+   case nir_op_bany_inequal4:
+   case nir_op_bany_fnequal2:
+   case nir_op_bany_fnequal3:
+   case nir_op_bany_fnequal4:
+      return BRW_CONDITIONAL_NZ;
+
+   default:
+      unreachable("not reached: bad operation for comparison");
+   }
+}
+
+/**
+ * Try to fold a vector "any not-equal" / "all equal" comparison that feeds
+ * the first source of \p instr into a flag-writing CMP.
+ *
+ * \param instr      ALU instruction whose first source is inspected.
+ * \param predicate  on success, receives the ALIGN16 predicate matching the
+ *                   comparison (ANY4H for bany_*, ALL4H for ball_*); it is
+ *                   left untouched when the function returns false.
+ *
+ * \return true if a CMP to the null register was emitted, false if the
+ *         first source is not an SSA value produced by one of the
+ *         supported comparison opcodes.
+ */
+bool
+vec4_visitor::optimize_predicate(nir_alu_instr *instr,
+                                 enum brw_predicate *predicate)
+{
+   /* Only SSA sources let us look at the producing instruction. */
+   if (!instr->src[0].src.is_ssa)
+      return false;
+
+   if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
+      return false;
+
+   nir_alu_instr *cmp =
+      nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+   const nir_op_info *cmp_info = &nir_op_infos[cmp->op];
+
+   switch (cmp->op) {
+   case nir_op_ball_fequal2:
+   case nir_op_ball_iequal2:
+   case nir_op_ball_fequal3:
+   case nir_op_ball_iequal3:
+   case nir_op_ball_fequal4:
+   case nir_op_ball_iequal4:
+      *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
+      break;
+   case nir_op_bany_fnequal2:
+   case nir_op_bany_inequal2:
+   case nir_op_bany_fnequal3:
+   case nir_op_bany_inequal3:
+   case nir_op_bany_fnequal4:
+   case nir_op_bany_inequal4:
+      *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+      break;
+   default:
+      /* Not a comparison we know how to turn into a predicate. */
+      return false;
+   }
+
+   /* Swizzle covering the number of components the comparison reads. */
+   unsigned size_swizzle = brw_swizzle_for_size(cmp_info->input_sizes[0]);
+
+   src_reg operands[2];
+   assert(cmp_info->num_inputs == 2);
+   for (unsigned idx = 0; idx < 2; idx++) {
+      /* Combine the opcode's base input type with the source bit size. */
+      const nir_alu_type sized_type = (nir_alu_type)
+         (((unsigned) cmp_info->input_types[idx]) |
+          nir_src_bit_size(cmp->src[idx].src));
+
+      operands[idx] = get_nir_src(cmp->src[idx].src, sized_type, 4);
+
+      /* Apply the comparison's own source swizzle on top of the size
+       * swizzle, and carry over its source modifiers.
+       */
+      operands[idx].swizzle =
+         brw_compose_swizzle(size_swizzle,
+                             brw_swizzle_for_nir_swizzle(cmp->src[idx].swizzle));
+      operands[idx].abs = cmp->src[idx].abs;
+      operands[idx].negate = cmp->src[idx].negate;
+   }
+
+   /* Only the flag result matters, so the CMP writes the null register. */
+   emit(CMP(dst_null_d(), operands[0], operands[1],
+            brw_conditional_for_nir_comparison(cmp->op)));
+
+   return true;
+}
+
+/**
+ * Implement GLSL findMSB() with the LZD instruction.
+ *
+ * \param bld        builder used to emit the instructions.
+ * \param dst        register receiving the findMSB() result.
+ * \param src        value to examine.
+ * \param is_signed  true to apply the signed findMSB() rules to \p src.
+ */
+static void
+emit_find_msb_using_lzd(const vec4_builder &bld,
+                        const dst_reg &dst,
+                        const src_reg &src,
+                        bool is_signed)
+{
+   src_reg lzd_input = src;
+
+   if (is_signed) {
+      /* Taking LZD of abs(src) is almost right, but it gives the wrong
+       * answer for three kinds of input:
+       *
+       *  * 0x80000000: abs(0x80000000) == 0x80000000, so LZD returns 0,
+       *    whereas findMSB(int(0x80000000)) == 30.
+       *
+       *  * 0xffffffff: abs(0xffffffff) == 1, so LZD returns 31, but
+       *    Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
+       *
+       *       For a value of zero or negative one, -1 will be returned.
+       *
+       *  * Negative powers of two: LZD(abs(-(1<<x))) returns x, but
+       *    findMSB(-(1<<x)) should return x-1.
+       *
+       * Every negative input, including the cases above, gives the
+       * correct value if LZD is fed the logical-not of the value instead
+       * of its negation.  A conditional logical-not takes two
+       * instructions: ASR by 31 replicates the sign bit across the whole
+       * dword, and XOR-ing that with the source inverts it only when it
+       * is negative.
+       */
+      src_reg sign_fixed = src_reg(bld.vgrf(BRW_REGISTER_TYPE_D));
+
+      bld.ASR(dst_reg(sign_fixed), src, brw_imm_d(31));
+      bld.XOR(dst_reg(sign_fixed), sign_fixed, src);
+
+      lzd_input = sign_fixed;
+   }
+
+   bld.LZD(retype(dst, BRW_REGISTER_TYPE_UD),
+           retype(lzd_input, BRW_REGISTER_TYPE_UD));
+
+   /* LZD counts from the MSB end while findMSB() counts from the LSB end,
+    * so compute 31 - LZD by adding 31 to the negated LZD result.  With no
+    * bits set LZD returns 32, and 31 - 32 = -1 is exactly what findMSB()
+    * has to return.
+    */
+   vec4_instruction *sub_from_31 =
+      bld.ADD(dst, retype(src_reg(dst), BRW_REGISTER_TYPE_D), brw_imm_d(31));
+   sub_from_31->src[0].negate = true;
+}
+
+/**
+ * Convert a double-precision source to a 32-bit type and store it in \p dst.
+ *
+ * \param dst          destination of the converted value.
+ * \param src          double-precision source.
+ * \param saturate     saturate flag applied to the final MOV into \p dst.
+ * \param single_type  32-bit register type to convert to.
+ */
+void
+vec4_visitor::emit_conversion_from_double(dst_reg dst, src_reg src,
+                                          bool saturate,
+                                          brw_reg_type single_type)
+{
+   /* BDW PRM vol 15 - workarounds:
+    * DF->f format conversion for Align16 has wrong emask calculation when
+    * source is immediate.
+    *
+    * In that case the immediate is converted here and emitted as a float
+    * immediate instead.
+    */
+   const bool bdw_imm_workaround = devinfo->gen == 8 &&
+                                   single_type == BRW_REGISTER_TYPE_F &&
+                                   src.file == BRW_IMMEDIATE_VALUE;
+
+   vec4_instruction *final_mov;
+   if (bdw_imm_workaround) {
+      final_mov = emit(MOV(dst, brw_imm_f(src.df)));
+   } else {
+      /* Copy the source into a double-sized temporary first. */
+      dst_reg wide = dst_reg(this, glsl_type::dvec4_type);
+      emit(MOV(wide, src));
+
+      /* The conversion writes a second dvec4-sized temporary retyped to
+       * the target type; it is flagged as writing two registers.
+       */
+      dst_reg narrowed =
+         retype(dst_reg(this, glsl_type::dvec4_type), single_type);
+      vec4_instruction *convert =
+         emit(VEC4_OPCODE_FROM_DOUBLE, narrowed, src_reg(wide));
+      convert->size_written = 2 * REG_SIZE;
+
+      final_mov = emit(MOV(dst, src_reg(narrowed)));
+   }
+
+   final_mov->saturate = saturate;
+}
+
+/**
+ * Convert a 32-bit source to double precision and store it in \p dst.
+ *
+ * \param dst          destination of the converted value.
+ * \param src          source value, read as \p single_type.
+ * \param saturate     saturate flag applied to the final MOV into \p dst.
+ * \param single_type  32-bit register type of the source.
+ */
+void
+vec4_visitor::emit_conversion_to_double(dst_reg dst, src_reg src,
+                                        bool saturate,
+                                        brw_reg_type single_type)
+{
+   /* Double-sized temporary that receives the converted value. */
+   dst_reg wide = dst_reg(src_reg(this, glsl_type::dvec4_type));
+
+   /* Stage the source in a vec4 temporary of the requested 32-bit type. */
+   src_reg staged = retype(src_reg(this, glsl_type::vec4_type), single_type);
+   emit(MOV(dst_reg(staged), retype(src, single_type)));
+
+   emit(VEC4_OPCODE_TO_DOUBLE, wide, staged);
+
+   vec4_instruction *final_mov = emit(MOV(dst, src_reg(wide)));
+   final_mov->saturate = saturate;
+}
+
+/**
+ * Return a source register holding the double-precision constant \p v.
+ *
+ * On gen8+ this is a DF immediate.  On Haswell the constant is loaded
+ * into a VGRF with the DIM instruction, and on other gen7 parts it is
+ * built in a VGRF from two 32-bit immediates.  In both VGRF cases the
+ * returned register uses an XXXX swizzle.
+ */
+src_reg
+vec4_visitor::setup_imm_df(double v)
+{
+   assert(devinfo->gen >= 7);
+
+   if (devinfo->gen >= 8)
+      return brw_imm_df(v);
+
+   /* gen7.5 does not support DF immediates directly, but the DIM
+    * instruction allows setting the 64-bit immediate value.
+    */
+   if (devinfo->is_haswell) {
+      const dst_reg hsw_tmp =
+         retype(dst_reg(VGRF, alloc.allocate(2)), BRW_REGISTER_TYPE_DF);
+      vec4_instruction *dim = emit(DIM(hsw_tmp, brw_imm_df(v)));
+      dim->force_writemask_all = true;
+      return swizzle(src_reg(retype(hsw_tmp, BRW_REGISTER_TYPE_DF)),
+                     BRW_SWIZZLE_XXXX);
+   }
+
+   /* gen7 does not support DF immediates: split the constant into its two
+    * 32-bit halves.
+    */
+   union {
+      double value;
+      struct {
+         uint32_t lo;
+         uint32_t hi;
+      };
+   } bits;
+
+   bits.value = v;
+
+   /* Write the low 32-bit of the constant to the X:UD channel and the
+    * high 32-bit to the Y:UD channel to build the constant in a VGRF.
+    * This is done twice (offset 0 and offset 1), since a DF VGRF takes
+    * two SIMD8 registers in SIMD4x2 execution.  The XXXX swizzle on the
+    * returned register makes any access to the VGRF read only the
+    * constant data in these channels.
+    */
+   const dst_reg gen7_tmp =
+      retype(dst_reg(VGRF, alloc.allocate(2)), BRW_REGISTER_TYPE_UD);
+   for (int half = 0; half < 2; half++) {
+      const dst_reg half_reg = offset(gen7_tmp, 8, half);
+
+      vec4_instruction *mov_lo =
+         emit(MOV(writemask(half_reg, WRITEMASK_X), brw_imm_ud(bits.lo)));
+      mov_lo->force_writemask_all = true;
+
+      vec4_instruction *mov_hi =
+         emit(MOV(writemask(half_reg, WRITEMASK_Y), brw_imm_ud(bits.hi)));
+      mov_hi->force_writemask_all = true;
+   }
+
+   return swizzle(src_reg(retype(gen7_tmp, BRW_REGISTER_TYPE_DF)),
+                  BRW_SWIZZLE_XXXX);
+}
+
+void
+vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
+{
+ vec4_instruction *inst;
+
+ nir_alu_type dst_type = (nir_alu_type) (nir_op_infos[instr->op].output_type |
+ nir_dest_bit_size(instr->dest.dest));
+ dst_reg dst = get_nir_dest(instr->dest.dest, dst_type);
+ dst.writemask = instr->dest.write_mask;
+
+ src_reg op[4];
+ for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+ nir_alu_type src_type = (nir_alu_type)
+ (nir_op_infos[instr->op].input_types[i] |
+ nir_src_bit_size(instr->src[i].src));
+ op[i] = get_nir_src(instr->src[i].src, src_type, 4);
+ op[i].swizzle = brw_swizzle_for_nir_swizzle(instr->src[i].swizzle);
+ op[i].abs = instr->src[i].abs;
+ op[i].negate = instr->src[i].negate;
+ }
+
+ switch (instr->op) {
+ case nir_op_imov:
+ case nir_op_fmov:
+ inst = emit(MOV(dst, op[0]));
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_vec2:
+ case nir_op_vec3:
+ case nir_op_vec4:
+ unreachable("not reached: should be handled by lower_vec_to_movs()");
+
+ case nir_op_i2f:
+ case nir_op_u2f:
+ inst = emit(MOV(dst, op[0]));
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_f2i:
+ case nir_op_f2u:
+ inst = emit(MOV(dst, op[0]));
+ break;
+
+ case nir_op_d2f:
+ emit_conversion_from_double(dst, op[0], instr->dest.saturate,
+ BRW_REGISTER_TYPE_F);
+ break;
+
+ case nir_op_f2d:
+ emit_conversion_to_double(dst, op[0], instr->dest.saturate,
+ BRW_REGISTER_TYPE_F);
+ break;
+
+ case nir_op_d2i:
+ case nir_op_d2u:
+ emit_conversion_from_double(dst, op[0], instr->dest.saturate,
+ instr->op == nir_op_d2i ? BRW_REGISTER_TYPE_D :
+ BRW_REGISTER_TYPE_UD);
+ break;
+
+ case nir_op_i2d:
+ case nir_op_u2d:
+ emit_conversion_to_double(dst, op[0], instr->dest.saturate,
+ instr->op == nir_op_i2d ? BRW_REGISTER_TYPE_D :
+ BRW_REGISTER_TYPE_UD);
+ break;
+
+ case nir_op_iadd:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ case nir_op_fadd:
+ inst = emit(ADD(dst, op[0], op[1]));
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fmul:
+ inst = emit(MUL(dst, op[0], op[1]));
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_imul: {
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ if (devinfo->gen < 8) {
+ nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
+ nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
+
+ /* For integer multiplication, the MUL uses the low 16 bits of one of
+ * the operands (src0 through SNB, src1 on IVB and later). The MACH
+ * accumulates in the contribution of the upper 16 bits of that
+ * operand. If we can determine that one of the args is in the low
+ * 16 bits, though, we can just emit a single MUL.
+ */
+ if (value0 && value0->u32[0] < (1 << 16)) {
+ if (devinfo->gen < 7)
+ emit(MUL(dst, op[0], op[1]));
+ else
+ emit(MUL(dst, op[1], op[0]));
+ } else if (value1 && value1->u32[0] < (1 << 16)) {
+ if (devinfo->gen < 7)
+ emit(MUL(dst, op[1], op[0]));
+ else
+ emit(MUL(dst, op[0], op[1]));
+ } else {
+ struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
+
+ emit(MUL(acc, op[0], op[1]));
+ emit(MACH(dst_null_d(), op[0], op[1]));
+ emit(MOV(dst, src_reg(acc)));
+ }
+ } else {
+ emit(MUL(dst, op[0], op[1]));
+ }
+ break;
+ }
+
+ case nir_op_imul_high:
+ case nir_op_umul_high: {
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
+
+ if (devinfo->gen >= 8)
+ emit(MUL(acc, op[0], retype(op[1], BRW_REGISTER_TYPE_UW)));
+ else
+ emit(MUL(acc, op[0], op[1]));
+
+ emit(MACH(dst, op[0], op[1]));
+ break;
+ }
+
+ case nir_op_frcp:
+ inst = emit_math(SHADER_OPCODE_RCP, dst, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fexp2:
+ inst = emit_math(SHADER_OPCODE_EXP2, dst, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_flog2:
+ inst = emit_math(SHADER_OPCODE_LOG2, dst, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fsin:
+ inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fcos:
+ inst = emit_math(SHADER_OPCODE_COS, dst, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_idiv:
+ case nir_op_udiv:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, op[0], op[1]);
+ break;
+
+ case nir_op_umod:
+ case nir_op_irem:
+ /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
+ * appears that our hardware just does the right thing for signed
+ * remainder.
+ */
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
+ break;
+
+ case nir_op_imod: {
+ /* Get a regular C-style remainder. If a % b == 0, set the predicate. */
+ inst = emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
+
+ /* Math instructions don't support conditional mod */
+ inst = emit(MOV(dst_null_d(), src_reg(dst)));
+ inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+ /* Now, we need to determine if signs of the sources are different.
+ * When we XOR the sources, the top bit is 0 if they are the same and 1
+ * if they are different. We can then use a conditional modifier to
+ * turn that into a predicate. This leads us to an XOR.l instruction.
+ *
+ * Technically, according to the PRM, you're not allowed to use .l on a
+ * XOR instruction. However, empirical experiments and Curro's reading
+ * of the simulator source both indicate that it's safe.
+ */
+ src_reg tmp = src_reg(this, glsl_type::ivec4_type);
+ inst = emit(XOR(dst_reg(tmp), op[0], op[1]));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->conditional_mod = BRW_CONDITIONAL_L;
+
+ /* If the result of the initial remainder operation is non-zero and the
+ * two sources have different signs, add in a copy of op[1] to get the
+ * final integer modulus value.
+ */
+ inst = emit(ADD(dst, src_reg(dst), op[1]));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ break;
+ }
+
+ case nir_op_ldexp:
+ unreachable("not reached: should be handled by ldexp_to_arith()");
+
+ case nir_op_fsqrt:
+ inst = emit_math(SHADER_OPCODE_SQRT, dst, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_frsq:
+ inst = emit_math(SHADER_OPCODE_RSQ, dst, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fpow:
+ inst = emit_math(SHADER_OPCODE_POW, dst, op[0], op[1]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_uadd_carry: {
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
+
+ emit(ADDC(dst_null_ud(), op[0], op[1]));
+ emit(MOV(dst, src_reg(acc)));
+ break;
+ }
+
+ case nir_op_usub_borrow: {
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
+
+ emit(SUBB(dst_null_ud(), op[0], op[1]));
+ emit(MOV(dst, src_reg(acc)));
+ break;
+ }
+
+ case nir_op_ftrunc:
+ inst = emit(RNDZ(dst, op[0]));
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fceil: {
+ src_reg tmp = src_reg(this, glsl_type::float_type);
+ tmp.swizzle =
+ brw_swizzle_for_size(instr->src[0].src.is_ssa ?
+ instr->src[0].src.ssa->num_components :
+ instr->src[0].src.reg.reg->num_components);
+
+ op[0].negate = !op[0].negate;
+ emit(RNDD(dst_reg(tmp), op[0]));
+ tmp.negate = true;
+ inst = emit(MOV(dst, tmp));
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
+
+ case nir_op_ffloor:
+ inst = emit(RNDD(dst, op[0]));
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_ffract:
+ inst = emit(FRC(dst, op[0]));
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fround_even:
+ inst = emit(RNDE(dst, op[0]));
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fquantize2f16: {
+ /* See also vec4_visitor::emit_pack_half_2x16() */
+ src_reg tmp16 = src_reg(this, glsl_type::uvec4_type);
+ src_reg tmp32 = src_reg(this, glsl_type::vec4_type);
+ src_reg zero = src_reg(this, glsl_type::vec4_type);
+
+ /* Check for denormal */
+ src_reg abs_src0 = op[0];
+ abs_src0.abs = true;
+ emit(CMP(dst_null_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
+ BRW_CONDITIONAL_L));
+ /* Get the appropriately signed zero */
+ emit(AND(retype(dst_reg(zero), BRW_REGISTER_TYPE_UD),
+ retype(op[0], BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(0x80000000)));
+ /* Do the actual F32 -> F16 -> F32 conversion */
+ emit(F32TO16(dst_reg(tmp16), op[0]));
+ emit(F16TO32(dst_reg(tmp32), tmp16));
+ /* Select that or zero based on normal status */
+ inst = emit(BRW_OPCODE_SEL, dst, zero, tmp32);
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
+
+ case nir_op_imin:
+ case nir_op_umin:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ case nir_op_fmin:
+ inst = emit_minmax(BRW_CONDITIONAL_L, dst, op[0], op[1]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_imax:
+ case nir_op_umax:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ case nir_op_fmax:
+ inst = emit_minmax(BRW_CONDITIONAL_GE, dst, op[0], op[1]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fddx:
+ case nir_op_fddx_coarse:
+ case nir_op_fddx_fine:
+ case nir_op_fddy:
+ case nir_op_fddy_coarse:
+ case nir_op_fddy_fine:
+ unreachable("derivatives are not valid in vertex shaders");
+
+ case nir_op_ilt:
+ case nir_op_ult:
+ case nir_op_ige:
+ case nir_op_uge:
+ case nir_op_ieq:
+ case nir_op_ine:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ /* Fallthrough */
+ case nir_op_flt:
+ case nir_op_fge:
+ case nir_op_feq:
+ case nir_op_fne: {
+ enum brw_conditional_mod conditional_mod =
+ brw_conditional_for_nir_comparison(instr->op);
+
+ if (nir_src_bit_size(instr->src[0].src) < 64) {
+ emit(CMP(dst, op[0], op[1], conditional_mod));
+ } else {
+ /* Produce a 32-bit boolean result from the DF comparison by selecting
+ * only the low 32-bit in each DF produced. Do this in a temporary
+ * so we can then move from there to the result using align16 again
+ * to honor the original writemask.
+ */
+ dst_reg temp = dst_reg(this, glsl_type::dvec4_type);
+ emit(CMP(temp, op[0], op[1], conditional_mod));
+ dst_reg result = dst_reg(this, glsl_type::bvec4_type);
+ emit(VEC4_OPCODE_PICK_LOW_32BIT, result, src_reg(temp));
+ emit(MOV(dst, src_reg(result)));
+ }
+ break;
+ }
+
+ case nir_op_ball_iequal2:
+ case nir_op_ball_iequal3:
+ case nir_op_ball_iequal4:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ /* Fallthrough */
+ case nir_op_ball_fequal2:
+ case nir_op_ball_fequal3:
+ case nir_op_ball_fequal4: {
+ unsigned swiz =
+ brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
+
+ emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
+ brw_conditional_for_nir_comparison(instr->op)));
+ emit(MOV(dst, brw_imm_d(0)));
+ inst = emit(MOV(dst, brw_imm_d(~0)));
+ inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
+ break;
+ }
+
+ case nir_op_bany_inequal2:
+ case nir_op_bany_inequal3:
+ case nir_op_bany_inequal4:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ /* Fallthrough */
+ case nir_op_bany_fnequal2:
+ case nir_op_bany_fnequal3:
+ case nir_op_bany_fnequal4: {
+ unsigned swiz =
+ brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
+
+ emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
+ brw_conditional_for_nir_comparison(instr->op)));
+
+ emit(MOV(dst, brw_imm_d(0)));
+ inst = emit(MOV(dst, brw_imm_d(~0)));
+ inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+ break;
+ }
+
+ case nir_op_inot:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ if (devinfo->gen >= 8) {
+ op[0] = resolve_source_modifiers(op[0]);
+ }
+ emit(NOT(dst, op[0]));
+ break;
+
+ case nir_op_ixor:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ if (devinfo->gen >= 8) {
+ op[0] = resolve_source_modifiers(op[0]);
+ op[1] = resolve_source_modifiers(op[1]);
+ }
+ emit(XOR(dst, op[0], op[1]));
+ break;
+
+ case nir_op_ior:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ if (devinfo->gen >= 8) {
+ op[0] = resolve_source_modifiers(op[0]);
+ op[1] = resolve_source_modifiers(op[1]);
+ }
+ emit(OR(dst, op[0], op[1]));
+ break;
+
+ case nir_op_iand:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ if (devinfo->gen >= 8) {
+ op[0] = resolve_source_modifiers(op[0]);
+ op[1] = resolve_source_modifiers(op[1]);
+ }
+ emit(AND(dst, op[0], op[1]));
+ break;
+
+ case nir_op_b2i:
+ case nir_op_b2f:
+ emit(MOV(dst, negate(op[0])));
+ break;
+
+ case nir_op_f2b:
+ emit(CMP(dst, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ));
+ break;
+
+ case nir_op_d2b: {
+ /* We use a MOV with conditional_mod to check if the provided value is
+ * 0.0. We want this to flush denormalized numbers to zero, so we set a
+ * source modifier on the source operand to trigger this, as source
+ * modifiers don't affect the result of the testing against 0.0.
+ */
+ src_reg value = op[0];
+ value.abs = true;
+ vec4_instruction *inst = emit(MOV(dst_null_df(), value));
+ inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+ src_reg one = src_reg(this, glsl_type::ivec4_type);
+ emit(MOV(dst_reg(one), brw_imm_d(~0)));
+ inst = emit(BRW_OPCODE_SEL, dst, one, brw_imm_d(0));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ break;
+ }
+
+ case nir_op_i2b:
+ emit(CMP(dst, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
+ break;
+
+ case nir_op_fnoise1_1:
+ case nir_op_fnoise1_2:
+ case nir_op_fnoise1_3:
+ case nir_op_fnoise1_4:
+ case nir_op_fnoise2_1:
+ case nir_op_fnoise2_2:
+ case nir_op_fnoise2_3:
+ case nir_op_fnoise2_4:
+ case nir_op_fnoise3_1:
+ case nir_op_fnoise3_2:
+ case nir_op_fnoise3_3:
+ case nir_op_fnoise3_4:
+ case nir_op_fnoise4_1:
+ case nir_op_fnoise4_2:
+ case nir_op_fnoise4_3:
+ case nir_op_fnoise4_4:
+ unreachable("not reached: should be handled by lower_noise");
+
+ case nir_op_unpack_half_2x16_split_x:
+ case nir_op_unpack_half_2x16_split_y:
+ case nir_op_pack_half_2x16_split:
+ unreachable("not reached: should not occur in vertex shader");
+
+ case nir_op_unpack_snorm_2x16:
+ case nir_op_unpack_unorm_2x16:
+ case nir_op_pack_snorm_2x16:
+ case nir_op_pack_unorm_2x16:
+ unreachable("not reached: should be handled by lower_packing_builtins");
+
+ case nir_op_pack_uvec4_to_uint:
+ unreachable("not reached");
+
+ case nir_op_pack_uvec2_to_uint: {
+ dst_reg tmp1 = dst_reg(this, glsl_type::uint_type);
+ tmp1.writemask = WRITEMASK_X;
+ op[0].swizzle = BRW_SWIZZLE_YYYY;
+ emit(SHL(tmp1, op[0], src_reg(brw_imm_ud(16u))));
+
+ dst_reg tmp2 = dst_reg(this, glsl_type::uint_type);
+ tmp2.writemask = WRITEMASK_X;
+ op[0].swizzle = BRW_SWIZZLE_XXXX;
+ emit(AND(tmp2, op[0], src_reg(brw_imm_ud(0xffffu))));
+
+ emit(OR(dst, src_reg(tmp1), src_reg(tmp2)));
+ break;
+ }
+
+ case nir_op_pack_64_2x32_split: {
+ dst_reg result = dst_reg(this, glsl_type::dvec4_type);
+ dst_reg tmp = dst_reg(this, glsl_type::uvec4_type);
+ emit(MOV(tmp, retype(op[0], BRW_REGISTER_TYPE_UD)));
+ emit(VEC4_OPCODE_SET_LOW_32BIT, result, src_reg(tmp));
+ emit(MOV(tmp, retype(op[1], BRW_REGISTER_TYPE_UD)));
+ emit(VEC4_OPCODE_SET_HIGH_32BIT, result, src_reg(tmp));
+ emit(MOV(dst, src_reg(result)));
+ break;
+ }
+
+ case nir_op_unpack_64_2x32_split_x:
+ case nir_op_unpack_64_2x32_split_y: {
+ enum opcode oper = (instr->op == nir_op_unpack_64_2x32_split_x) ?
+ VEC4_OPCODE_PICK_LOW_32BIT : VEC4_OPCODE_PICK_HIGH_32BIT;
+ dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
+ emit(MOV(tmp, op[0]));
+ dst_reg tmp2 = dst_reg(this, glsl_type::uvec4_type);
+ emit(oper, tmp2, src_reg(tmp));
+ emit(MOV(dst, src_reg(tmp2)));
+ break;
+ }
+
+ case nir_op_unpack_half_2x16:
+ /* As NIR does not guarantee that we have a correct swizzle outside the
+ * boundaries of a vector, and the implementation of emit_unpack_half_2x16
+ * uses the source operand in an operation with WRITEMASK_Y while our
+ * source operand has only size 1, it accessed incorrect data producing
+ * regressions in Piglit. We repeat the swizzle of the first component on the
+ * rest of components to avoid regressions. In the vec4_visitor IR code path
+ * this is not needed because the operand has already the correct swizzle.
+ */
+ op[0].swizzle = brw_compose_swizzle(BRW_SWIZZLE_XXXX, op[0].swizzle);
+ emit_unpack_half_2x16(dst, op[0]);
+ break;
+
+ case nir_op_pack_half_2x16:
+ emit_pack_half_2x16(dst, op[0]);
+ break;
+
+ case nir_op_unpack_unorm_4x8:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit_unpack_unorm_4x8(dst, op[0]);
+ break;
+
+ case nir_op_pack_unorm_4x8:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit_pack_unorm_4x8(dst, op[0]);
+ break;
+
+ case nir_op_unpack_snorm_4x8:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit_unpack_snorm_4x8(dst, op[0]);
+ break;
+
+ case nir_op_pack_snorm_4x8:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit_pack_snorm_4x8(dst, op[0]);
+ break;
+
+ case nir_op_bitfield_reverse:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit(BFREV(dst, op[0]));
+ break;
+
+ case nir_op_bit_count:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit(CBIT(dst, op[0]));
+ break;
+
+ case nir_op_ufind_msb:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit_find_msb_using_lzd(vec4_builder(this).at_end(), dst, op[0], false);
+ break;
+
+ case nir_op_ifind_msb: {
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ vec4_builder bld = vec4_builder(this).at_end();
+ src_reg src(dst);
+
+ if (devinfo->gen < 7) {
+ emit_find_msb_using_lzd(bld, dst, op[0], true);
+ } else {
+ emit(FBH(retype(dst, BRW_REGISTER_TYPE_UD), op[0]));
+
+ /* FBH counts from the MSB side, while GLSL's findMSB() wants the
+ * count from the LSB side. If FBH didn't return an error
+ * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
+ * count into an LSB count.
+ */
+ bld.CMP(dst_null_d(), src, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
+
+ inst = bld.ADD(dst, src, brw_imm_d(31));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->src[0].negate = true;
+ }
+ break;
+ }
+
+ case nir_op_find_lsb: {
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ vec4_builder bld = vec4_builder(this).at_end();
+
+ if (devinfo->gen < 7) {
+ dst_reg temp = bld.vgrf(BRW_REGISTER_TYPE_D);
+
+ /* (x & -x) generates a value that consists of only the LSB of x.
+ * For all powers of 2, findMSB(y) == findLSB(y).
+ */
+ src_reg src = src_reg(retype(op[0], BRW_REGISTER_TYPE_D));
+ src_reg negated_src = src;
+
+ /* One must be negated, and the other must be non-negated. It
+ * doesn't matter which is which.
+ */
+ negated_src.negate = true;
+ src.negate = false;
+
+ bld.AND(temp, src, negated_src);
+ emit_find_msb_using_lzd(bld, dst, src_reg(temp), false);
+ } else {
+ bld.FBL(dst, op[0]);
+ }
+ break;
+ }
+
+ case nir_op_ubitfield_extract:
+ case nir_op_ibitfield_extract:
+ unreachable("should have been lowered");
+ case nir_op_ubfe:
+ case nir_op_ibfe:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ op[0] = fix_3src_operand(op[0]);
+ op[1] = fix_3src_operand(op[1]);
+ op[2] = fix_3src_operand(op[2]);
+
+ emit(BFE(dst, op[2], op[1], op[0]));
+ break;
+
+ case nir_op_bfm:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit(BFI1(dst, op[0], op[1]));
+ break;
+
+ case nir_op_bfi:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ op[0] = fix_3src_operand(op[0]);
+ op[1] = fix_3src_operand(op[1]);
+ op[2] = fix_3src_operand(op[2]);
+
+ emit(BFI2(dst, op[0], op[1], op[2]));
+ break;
+
+ case nir_op_bitfield_insert:
+ unreachable("not reached: should have been lowered");
+
+ case nir_op_fsign:
+ if (type_sz(op[0].type) < 8) {
+ /* AND(val, 0x80000000) gives the sign bit.
+ *
+ * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
+ * zero.
+ */
+ emit(CMP(dst_null_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ));
+
+ op[0].type = BRW_REGISTER_TYPE_UD;
+ dst.type = BRW_REGISTER_TYPE_UD;
+ emit(AND(dst, op[0], brw_imm_ud(0x80000000u)));
+
+ inst = emit(OR(dst, src_reg(dst), brw_imm_ud(0x3f800000u)));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ dst.type = BRW_REGISTER_TYPE_F;
+
+ if (instr->dest.saturate) {
+ inst = emit(MOV(dst, src_reg(dst)));
+ inst->saturate = true;
+ }
+ } else {
+ /* For doubles we do the same but we need to consider:
+ *
+ * - We use a MOV with conditional_mod instead of a CMP so that we can
+ * skip loading a 0.0 immediate. We use a source modifier on the
+ * source of the MOV so that we flush denormalized values to 0.
+ * Since we want to compare against 0, this won't alter the result.
+ * - We need to extract the high 32-bit of each DF where the sign
+ * is stored.
+ * - We need to produce a DF result.
+ */
+
+ /* Check for zero */
+ src_reg value = op[0];
+ value.abs = true;
+ inst = emit(MOV(dst_null_df(), value));
+ inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+ /* AND each high 32-bit channel with 0x80000000u */
+ dst_reg tmp = dst_reg(this, glsl_type::uvec4_type);
+ emit(VEC4_OPCODE_PICK_HIGH_32BIT, tmp, op[0]);
+ emit(AND(tmp, src_reg(tmp), brw_imm_ud(0x80000000u)));
+
+ /* Add 1.0 to each channel, predicated to skip the cases where the
+ * channel's value was 0
+ */
+ inst = emit(OR(tmp, src_reg(tmp), brw_imm_ud(0x3f800000u)));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+
+ /* Now convert the result from float to double */
+ emit_conversion_to_double(dst, src_reg(tmp), instr->dest.saturate,
+ BRW_REGISTER_TYPE_F);
+ }
+ break;
+
+ case nir_op_isign:
+ /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
+ * -> non-negative val generates 0x00000000.
+ * Predicated OR sets 1 if val is positive.
+ */
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G));
+ emit(ASR(dst, op[0], brw_imm_d(31)));
+ inst = emit(OR(dst, src_reg(dst), brw_imm_d(1)));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ break;
+
+ case nir_op_ishl:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit(SHL(dst, op[0], op[1]));
+ break;
+
+ case nir_op_ishr:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit(ASR(dst, op[0], op[1]));
+ break;
+
+ case nir_op_ushr:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit(SHR(dst, op[0], op[1]));
+ break;
+
+ case nir_op_ffma:
+ if (type_sz(dst.type) == 8) {
+ dst_reg mul_dst = dst_reg(this, glsl_type::dvec4_type);
+ emit(MUL(mul_dst, op[1], op[0]));
+ inst = emit(ADD(dst, src_reg(mul_dst), op[2]));
+ inst->saturate = instr->dest.saturate;
+ } else {
+ op[0] = fix_3src_operand(op[0]);
+ op[1] = fix_3src_operand(op[1]);
+ op[2] = fix_3src_operand(op[2]);
+
+ inst = emit(MAD(dst, op[2], op[1], op[0]));
+ inst->saturate = instr->dest.saturate;
+ }
+ break;
+
+ case nir_op_flrp:
+ inst = emit_lrp(dst, op[0], op[1], op[2]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_bcsel:
+ enum brw_predicate predicate;
+ if (!optimize_predicate(instr, &predicate)) {
+ emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
+ switch (dst.writemask) {
+ case WRITEMASK_X:
+ predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X;
+ break;
+ case WRITEMASK_Y:
+ predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+ break;
+ case WRITEMASK_Z:
+ predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+ break;
+ case WRITEMASK_W:
+ predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W;
+ break;
+ default:
+ predicate = BRW_PREDICATE_NORMAL;
+ break;
+ }
+ }
+ inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]);
+ inst->predicate = predicate;
+ break;
+
+ case nir_op_fdot_replicated2:
+ inst = emit(BRW_OPCODE_DP2, dst, op[0], op[1]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fdot_replicated3:
+ inst = emit(BRW_OPCODE_DP3, dst, op[0], op[1]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fdot_replicated4:
+ inst = emit(BRW_OPCODE_DP4, dst, op[0], op[1]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_fdph_replicated:
+ inst = emit(BRW_OPCODE_DPH, dst, op[0], op[1]);
+ inst->saturate = instr->dest.saturate;
+ break;
+
+ case nir_op_iabs:
+ case nir_op_ineg:
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ case nir_op_fabs:
+ case nir_op_fneg:
+ case nir_op_fsat:
+ unreachable("not reached: should be lowered by lower_source mods");
+
+ case nir_op_fdiv:
+ unreachable("not reached: should be lowered by DIV_TO_MUL_RCP in the compiler");
+
+ case nir_op_fmod:
+ unreachable("not reached: should be lowered by MOD_TO_FLOOR in the compiler");
+
+ case nir_op_fsub:
+ case nir_op_isub:
+ unreachable("not reached: should be handled by ir_sub_to_add_neg");
+
+ default:
+ unreachable("Unimplemented ALU operation");
+ }
+
+ /* If we need to do a boolean resolve, replace the result with -(x & 1)
+ * to sign extend the low bit to 0/~0
+ */
+ if (devinfo->gen <= 5 &&
+ (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) ==
+ BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
+ dst_reg masked = dst_reg(this, glsl_type::int_type);
+ masked.writemask = dst.writemask;
+ emit(AND(masked, src_reg(dst), brw_imm_d(1)));
+ src_reg masked_neg = src_reg(masked);
+ masked_neg.negate = true;
+ emit(MOV(retype(dst, BRW_REGISTER_TYPE_D), masked_neg));
+ }
+}
+
+/**
+ * Emit the back-end control-flow opcode for a NIR jump instruction.
+ *
+ * Only break and continue are handled.  Every other jump type, including
+ * nir_jump_return, is treated as unreachable.
+ */
+void
+vec4_visitor::nir_emit_jump(nir_jump_instr *instr)
+{
+   if (instr->type == nir_jump_break) {
+      emit(BRW_OPCODE_BREAK);
+      return;
+   }
+
+   if (instr->type == nir_jump_continue) {
+      emit(BRW_OPCODE_CONTINUE);
+      return;
+   }
+
+   /* nir_jump_return and anything unrecognised end up here. */
+   unreachable("unknown jump");
+}
+
+/**
+ * Map a NIR texture opcode to the ir_texture_opcode of the same name.
+ *
+ * Any NIR opcode not listed below is treated as unreachable.
+ */
+enum ir_texture_opcode
+ir_texture_opcode_for_nir_texop(nir_texop texop)
+{
+   switch (texop) {
+   case nir_texop_lod:
+      return ir_lod;
+   case nir_texop_query_levels:
+      return ir_query_levels;
+   case nir_texop_texture_samples:
+      return ir_texture_samples;
+   case nir_texop_tex:
+      return ir_tex;
+   case nir_texop_tg4:
+      return ir_tg4;
+   case nir_texop_txb:
+      return ir_txb;
+   case nir_texop_txd:
+      return ir_txd;
+   case nir_texop_txf:
+      return ir_txf;
+   case nir_texop_txf_ms:
+      return ir_txf_ms;
+   case nir_texop_txl:
+      return ir_txl;
+   case nir_texop_txs:
+      return ir_txs;
+   case nir_texop_samples_identical:
+      return ir_samples_identical;
+   default:
+      unreachable("unknown texture opcode");
+   }
+}
+/**
+ * Return the glsl_type instance whose base type corresponds to @alu_type
+ * and which has @components components.
+ */
+const glsl_type *
+glsl_type_for_nir_alu_type(nir_alu_type alu_type,
+                           unsigned components)
+{
+   /* NOTE(review): the last get_instance() argument is presumably the
+    * column count (1 == not a matrix) -- confirm against glsl_types.h.
+    */
+   const unsigned columns = 1;
+
+   return glsl_type::get_instance(brw_glsl_base_type_for_nir_type(alu_type),
+                                  components, columns);
+}
+
+/**
+ * Translate a NIR texture instruction into a call to emit_texture().
+ *
+ * The function walks instr->src[], loading each source it understands into
+ * a src_reg (coordinate, comparator, LOD / derivatives, sample index,
+ * offsets, dynamic texture / sampler indices), then fetches the MCS value
+ * for multisample fetches, folds the gather channel into the constant
+ * offset for tg4, and finally hands everything to emit_texture().
+ *
+ * Projector and bias sources, and unknown source types, are treated as
+ * unreachable.
+ */
+void
+vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
+{
+ unsigned texture = instr->texture_index;
+ unsigned sampler = instr->sampler_index;
+ /* Start with immediate texture/sampler indices; these are replaced below
+ * when the instruction carries a texture_offset / sampler_offset source.
+ */
+ src_reg texture_reg = brw_imm_ud(texture);
+ src_reg sampler_reg = brw_imm_ud(sampler);
+ src_reg coordinate;
+ const glsl_type *coord_type = NULL;
+ src_reg shadow_comparator;
+ src_reg offset_value;
+ src_reg lod, lod2;
+ src_reg sample_index;
+ src_reg mcs;
+
+ const glsl_type *dest_type =
+ glsl_type_for_nir_alu_type(instr->dest_type,
+ nir_tex_instr_dest_size(instr));
+ dst_reg dest = get_nir_dest(instr->dest, instr->dest_type);
+
+ /* The hardware requires a LOD for buffer textures */
+ if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
+ lod = brw_imm_d(0);
+
+ /* Load the texture operation sources */
+ uint32_t constant_offset = 0;
+ for (unsigned i = 0; i < instr->num_srcs; i++) {
+ switch (instr->src[i].src_type) {
+ case nir_tex_src_comparator:
+ shadow_comparator = get_nir_src(instr->src[i].src,
+ BRW_REGISTER_TYPE_F, 1);
+ break;
+
+ case nir_tex_src_coord: {
+ unsigned src_size = nir_tex_instr_src_size(instr, i);
+
+ /* Texel fetches take integer coordinates; everything else is read
+ * as float.  coord_type records the matching GLSL vector type.
+ */
+ switch (instr->op) {
+ case nir_texop_txf:
+ case nir_texop_txf_ms:
+ case nir_texop_samples_identical:
+ coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D,
+ src_size);
+ coord_type = glsl_type::ivec(src_size);
+ break;
+
+ default:
+ coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
+ src_size);
+ coord_type = glsl_type::vec(src_size);
+ break;
+ }
+ break;
+ }
+
+ /* Explicit derivatives are carried in the lod / lod2 registers. */
+ case nir_tex_src_ddx:
+ lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
+ nir_tex_instr_src_size(instr, i));
+ break;
+
+ case nir_tex_src_ddy:
+ lod2 = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
+ nir_tex_instr_src_size(instr, i));
+ break;
+
+ case nir_tex_src_lod:
+ /* txs and txf read the LOD as an integer, other ops as a float. */
+ switch (instr->op) {
+ case nir_texop_txs:
+ case nir_texop_txf:
+ lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1);
+ break;
+
+ default:
+ lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, 1);
+ break;
+ }
+ break;
+
+ case nir_tex_src_ms_index: {
+ sample_index = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1);
+ break;
+ }
+
+ case nir_tex_src_offset: {
+ /* Use the register form of the offset only when the source is not a
+ * constant or brw_texture_offset() returns false for it; otherwise
+ * constant_offset is what gets passed on.
+ */
+ nir_const_value *const_offset =
+ nir_src_as_const_value(instr->src[i].src);
+ if (!const_offset ||
+ !brw_texture_offset(const_offset->i32,
+ nir_tex_instr_src_size(instr, i),
+ &constant_offset)) {
+ offset_value =
+ get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2);
+ }
+ break;
+ }
+
+ case nir_tex_src_texture_offset: {
+ /* The highest texture which may be used by this operation is
+ * the last element of the array. Mark it here, because the generator
+ * doesn't have enough information to determine the bound.
+ */
+ uint32_t array_size = instr->texture_array_size;
+ uint32_t max_used = texture + array_size - 1;
+ if (instr->op == nir_texop_tg4) {
+ max_used += prog_data->base.binding_table.gather_texture_start;
+ } else {
+ max_used += prog_data->base.binding_table.texture_start;
+ }
+
+ brw_mark_surface_used(&prog_data->base, max_used);
+
+ /* Emit code to evaluate the actual indexing expression */
+ src_reg src = get_nir_src(instr->src[i].src, 1);
+ src_reg temp(this, glsl_type::uint_type);
+ emit(ADD(dst_reg(temp), src, brw_imm_ud(texture)));
+ texture_reg = emit_uniformize(temp);
+ break;
+ }
+
+ case nir_tex_src_sampler_offset: {
+ /* Emit code to evaluate the actual indexing expression */
+ src_reg src = get_nir_src(instr->src[i].src, 1);
+ src_reg temp(this, glsl_type::uint_type);
+ emit(ADD(dst_reg(temp), src, brw_imm_ud(sampler)));
+ sampler_reg = emit_uniformize(temp);
+ break;
+ }
+
+ case nir_tex_src_projector:
+ unreachable("Should be lowered by do_lower_texture_projection");
+
+ case nir_tex_src_bias:
+ unreachable("LOD bias is not valid for vertex shaders.\n");
+
+ default:
+ unreachable("unknown texture source");
+ }
+ }
+
+ /* Multisample fetches need an MCS value: fetched from the surface on
+ * gen7+ when the key flags this texture as using a compressed multisample
+ * layout, zero otherwise.
+ *
+ * NOTE(review): "1 << texture" here and in the tg4 quirk check below
+ * shifts a signed int by the texture index -- confirm the index is always
+ * below 31.
+ */
+ if (instr->op == nir_texop_txf_ms ||
+ instr->op == nir_texop_samples_identical) {
+ assert(coord_type != NULL);
+ if (devinfo->gen >= 7 &&
+ key_tex->compressed_multisample_layout_mask & (1 << texture)) {
+ mcs = emit_mcs_fetch(coord_type, coordinate, texture_reg);
+ } else {
+ mcs = brw_imm_ud(0u);
+ }
+ }
+
+ /* Stuff the channel select bits in the top of the texture offset */
+ if (instr->op == nir_texop_tg4) {
+ if (instr->component == 1 &&
+ (key_tex->gather_channel_quirk_mask & (1 << texture))) {
+ /* gather4 sampler is broken for green channel on RG32F --
+ * we must ask for blue instead.
+ */
+ constant_offset |= 2 << 16;
+ } else {
+ constant_offset |= instr->component << 16;
+ }
+ }
+
+ ir_texture_opcode op = ir_texture_opcode_for_nir_texop(instr->op);
+
+ emit_texture(op, dest, dest_type, coordinate, instr->coord_components,
+ shadow_comparator,
+ lod, lod2, sample_index,
+ constant_offset, offset_value, mcs,
+ texture, texture_reg, sampler_reg);
+}
+
+/**
+ * Give an undefined SSA value a freshly allocated VGRF so later uses of it
+ * have a register to refer to.  No instructions are emitted.
+ */
+void
+vec4_visitor::nir_emit_undef(nir_ssa_undef_instr *instr)
+{
+   /* Allocation size is the value's bit size in 32-bit units, rounded up. */
+   const unsigned size = DIV_ROUND_UP(instr->def.bit_size, 32);
+
+   nir_ssa_values[instr->def.index] = dst_reg(VGRF, alloc.allocate(size));
+}
+
+/* SIMD4x2 64bit data is stored in register space like this:
+ *
+ * r0.0:DF x0 y0 z0 w0
+ * r1.0:DF x1 y1 z1 w1
+ *
+ * When we need to write data such as this to memory using 32-bit write
+ * messages we need to shuffle it in this fashion:
+ *
+ * r0.0:DF x0 y0 x1 y1 (to be written at base offset)
+ * r0.0:DF z0 w0 z1 w1 (to be written at base offset + 16)
+ *
+ * We need to do the inverse operation when we read using 32-bit messages,
+ * which we can do by applying the same exact shuffling on the 64-bit data
+ * read, only that because the data for each vertex is positioned differently
+ * we need to apply different channel enables.
+ *
+ * This function takes 64bit data and shuffles it as explained above.
+ *
+ * The @for_write parameter is used to specify if the shuffling is being done
+ * for proper SIMD4x2 64-bit data that needs to be shuffled prior to a 32-bit
+ * write message (for_write = true), or instead we are doing the inverse
+ * operation and we have just read 64-bit data using 32-bit messages that we
+ * need to shuffle to create valid SIMD4x2 64-bit data (for_write = false).
+ *
+ * If @block and @ref are non-NULL, then the shuffling is done after @ref,
+ * otherwise the instructions are emitted normally at the end. The function
+ * returns the last instruction inserted.
+ *
+ * Notice that @src and @dst cannot be the same register.
+ */
+vec4_instruction *
+vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write,
+                                 bblock_t *block, vec4_instruction *ref)
+{
+   /* Both operands must be 64-bit, must not overlap over two registers,
+    * and @block / @ref must be given together or not at all.
+    */
+   assert(type_sz(src.type) == 8);
+   assert(type_sz(dst.type) == 8);
+   assert(!regions_overlap(dst, 2 * REG_SIZE, src, 2 * REG_SIZE));
+   assert(!ref == !block);
+
+   /* Insert right after @ref when one is given, else append at the end. */
+   const vec4_builder bld = ref ? vec4_builder(this).at(block, ref->next)
+                                : vec4_builder(this).at_end();
+
+   /* A non-identity source swizzle is first resolved into a temporary so
+    * the shuffling MOVs below can apply their own swizzles.
+    */
+   if (src.swizzle != BRW_SWIZZLE_XYZW) {
+      dst_reg unswizzled = dst_reg(this, glsl_type::dvec4_type);
+      bld.MOV(unswizzled, src);
+      src = src_reg(unswizzled);
+   }
+
+   /* Second register of each two-register 64-bit value. */
+   const src_reg src_hi = byte_offset(src, REG_SIZE);
+   const dst_reg dst_hi = byte_offset(dst, REG_SIZE);
+
+   /* The two "crossing" MOVs use opposite channel groups depending on the
+    * direction of the shuffle.
+    */
+   const unsigned zw_group = for_write ? 1 : 0;
+   const unsigned xy_group = for_write ? 0 : 1;
+
+   /* dst+0.XY = src+0.XY */
+   bld.group(4, 0).MOV(writemask(dst, WRITEMASK_XY), src);
+
+   /* dst+0.ZW = src+1.XY */
+   bld.group(4, zw_group).MOV(writemask(dst, WRITEMASK_ZW),
+                              swizzle(src_hi, BRW_SWIZZLE_XYXY));
+
+   /* dst+1.XY = src+0.ZW */
+   bld.group(4, xy_group).MOV(writemask(dst_hi, WRITEMASK_XY),
+                              swizzle(src, BRW_SWIZZLE_ZWZW));
+
+   /* dst+1.ZW = src+1.ZW -- this last MOV is what the caller gets back. */
+   return bld.group(4, 1).MOV(writemask(dst_hi, WRITEMASK_ZW), src_hi);
+}
+
+}
diff --git a/src/intel/compiler/brw_vec4_reg_allocate.cpp b/src/intel/compiler/brw_vec4_reg_allocate.cpp
new file mode 100644
index 00000000000..e3b46cc2f7f
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_reg_allocate.cpp
@@ -0,0 +1,558 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/register_allocate.h"
+#include "brw_vec4.h"
+#include "brw_cfg.h"
+
+using namespace brw;
+
+namespace brw {
+
+/**
+ * Rewrite a VGRF register reference to its assigned hardware register.
+ *
+ * @reg_hw_locations maps a virtual register number to its first hardware
+ * register.  Whole registers contained in reg->offset are folded into
+ * reg->nr, leaving only the sub-register remainder in reg->offset.
+ * Registers in any other file are left untouched.
+ */
+static void
+assign(unsigned int *reg_hw_locations, backend_reg *reg)
+{
+   if (reg->file != VGRF)
+      return;
+
+   const unsigned whole_regs = reg->offset / REG_SIZE;
+
+   reg->nr = reg_hw_locations[reg->nr] + whole_regs;
+   reg->offset = reg->offset % REG_SIZE;
+}
+
+/**
+ * Trivial register allocator: every VGRF that is still referenced gets its
+ * own consecutive range of hardware registers after the payload.
+ *
+ * Returns false (after calling fail()) when the total exceeds max_grf.
+ */
+bool
+vec4_visitor::reg_allocate_trivial()
+{
+   unsigned int hw_reg_mapping[this->alloc.count];
+   bool vgrf_is_referenced[this->alloc.count];
+
+   /* Work out which virtual GRFs are still referenced by any instruction
+    * after whatever optimization passes have run.
+    */
+   for (unsigned v = 0; v < this->alloc.count; v++)
+      vgrf_is_referenced[v] = false;
+
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      if (inst->dst.file == VGRF)
+         vgrf_is_referenced[inst->dst.nr] = true;
+
+      for (unsigned s = 0; s < 3; s++) {
+         if (inst->src[s].file == VGRF)
+            vgrf_is_referenced[inst->src[s].nr] = true;
+      }
+   }
+
+   /* VGRF 0 is always placed first, right after the payload; the remaining
+    * referenced VGRFs are packed behind it in index order.
+    */
+   hw_reg_mapping[0] = this->first_non_payload_grf;
+   int next = hw_reg_mapping[0] + this->alloc.sizes[0];
+   for (unsigned v = 1; v < this->alloc.count; v++) {
+      if (!vgrf_is_referenced[v])
+         continue;
+
+      hw_reg_mapping[v] = next;
+      next += this->alloc.sizes[v];
+   }
+   prog_data->total_grf = next;
+
+   /* Rewrite every register reference to its hardware location. */
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      assign(hw_reg_mapping, &inst->dst);
+      for (unsigned s = 0; s < 3; s++)
+         assign(hw_reg_mapping, &inst->src[s]);
+   }
+
+   if (prog_data->total_grf <= max_grf)
+      return true;
+
+   fail("Ran out of regs on trivial allocator (%d/%d)\n",
+        prog_data->total_grf, max_grf);
+   return false;
+}
+
+/**
+ * Build the register set used by the vec4 graph-coloring allocator and
+ * store it in compiler->vec4_reg_set (regs, classes, ra_reg_to_grf).
+ *
+ * One register class is created per VGRF size from 1 to MAX_VGRF_SIZE.
+ * Any previously stored set is released with ralloc_free() first.
+ */
+extern "C" void
+brw_vec4_alloc_reg_set(struct brw_compiler *compiler)
+{
+   const int base_reg_count =
+      compiler->devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
+
+   /* After running split_virtual_grfs(), almost all VGRFs will be of size 1.
+    * SEND-from-GRF sources cannot be split, so we also need classes for each
+    * potential message length.
+    *
+    * Class c holds registers of size c + 1.  While filling in the sizes,
+    * also total the number of allocator registers across all classes: a
+    * class of size s has base_reg_count - (s - 1) possible start positions.
+    */
+   const int class_count = MAX_VGRF_SIZE;
+   int class_sizes[MAX_VGRF_SIZE];
+   int ra_reg_count = 0;
+
+   for (int c = 0; c < class_count; c++) {
+      class_sizes[c] = c + 1;
+      ra_reg_count += base_reg_count - c;
+   }
+
+   ralloc_free(compiler->vec4_reg_set.ra_reg_to_grf);
+   compiler->vec4_reg_set.ra_reg_to_grf =
+      ralloc_array(compiler, uint8_t, ra_reg_count);
+   ralloc_free(compiler->vec4_reg_set.regs);
+   compiler->vec4_reg_set.regs =
+      ra_alloc_reg_set(compiler, ra_reg_count, false);
+   if (compiler->devinfo->gen >= 6)
+      ra_set_allocate_round_robin(compiler->vec4_reg_set.regs);
+   ralloc_free(compiler->vec4_reg_set.classes);
+   compiler->vec4_reg_set.classes = ralloc_array(compiler, int, class_count);
+
+   /* Now, add the registers to their classes, and add the conflicts
+    * between them and the base GRF registers (and also each other).
+    */
+   int reg = 0;
+   unsigned *q_values[MAX_VGRF_SIZE];
+   for (int c = 0; c < class_count; c++) {
+      const int size = class_sizes[c];
+      const int class_reg_count = base_reg_count - (size - 1);
+
+      compiler->vec4_reg_set.classes[c] =
+         ra_alloc_reg_class(compiler->vec4_reg_set.regs);
+
+      /* Calculate the q values manually because the algorithm used by
+       * ra_set_finalize() to do it has higher complexity affecting the
+       * start-up time of some applications.  q(i, j) is just the maximum
+       * number of registers from class i a register from class j can
+       * conflict with.
+       */
+      q_values[c] = new unsigned[MAX_VGRF_SIZE];
+      for (int other = 0; other < class_count; other++)
+         q_values[c][other] = size + class_sizes[other] - 1;
+
+      for (int grf = 0; grf < class_reg_count; grf++, reg++) {
+         ra_class_add_reg(compiler->vec4_reg_set.regs,
+                          compiler->vec4_reg_set.classes[c], reg);
+
+         compiler->vec4_reg_set.ra_reg_to_grf[reg] = grf;
+
+         /* This register conflicts with each base GRF it covers. */
+         for (int covered = grf; covered < grf + size; covered++) {
+            ra_add_reg_conflict(compiler->vec4_reg_set.regs, covered, reg);
+         }
+      }
+   }
+   assert(reg == ra_reg_count);
+
+   for (int grf = 0; grf < base_reg_count; grf++)
+      ra_make_reg_conflicts_transitive(compiler->vec4_reg_set.regs, grf);
+
+   ra_set_finalize(compiler->vec4_reg_set.regs, q_values);
+
+   /* The q tables are released as soon as finalization has run. */
+   for (int c = 0; c < MAX_VGRF_SIZE; c++)
+      delete[] q_values[c];
+}
+
+/**
+ * Pin the payload registers in the interference graph.
+ *
+ * Payload node i (graph node first_payload_node + i) is fixed to physical
+ * register i and marked as interfering with nodes 0 .. reg_node_count - 1.
+ */
+void
+vec4_visitor::setup_payload_interference(struct ra_graph *g,
+                                         int first_payload_node,
+                                         int reg_node_count)
+{
+   const int payload_node_count = this->first_non_payload_grf;
+
+   for (int payload = 0; payload < payload_node_count; payload++) {
+      const int node = first_payload_node + payload;
+
+      /* Mark each payload reg node as being allocated to its physical
+       * register.
+       *
+       * The alternative would be to have per-physical register classes,
+       * which would just be silly.
+       */
+      ra_set_node_reg(g, node, payload);
+
+      /* For now, just mark each payload node as interfering with every
+       * other node to be allocated.
+       */
+      for (int other = 0; other < reg_node_count; other++)
+         ra_add_node_interference(g, node, other);
+   }
+}
+
+/**
+ * Graph-coloring register allocation for the vec4 back-end.
+ *
+ * Returns true once every VGRF reference has been rewritten to a hardware
+ * register.  Returns false when allocation failed; in that case either a
+ * register was spilled (the caller is expected to call again) or fail()
+ * was invoked.
+ */
+bool
+vec4_visitor::reg_allocate()
+{
+   unsigned int hw_reg_mapping[alloc.count];
+   const int payload_reg_count = this->first_non_payload_grf;
+
+   /* Using the trivial allocator can be useful in debugging undefined
+    * register access as a result of broken optimization passes.
+    */
+   if (0)
+      return reg_allocate_trivial();
+
+   calculate_live_intervals();
+
+   /* Graph layout: one node per VGRF, followed by one per payload register. */
+   const int first_payload_node = alloc.count;
+   const int node_count = first_payload_node + payload_reg_count;
+   struct ra_graph *g =
+      ra_alloc_interference_graph(compiler->vec4_reg_set.regs, node_count);
+
+   for (unsigned v = 0; v < alloc.count; v++) {
+      const int size = this->alloc.sizes[v];
+      assert(size >= 1 && size <= MAX_VGRF_SIZE);
+      ra_set_node_class(g, v, compiler->vec4_reg_set.classes[size - 1]);
+
+      for (unsigned earlier = 0; earlier < v; earlier++) {
+         if (virtual_grf_interferes(v, earlier))
+            ra_add_node_interference(g, v, earlier);
+      }
+   }
+
+   /* Certain instructions can't safely use the same register for their
+    * sources and destination.  Add interference.
+    */
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      if (inst->dst.file != VGRF ||
+          !inst->has_source_and_destination_hazard())
+         continue;
+
+      for (unsigned s = 0; s < 3; s++) {
+         if (inst->src[s].file == VGRF)
+            ra_add_node_interference(g, inst->dst.nr, inst->src[s].nr);
+      }
+   }
+
+   setup_payload_interference(g, first_payload_node, node_count);
+
+   if (!ra_allocate(g)) {
+      /* Failed to allocate registers.  Spill a reg, and the caller will
+       * loop back into here to try again.
+       */
+      const int spill_candidate = choose_spill_reg(g);
+      if (this->no_spills) {
+         fail("Failure to register allocate.  Reduce number of live "
+              "values to avoid this.");
+      } else if (spill_candidate == -1) {
+         fail("no register to spill\n");
+      } else {
+         spill_reg(spill_candidate);
+      }
+      ralloc_free(g);
+      return false;
+   }
+
+   /* Get the chosen virtual registers for each node, and map virtual
+    * regs in the register classes back down to real hardware reg
+    * numbers.
+    */
+   prog_data->total_grf = payload_reg_count;
+   for (unsigned v = 0; v < alloc.count; v++) {
+      const int ra_reg = ra_get_node_reg(g, v);
+
+      hw_reg_mapping[v] = compiler->vec4_reg_set.ra_reg_to_grf[ra_reg];
+      prog_data->total_grf = MAX2(prog_data->total_grf,
+                                  hw_reg_mapping[v] + alloc.sizes[v]);
+   }
+
+   /* Rewrite every register reference to its hardware location. */
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      assign(hw_reg_mapping, &inst->dst);
+      for (unsigned s = 0; s < 3; s++)
+         assign(hw_reg_mapping, &inst->src[s]);
+   }
+
+   ralloc_free(g);
+
+   return true;
+}
+
+/**
+ * When we decide to spill a register, instead of blindly spilling every use,
+ * save unspills when the spill register is used (read) in consecutive
+ * instructions. This can potentially save a bunch of unspills that would
+ * have very little impact in register allocation anyway.
+ *
+ * Notice that we need to account for this behavior when spilling a register
+ * and when evaluating spilling costs. This function is designed so it can
+ * be called from both places and avoid repeating the logic.
+ *
+ * - When we call this function from spill_reg(), we pass in scratch_reg the
+ * actual unspill/spill register that we want to reuse in the current
+ * instruction.
+ *
+ * - When we call this from evaluate_spill_costs(), we pass the register for
+ * which we are evaluating spilling costs.
+ *
+ * In either case, we check if the previous instructions read scratch_reg until
+ * we find one that writes to it with a compatible mask or does not read/write
+ * scratch_reg at all.
+ *
+ * \param inst         instruction whose source is being examined
+ * \param i            index into inst->src; that source must be a VGRF
+ * \param scratch_reg  register number searched for in the earlier sources of
+ *                     \p inst and in the preceding instructions
+ * \return true if source \p i can reuse \p scratch_reg, false if a fresh
+ *         unspill is needed for it
+ */
+static bool
+can_use_scratch_for_source(const vec4_instruction *inst, unsigned i,
+ unsigned scratch_reg)
+{
+ assert(inst->src[i].file == VGRF);
+ bool prev_inst_read_scratch_reg = false;
+
+ /* See if any previous source in the same instruction reads scratch_reg */
+ for (unsigned n = 0; n < i; n++) {
+ if (inst->src[n].file == VGRF && inst->src[n].nr == scratch_reg)
+ prev_inst_read_scratch_reg = true;
+ }
+
+ /* Now check if previous instructions read/write scratch_reg.
+ *
+ * The walk follows the prev links and stops at the list's head sentinel.
+ */
+ for (vec4_instruction *prev_inst = (vec4_instruction *) inst->prev;
+ !prev_inst->is_head_sentinel();
+ prev_inst = (vec4_instruction *) prev_inst->prev) {
+
+ /* If the previous instruction writes to scratch_reg then we can reuse
+ * it if the write is not conditional and the channels we write are
+ * compatible with our read mask
+ *
+ * A predicated SEL is accepted as well, and "compatible" means every
+ * channel named by our swizzle is present in the write mask.
+ */
+ if (prev_inst->dst.file == VGRF && prev_inst->dst.nr == scratch_reg) {
+ return (!prev_inst->predicate || prev_inst->opcode == BRW_OPCODE_SEL) &&
+ (brw_mask_for_swizzle(inst->src[i].swizzle) &
+ ~prev_inst->dst.writemask) == 0;
+ }
+
+ /* Skip scratch read/writes so that instructions generated by spilling
+ * other registers (that won't read/write scratch_reg) do not stop us from
+ * reusing scratch_reg for this instruction.
+ */
+ if (prev_inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE ||
+ prev_inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ)
+ continue;
+
+ /* If the previous instruction does not write to scratch_reg, then check
+ * if it reads it
+ */
+ int n; /* left at 3 when none of the three sources matched */
+ for (n = 0; n < 3; n++) {
+ if (prev_inst->src[n].file == VGRF &&
+ prev_inst->src[n].nr == scratch_reg) {
+ prev_inst_read_scratch_reg = true;
+ break;
+ }
+ }
+ if (n == 3) {
+ /* The previous instruction does not read scratch_reg. At this point,
+ * if no previous instruction has read scratch_reg it means that we
+ * will need to unspill it here and we can't reuse it (so we return
+ * false). Otherwise, if we found at least one consecutive instruction
+ * that read scratch_reg, then we know that we got here from
+ * evaluate_spill_costs (since for the spill_reg path any block of
+ * consecutive instructions using scratch_reg must start with a write
+ * to that register, so we would've exited the loop in the check for
+ * the write that we have at the start of this loop), and in that case
+ * it means that we found the point at which the scratch_reg would be
+ * unspilled. Since we always unspill a full vec4, it means that we
+ * have all the channels available and we can just return true to
+ * signal that we can reuse the register in the current instruction
+ * too.
+ */
+ return prev_inst_read_scratch_reg;
+ }
+ }
+
+ /* Reached the head sentinel without finding a write to scratch_reg. */
+ return prev_inst_read_scratch_reg;
+}
+
+/**
+ * Relative cost of one spill or unspill of a register accessed with
+ * \p type.
+ *
+ * The result is a float on purpose: the 64-bit weight is fractional, and
+ * the callers multiply it by a float loop scale and accumulate it into
+ * float spill costs. Returning an integer type would silently truncate
+ * 2.25 to 2.
+ */
+static inline float
+spill_cost_for_type(enum brw_reg_type type)
+{
+ /* Spilling of a 64-bit register involves emitting 2 32-bit scratch
+ * messages plus the 64b/32b shuffling code.
+ */
+ return type_sz(type) == 8 ? 2.25f : 1.0f;
+}
+/**
+ * Computes, for every virtual GRF, an estimated cost of spilling it and
+ * whether it may be spilled at all.
+ *
+ * \param spill_costs  output array with alloc.count entries; entry i is
+ *                     reset to 0 and then accumulates the cost of the
+ *                     spills/unspills register i would need
+ * \param no_spill     output array with alloc.count entries; entry i is
+ *                     true when register i must not be spilled
+ *
+ * Registers whose size is neither 1 nor 2 start out as not spillable. Once
+ * a register is marked no_spill, no further cost is accumulated for it.
+ */
+void
+vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
+{
+ float loop_scale = 1.0;
+
+ unsigned *reg_type_size = (unsigned *) /* 0 = no access seen yet */
+ ralloc_size(NULL, this->alloc.count * sizeof(unsigned));
+
+ for (unsigned i = 0; i < this->alloc.count; i++) {
+ spill_costs[i] = 0.0;
+ no_spill[i] = alloc.sizes[i] != 1 && alloc.sizes[i] != 2;
+ reg_type_size[i] = 0;
+ }
+
+ /* Calculate costs for spilling nodes. Call it a cost of 1 per
+ * spill/unspill we'll have to do, and guess that the insides of
+ * loops run 10 times.
+ */
+ foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+ for (unsigned int i = 0; i < 3; i++) {
+ if (inst->src[i].file == VGRF && !no_spill[inst->src[i].nr]) {
+ /* We will only unspill src[i] if it wasn't unspilled for the
+ * previous instruction, in which case we'll just reuse the scratch
+ * reg for this instruction.
+ */
+ if (!can_use_scratch_for_source(inst, i, inst->src[i].nr)) {
+ spill_costs[inst->src[i].nr] +=
+ loop_scale * spill_cost_for_type(inst->src[i].type);
+ if (inst->src[i].reladdr ||
+ inst->src[i].offset >= REG_SIZE)
+ no_spill[inst->src[i].nr] = true;
+
+ /* We don't support unspills of partial DF reads.
+ *
+ * Our 64-bit unspills are implemented with two 32-bit scratch
+ * messages, each one reading that for both SIMD4x2 threads that
+ * we need to shuffle into correct 64-bit data. Ensure that we
+ * are reading data for both threads.
+ */
+ if (type_sz(inst->src[i].type) == 8 && inst->exec_size != 8)
+ no_spill[inst->src[i].nr] = true;
+ }
+
+ /* We can't spill registers that mix 32-bit and 64-bit access (that
+ * contain 64-bit data that is operated on via 32-bit instructions)
+ *
+ * The first access seen records the type size; any later access
+ * with a different size disqualifies the register.
+ */
+ unsigned type_size = type_sz(inst->src[i].type);
+ if (reg_type_size[inst->src[i].nr] == 0)
+ reg_type_size[inst->src[i].nr] = type_size;
+ else if (reg_type_size[inst->src[i].nr] != type_size)
+ no_spill[inst->src[i].nr] = true;
+ }
+ }
+
+ if (inst->dst.file == VGRF && !no_spill[inst->dst.nr]) {
+ spill_costs[inst->dst.nr] +=
+ loop_scale * spill_cost_for_type(inst->dst.type);
+ if (inst->dst.reladdr || inst->dst.offset >= REG_SIZE)
+ no_spill[inst->dst.nr] = true;
+
+ /* We don't support spills of partial DF writes.
+ *
+ * Our 64-bit spills are implemented with two 32-bit scratch messages,
+ * each one writing that for both SIMD4x2 threads. Ensure that we
+ * are writing data for both threads.
+ */
+ if (type_sz(inst->dst.type) == 8 && inst->exec_size != 8)
+ no_spill[inst->dst.nr] = true;
+
+ /* FROM_DOUBLE opcodes are setup so that they use a dst register
+ * with a size of 2 even if they only produce a single-precision
+ * result (this is so that the opcode can use the larger register to
+ * produce a 64-bit aligned intermediary result as required by the
+ * hardware during the conversion process). This creates a problem for
+ * spilling though, because when we attempt to emit a spill for the
+ * dst we see a 32-bit destination and emit a scratch write that
+ * allocates a single spill register.
+ */
+ if (inst->opcode == VEC4_OPCODE_FROM_DOUBLE)
+ no_spill[inst->dst.nr] = true;
+
+ /* We can't spill registers that mix 32-bit and 64-bit access (that
+ * contain 64-bit data that is operated on via 32-bit instructions)
+ */
+ unsigned type_size = type_sz(inst->dst.type);
+ if (reg_type_size[inst->dst.nr] == 0)
+ reg_type_size[inst->dst.nr] = type_size;
+ else if (reg_type_size[inst->dst.nr] != type_size)
+ no_spill[inst->dst.nr] = true;
+ }
+
+ switch (inst->opcode) {
+
+ case BRW_OPCODE_DO: /* entering a loop: weigh its body 10x */
+ loop_scale *= 10;
+ break;
+
+ case BRW_OPCODE_WHILE: /* leaving the loop: undo the scaling */
+ loop_scale /= 10;
+ break;
+
+ case SHADER_OPCODE_GEN4_SCRATCH_READ:
+ case SHADER_OPCODE_GEN4_SCRATCH_WRITE: /* never spill regs used by scratch messages */
+ for (int i = 0; i < 3; i++) {
+ if (inst->src[i].file == VGRF)
+ no_spill[inst->src[i].nr] = true;
+ }
+ if (inst->dst.file == VGRF)
+ no_spill[inst->dst.nr] = true;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ ralloc_free(reg_type_size);
+}
+/**
+ * Picks the virtual GRF to spill after a failed allocation on graph \p g.
+ *
+ * The costs from evaluate_spill_costs() are handed to the allocator for
+ * every register that is allowed to be spilled; registers flagged no_spill
+ * get no cost set here.
+ *
+ * \return whatever ra_get_best_spill_node() reports; reg_allocate() treats
+ *         -1 as "no register to spill".
+ */
+int
+vec4_visitor::choose_spill_reg(struct ra_graph *g)
+{
+ float spill_costs[this->alloc.count];
+ bool no_spill[this->alloc.count];
+
+ evaluate_spill_costs(spill_costs, no_spill);
+
+ for (unsigned i = 0; i < this->alloc.count; i++) {
+ if (!no_spill[i])
+ ra_set_node_spill_cost(g, i, spill_costs[i]);
+ }
+
+ return ra_get_best_spill_node(g);
+}
+/**
+ * Spills virtual GRF \p spill_reg_nr to scratch space.
+ *
+ * Scratch space for the register (alloc.sizes[spill_reg_nr] units) is
+ * reserved at the current value of last_scratch. Every source that reads
+ * the register is redirected to a temporary that is filled by a scratch
+ * read, unless can_use_scratch_for_source() says the temporary from a
+ * preceding instruction can be reused. Every instruction that writes the
+ * register gets a scratch write. Live intervals are invalidated at the end.
+ *
+ * Only registers of size 1 or 2 may be spilled (asserted).
+ */
+void
+vec4_visitor::spill_reg(int spill_reg_nr)
+{
+ assert(alloc.sizes[spill_reg_nr] == 1 || alloc.sizes[spill_reg_nr] == 2);
+ unsigned int spill_offset = last_scratch;
+ last_scratch += alloc.sizes[spill_reg_nr];
+
+ /* Generate spill/unspill instructions for the objects being spilled. */
+ int scratch_reg = -1; /* -1 until the first unspill or spill is emitted */
+ foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+ for (unsigned int i = 0; i < 3; i++) {
+ if (inst->src[i].file == VGRF && inst->src[i].nr == spill_reg_nr) {
+ if (scratch_reg == -1 ||
+ !can_use_scratch_for_source(inst, i, scratch_reg)) {
+ /* We need to unspill anyway so make sure we read the full vec4
+ * in any case. This way, the cached register can be reused
+ * for consecutive instructions that read different channels of
+ * the same vec4.
+ */
+ scratch_reg = alloc.allocate(alloc.sizes[spill_reg_nr]);
+ src_reg temp = inst->src[i];
+ temp.nr = scratch_reg;
+ temp.offset = 0;
+ temp.swizzle = BRW_SWIZZLE_XYZW;
+ emit_scratch_read(block, inst,
+ dst_reg(temp), inst->src[i], spill_offset);
+ temp.offset = inst->src[i].offset; /* NOTE(review): temp is not read after this store */
+ }
+ assert(scratch_reg != -1);
+ inst->src[i].nr = scratch_reg;
+ }
+ }
+
+ if (inst->dst.file == VGRF && inst->dst.nr == spill_reg_nr) {
+ emit_scratch_write(block, inst, spill_offset);
+ scratch_reg = inst->dst.nr; /* presumably emit_scratch_write() retargeted inst->dst - TODO confirm */
+ }
+ }
+
+ invalidate_live_intervals();
+}
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_surface_builder.cpp b/src/intel/compiler/brw_vec4_surface_builder.cpp
new file mode 100644
index 00000000000..00c94fedca2
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_surface_builder.cpp
@@ -0,0 +1,332 @@
+/*
+ * Copyright © 2013-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4_surface_builder.h"
+
+using namespace brw;
+
+namespace {
+ namespace array_utils {
+ /**
+ * Copy one every \p src_stride logical components of the argument into
+ * one every \p dst_stride logical components of the result.
+ *
+ * \p size is the number of components copied. When both strides are 1
+ * the argument is returned as-is and nothing is emitted; otherwise a new
+ * VGRF large enough for size * dst_stride components is allocated and
+ * one MOV per component is emitted through \p bld.
+ */
+ src_reg
+ emit_stride(const vec4_builder &bld, const src_reg &src, unsigned size,
+ unsigned dst_stride, unsigned src_stride)
+ {
+ if (src_stride == 1 && dst_stride == 1) {
+ return src;
+ } else {
+ const dst_reg dst = bld.vgrf(src.type,
+ DIV_ROUND_UP(size * dst_stride, 4));
+
+ for (unsigned i = 0; i < size; ++i) /* component index / 4 picks the register, % 4 the channel */
+ bld.MOV(writemask(offset(dst, 8, i * dst_stride / 4),
+ 1 << (i * dst_stride % 4)),
+ swizzle(offset(src, 8, i * src_stride / 4),
+ brw_swizzle_for_mask(1 << (i * src_stride % 4))));
+
+ return src_reg(dst);
+ }
+ }
+
+ /**
+ * Convert a VEC4 into an array of registers with the layout expected by
+ * the recipient shared unit. If \p has_simd4x2 is true the argument is
+ * left unmodified in SIMD4x2 form, otherwise it will be rearranged into
+ * a SIMD8 vector.
+ *
+ * \p n is the number of meaningful components of \p src. A default
+ * constructed src_reg is returned when \p src is BAD_FILE or \p n is
+ * zero. Note that even in the SIMD4x2 case the value is first copied
+ * into a temporary so that the unused components can be zeroed.
+ */
+ src_reg
+ emit_insert(const vec4_builder &bld, const src_reg &src,
+ unsigned n, bool has_simd4x2)
+ {
+ if (src.file == BAD_FILE || n == 0) {
+ return src_reg();
+
+ } else {
+ /* Pad unused components with zeroes. */
+ const unsigned mask = (1 << n) - 1;
+ const dst_reg tmp = bld.vgrf(src.type);
+
+ bld.MOV(writemask(tmp, mask), src);
+ if (n < 4)
+ bld.MOV(writemask(tmp, ~mask), brw_imm_d(0));
+
+ return emit_stride(bld, src_reg(tmp), n, has_simd4x2 ? 1 : 4, 1);
+ }
+ }
+
+ /**
+ * Convert an array of registers back into a VEC4 according to the
+ * layout expected from some shared unit. If \p has_simd4x2 is true the
+ * argument is left unmodified in SIMD4x2 form, otherwise it will be
+ * rearranged from SIMD8 form.
+ *
+ * \p n is the number of components to extract. A default constructed
+ * src_reg is returned when \p src is BAD_FILE or \p n is zero.
+ */
+ src_reg
+ emit_extract(const vec4_builder &bld, const src_reg src,
+ unsigned n, bool has_simd4x2)
+ {
+ if (src.file == BAD_FILE || n == 0) {
+ return src_reg();
+
+ } else {
+ return emit_stride(bld, src, n, 1, has_simd4x2 ? 1 : 4);
+ }
+ }
+ }
+}
+
+namespace brw {
+ namespace surface_access {
+ namespace {
+ using namespace array_utils;
+
+ /**
+ * Generate a send opcode for a surface message and return the
+ * result.
+ *
+ * The payload is assembled in a fresh VGRF as: the optional \p header
+ * (skipped when its file is BAD_FILE), then \p addr_sz registers
+ * taken from \p addr, then \p src_sz registers taken from \p src.
+ *
+ * \param op       opcode of the instruction to emit
+ * \param surface  surface index; it is passed through
+ *                 emit_uniformize() before use
+ * \param arg      emitted as an unsigned immediate, third source of
+ *                 the instruction
+ * \param ret_sz   number of registers allocated for, and recorded as
+ *                 written by, the result
+ * \param pred     predicate stored on the emitted instruction
+ * \return the destination register of the send, as a source
+ */
+ src_reg
+ emit_send(const vec4_builder &bld, enum opcode op,
+ const src_reg &header,
+ const src_reg &addr, unsigned addr_sz,
+ const src_reg &src, unsigned src_sz,
+ const src_reg &surface,
+ unsigned arg, unsigned ret_sz,
+ brw_predicate pred = BRW_PREDICATE_NONE)
+ {
+ /* Calculate the total number of components of the payload. */
+ const unsigned header_sz = (header.file == BAD_FILE ? 0 : 1);
+ const unsigned sz = header_sz + addr_sz + src_sz;
+
+ /* Construct the payload. */
+ const dst_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
+ unsigned n = 0; /* next free register within the payload */
+
+ if (header_sz)
+ bld.exec_all().MOV(offset(payload, 8, n++),
+ retype(header, BRW_REGISTER_TYPE_UD));
+
+ for (unsigned i = 0; i < addr_sz; i++)
+ bld.MOV(offset(payload, 8, n++),
+ offset(retype(addr, BRW_REGISTER_TYPE_UD), 8, i));
+
+ for (unsigned i = 0; i < src_sz; i++)
+ bld.MOV(offset(payload, 8, n++),
+ offset(retype(src, BRW_REGISTER_TYPE_UD), 8, i));
+
+ /* Reduce the dynamically uniform surface index to a single
+ * scalar.
+ */
+ const src_reg usurface = bld.emit_uniformize(surface);
+
+ /* Emit the message send instruction. */
+ const dst_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, ret_sz);
+ vec4_instruction *inst =
+ bld.emit(op, dst, src_reg(payload), usurface, brw_imm_ud(arg));
+ inst->mlen = sz;
+ inst->size_written = ret_sz * REG_SIZE;
+ inst->header_size = header_sz;
+ inst->predicate = pred;
+
+ return src_reg(dst);
+ }
+ }
+
+ /**
+ * Emit an untyped surface read opcode. \p dims determines the number
+ * of components of the address and \p size the number of components of
+ * the returned value.
+ *
+ * No header and no data payload are sent. The address is always
+ * inserted in SIMD4x2 form and occupies one register, \p size is
+ * passed as the message argument and one register is returned.
+ */
+ src_reg
+ emit_untyped_read(const vec4_builder &bld,
+ const src_reg &surface, const src_reg &addr,
+ unsigned dims, unsigned size,
+ brw_predicate pred)
+ {
+ return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ, src_reg(),
+ emit_insert(bld, addr, dims, true), 1,
+ src_reg(), 0,
+ surface, size, 1, pred);
+ }
+
+ /**
+ * Emit an untyped surface write opcode. \p dims determines the number
+ * of components of the address and \p size the number of components of
+ * the argument.
+ *
+ * SIMD4x2 form is used on gen8+ and Haswell; there the address and the
+ * data take one register each. Otherwise they take \p dims and
+ * \p size registers respectively. No header is sent, \p size is the
+ * message argument and no result register is requested.
+ */
+ void
+ emit_untyped_write(const vec4_builder &bld, const src_reg &surface,
+ const src_reg &addr, const src_reg &src,
+ unsigned dims, unsigned size,
+ brw_predicate pred)
+ {
+ const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 ||
+ bld.shader->devinfo->is_haswell);
+ emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE, src_reg(),
+ emit_insert(bld, addr, dims, has_simd4x2),
+ has_simd4x2 ? 1 : dims,
+ emit_insert(bld, src, size, has_simd4x2),
+ has_simd4x2 ? 1 : size,
+ surface, size, 0, pred);
+ }
+
+ /**
+ * Emit an untyped surface atomic opcode. \p dims determines the number
+ * of components of the address and \p rsize the number of components of
+ * the returned value (either zero or one).
+ *
+ * \p src0 and \p src1 are the optional operands of the atomic; an
+ * operand whose file is BAD_FILE counts as absent. \p op is passed as
+ * the message argument. No header is sent.
+ */
+ src_reg
+ emit_untyped_atomic(const vec4_builder &bld,
+ const src_reg &surface, const src_reg &addr,
+ const src_reg &src0, const src_reg &src1,
+ unsigned dims, unsigned rsize, unsigned op,
+ brw_predicate pred)
+ {
+ const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 ||
+ bld.shader->devinfo->is_haswell);
+
+ /* Zip the components of both sources, they are represented as the X
+ * and Y components of the same vector.
+ *
+ * size is the number of operands present (0, 1 or 2).
+ */
+ const unsigned size = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
+ const dst_reg srcs = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+ if (size >= 1)
+ bld.MOV(writemask(srcs, WRITEMASK_X), src0);
+ if (size >= 2)
+ bld.MOV(writemask(srcs, WRITEMASK_Y), src1);
+
+ return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC, src_reg(),
+ emit_insert(bld, addr, dims, has_simd4x2),
+ has_simd4x2 ? 1 : dims,
+ emit_insert(bld, src_reg(srcs), size, has_simd4x2),
+ has_simd4x2 && size ? 1 : size, /* zero registers when there are no operands */
+ surface, op, rsize, pred);
+ }
+
+ namespace {
+ /**
+ * Initialize the header present in typed surface messages.
+ *
+ * The header is a fresh UD register that is zeroed with an exec_all
+ * MOV; on gen7 parts other than Haswell its W component is then set
+ * to 0x11. The register is returned as a source.
+ */
+ src_reg
+ emit_typed_message_header(const vec4_builder &bld)
+ {
+ const vec4_builder ubld = bld.exec_all();
+ const dst_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+ ubld.MOV(dst, brw_imm_d(0));
+
+ if (bld.shader->devinfo->gen == 7 &&
+ !bld.shader->devinfo->is_haswell) {
+ /* The sample mask is used on IVB for the SIMD8 messages that
+ * have no SIMD4x2 variant. We only use the two X channels
+ * in that case, mask everything else out.
+ */
+ ubld.MOV(writemask(dst, WRITEMASK_W), brw_imm_d(0x11));
+ }
+
+ return src_reg(dst);
+ }
+ }
+
+ /**
+ * Emit a typed surface read opcode. \p dims determines the number of
+ * components of the address and \p size the number of components of the
+ * returned value.
+ *
+ * A typed message header is always sent. SIMD4x2 form is used on gen8+
+ * and Haswell; otherwise the address takes \p dims registers, the
+ * result takes \p size registers and is converted back to a VEC4 with
+ * emit_extract(). The send is emitted with the default (no) predicate.
+ */
+ src_reg
+ emit_typed_read(const vec4_builder &bld, const src_reg &surface,
+ const src_reg &addr, unsigned dims, unsigned size)
+ {
+ const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 ||
+ bld.shader->devinfo->is_haswell);
+ const src_reg tmp =
+ emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ,
+ emit_typed_message_header(bld),
+ emit_insert(bld, addr, dims, has_simd4x2),
+ has_simd4x2 ? 1 : dims,
+ src_reg(), 0,
+ surface, size,
+ has_simd4x2 ? 1 : size);
+
+ return emit_extract(bld, tmp, size, has_simd4x2);
+ }
+
+ /**
+ * Emit a typed surface write opcode. \p dims determines the number of
+ * components of the address and \p size the number of components of the
+ * argument.
+ *
+ * A typed message header is always sent. SIMD4x2 form is used on gen8+
+ * and Haswell; otherwise the address takes \p dims registers and the
+ * data \p size registers. No result register is requested and the send
+ * is emitted with the default (no) predicate.
+ */
+ void
+ emit_typed_write(const vec4_builder &bld, const src_reg &surface,
+ const src_reg &addr, const src_reg &src,
+ unsigned dims, unsigned size)
+ {
+ const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 ||
+ bld.shader->devinfo->is_haswell);
+ emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE,
+ emit_typed_message_header(bld),
+ emit_insert(bld, addr, dims, has_simd4x2),
+ has_simd4x2 ? 1 : dims,
+ emit_insert(bld, src, size, has_simd4x2),
+ has_simd4x2 ? 1 : size,
+ surface, size, 0);
+ }
+
+ /**
+ * Emit a typed surface atomic opcode. \p dims determines the number of
+ * components of the address and \p rsize the number of components of
+ * the returned value (either zero or one).
+ *
+ * \p src0 and \p src1 are the optional operands of the atomic; an
+ * operand whose file is BAD_FILE counts as absent. \p op is passed as
+ * the message argument. A typed message header is always sent.
+ */
+ src_reg
+ emit_typed_atomic(const vec4_builder &bld,
+ const src_reg &surface, const src_reg &addr,
+ const src_reg &src0, const src_reg &src1,
+ unsigned dims, unsigned rsize, unsigned op,
+ brw_predicate pred)
+ {
+ const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 ||
+ bld.shader->devinfo->is_haswell);
+
+ /* Zip the components of both sources, they are represented as the X
+ * and Y components of the same vector.
+ *
+ * size is the number of operands present (0, 1 or 2).
+ */
+ const unsigned size = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
+ const dst_reg srcs = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+ if (size >= 1)
+ bld.MOV(writemask(srcs, WRITEMASK_X), src0);
+ if (size >= 2)
+ bld.MOV(writemask(srcs, WRITEMASK_Y), src1);
+
+ return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC,
+ emit_typed_message_header(bld),
+ emit_insert(bld, addr, dims, has_simd4x2),
+ has_simd4x2 ? 1 : dims,
+ emit_insert(bld, src_reg(srcs), size, has_simd4x2),
+ has_simd4x2 ? 1 : size, /* NOTE(review): emit_untyped_atomic uses "has_simd4x2 && size ? 1 : size" here; the two differ when size == 0 - confirm which is intended */
+ surface, op, rsize, pred);
+ }
+ }
+}
diff --git a/src/intel/compiler/brw_vec4_surface_builder.h b/src/intel/compiler/brw_vec4_surface_builder.h
new file mode 100644
index 00000000000..6e61c0fce9b
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_surface_builder.h
@@ -0,0 +1,69 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2013-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_VEC4_SURFACE_BUILDER_H
+#define BRW_VEC4_SURFACE_BUILDER_H
+
+#include "brw_vec4_builder.h"
+/*
+ * Surface message helpers for the vec4 back end. Each function emits its
+ * instructions through the given vec4_builder. In all of them \p dims is
+ * the number of components of the address. The first declaration,
+ * emit_untyped_read(), emits an untyped surface read returning \p size
+ * components. Where present, \p pred defaults to BRW_PREDICATE_NONE.
+ */
+namespace brw {
+ namespace surface_access {
+ src_reg
+ emit_untyped_read(const vec4_builder &bld,
+ const src_reg &surface, const src_reg &addr,
+ unsigned dims, unsigned size,
+ brw_predicate pred = BRW_PREDICATE_NONE);
+ /* Untyped surface write of \p size components taken from \p src. */
+ void
+ emit_untyped_write(const vec4_builder &bld, const src_reg &surface,
+ const src_reg &addr, const src_reg &src,
+ unsigned dims, unsigned size,
+ brw_predicate pred = BRW_PREDICATE_NONE);
+ /* Untyped surface atomic \p op with optional operands \p src0 and
+ * \p src1; \p rsize is the number of components returned.
+ */
+ src_reg
+ emit_untyped_atomic(const vec4_builder &bld,
+ const src_reg &surface, const src_reg &addr,
+ const src_reg &src0, const src_reg &src1,
+ unsigned dims, unsigned rsize, unsigned op,
+ brw_predicate pred = BRW_PREDICATE_NONE);
+ /* Typed surface read returning \p size components. */
+ src_reg
+ emit_typed_read(const vec4_builder &bld, const src_reg &surface,
+ const src_reg &addr, unsigned dims, unsigned size);
+ /* Typed surface write of \p size components taken from \p src. */
+ void
+ emit_typed_write(const vec4_builder &bld, const src_reg &surface,
+ const src_reg &addr, const src_reg &src,
+ unsigned dims, unsigned size);
+ /* Typed surface atomic \p op with optional operands \p src0 and
+ * \p src1; \p rsize is the number of components returned.
+ */
+ src_reg
+ emit_typed_atomic(const vec4_builder &bld, const src_reg &surface,
+ const src_reg &addr,
+ const src_reg &src0, const src_reg &src1,
+ unsigned dims, unsigned rsize, unsigned op,
+ brw_predicate pred = BRW_PREDICATE_NONE);
+ }
+}
+
+#endif
diff --git a/src/intel/compiler/brw_vec4_tcs.cpp b/src/intel/compiler/brw_vec4_tcs.cpp
new file mode 100644
index 00000000000..d4a647d029f
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_tcs.cpp
@@ -0,0 +1,516 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tcs.cpp
+ *
+ * Tessellation control shader specific code derived from the vec4_visitor class.
+ */
+
+#include "brw_nir.h"
+#include "brw_vec4_tcs.h"
+#include "brw_fs.h"
+#include "common/gen_debug.h"
+
+namespace brw {
+/**
+ * Constructs the visitor by forwarding to the vec4_visitor constructor.
+ *
+ * The base class receives the texture part of the key (&key->tex) and the
+ * base part of the program data (&prog_data->base). \p input_vue_map and
+ * \p key are stored in the members of the same name; the body is empty.
+ */
+vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler,
+ void *log_data,
+ const struct brw_tcs_prog_key *key,
+ struct brw_tcs_prog_data *prog_data,
+ const nir_shader *nir,
+ void *mem_ctx,
+ int shader_time_index,
+ const struct brw_vue_map *input_vue_map)
+ : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base,
+ nir, mem_ctx, false, shader_time_index),
+ input_vue_map(input_vue_map), key(key)
+{
+}
+
+/* Intentionally does nothing in this visitor: \p instr is ignored. */
+void
+vec4_tcs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{
+}
+/* Always returns NULL; no register is created for any \p location. */
+dst_reg *
+vec4_tcs_visitor::make_reg_for_system_value(int location)
+{
+ return NULL;
+}
+
+/**
+ * Lays out the thread payload and records the first register after it.
+ *
+ * One register is reserved for r0, four more for the input control point
+ * URB handles, and setup_uniforms() is then given the next register number
+ * and returns the one following the uniforms. The result is stored in
+ * first_non_payload_grf.
+ */
+void
+vec4_tcs_visitor::setup_payload()
+{
+ int reg = 0;
+
+ /* The payload always contains important data in r0, which contains
+ * the URB handles that are passed on to the URB write at the end
+ * of the thread.
+ */
+ reg++;
+
+ /* r1.0 - r4.7 may contain the input control point URB handles,
+ * which we use to pull vertex data.
+ */
+ reg += 4;
+
+ /* Push constants may start at r5.0 */
+ reg = setup_uniforms(reg);
+
+ this->first_non_payload_grf = reg;
+}
+
+/**
+ * Emits the code that runs before the shader body.
+ *
+ * invocation_id is allocated and filled with TCS_OPCODE_GET_INSTANCE_ID.
+ * When the number of output vertices is odd, the rest of the program is
+ * wrapped in an IF on "invocation_id < tcs_vertices_out"; the matching
+ * ENDIF is emitted by emit_thread_end().
+ */
+void
+vec4_tcs_visitor::emit_prolog()
+{
+ invocation_id = src_reg(this, glsl_type::uint_type);
+ emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id));
+
+ /* HS threads are dispatched with the dispatch mask set to 0xFF.
+ * If there are an odd number of output vertices, then the final
+ * HS instance dispatched will only have its bottom half doing real
+ * work, and so we need to disable the upper half:
+ */
+ if (nir->info->tess.tcs_vertices_out % 2) {
+ emit(CMP(dst_null_d(), invocation_id,
+ brw_imm_ud(nir->info->tess.tcs_vertices_out),
+ BRW_CONDITIONAL_L));
+
+ /* Matching ENDIF is in emit_thread_end() */
+ emit(IF(BRW_PREDICATE_NORMAL));
+ }
+}
+
+/**
+ * Emits the code that terminates the thread.
+ *
+ * In order: the ENDIF matching the IF from emit_prolog() (odd output vertex
+ * count only); on gen7, an optional barrier followed by the release of the
+ * input control point handles; the shader-time epilogue when
+ * DEBUG_SHADER_TIME is set; and finally TCS_OPCODE_THREAD_END.
+ */
+void
+vec4_tcs_visitor::emit_thread_end()
+{
+ vec4_instruction *inst;
+ current_annotation = "thread end";
+
+ if (nir->info->tess.tcs_vertices_out % 2) { /* same condition as in emit_prolog() */
+ emit(BRW_OPCODE_ENDIF);
+ }
+
+ if (devinfo->gen == 7) {
+ struct brw_tcs_prog_data *tcs_prog_data =
+ (struct brw_tcs_prog_data *) prog_data;
+
+ current_annotation = "release input vertices";
+
+ /* Synchronize all threads, so we know that no one is still
+ * using the input URB handles.
+ */
+ if (tcs_prog_data->instances > 1) {
+ dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+ emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
+ emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
+ }
+
+ /* Make thread 0 (invocations <1, 0>) release pairs of ICP handles.
+ * We want to compare the bottom half of invocation_id with 0, but
+ * use that truth value for the top half as well. Unfortunately,
+ * we don't have stride in the vec4 world, nor UV immediates in
+ * align16, so we need an opcode to get invocation_id<0,4,0>.
+ */
+ set_condmod(BRW_CONDITIONAL_Z,
+ emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(),
+ invocation_id));
+ emit(IF(BRW_PREDICATE_NORMAL));
+ for (unsigned i = 0; i < key->input_vertices; i += 2) {
+ /* If we have an odd number of input vertices, the last will be
+ * unpaired. We don't want to use an interleaved URB write in
+ * that case.
+ */
+ const bool is_unpaired = i == key->input_vertices - 1;
+
+ dst_reg header(this, glsl_type::uvec4_type);
+ emit(TCS_OPCODE_RELEASE_INPUT, header, brw_imm_ud(i),
+ brw_imm_ud(is_unpaired));
+ }
+ emit(BRW_OPCODE_ENDIF);
+ }
+
+ if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
+ emit_shader_time_end();
+
+ inst = emit(TCS_OPCODE_THREAD_END);
+ inst->base_mrf = 14;
+ inst->mlen = 2;
+}
+
+/**
+ * Emits a read of one input slot from the URB into \p dst.
+ *
+ * \param dst              destination; its type is used for the temporary
+ *                         and its writemask is honored by the final MOV
+ * \param vertex_index     vertex to read from, passed to the header setup
+ * \param base_offset      stored as the offset of the URB read
+ * \param first_component  selects the swizzle applied when copying to dst
+ * \param indirect_offset  additional offset passed to the header setup; a
+ *                         BAD_FILE register means there is none
+ */
+void
+vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
+ const src_reg &vertex_index,
+ unsigned base_offset,
+ unsigned first_component,
+ const src_reg &indirect_offset)
+{
+ vec4_instruction *inst;
+ dst_reg temp(this, glsl_type::ivec4_type);
+ temp.type = dst.type;
+
+ /* Set up the message header to reference the proper parts of the URB */
+ dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+ inst = emit(TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index,
+ indirect_offset);
+ inst->force_writemask_all = true;
+
+ /* Read into a temporary, ignoring writemasking. */
+ inst = emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
+ inst->offset = base_offset;
+ inst->mlen = 1;
+ inst->base_mrf = -1;
+
+ /* Copy the temporary to the destination to deal with writemasking.
+ *
+ * Also attempt to deal with gl_PointSize being in the .w component.
+ *
+ * That special case is taken for a direct read at offset 0 and
+ * replicates the W channel; every other read uses the swizzle derived
+ * from first_component.
+ */
+ if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
+ emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW)));
+ } else {
+ src_reg src = src_reg(temp);
+ src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
+ emit(MOV(dst, src));
+ }
+}
+/**
+ * Emits a read of one output slot from the URB into \p dst.
+ *
+ * The header is set up with dst.writemask shifted left by
+ * \p first_component. With first_component == 0 the URB read writes
+ * \p dst directly; otherwise it is redirected to a temporary which is then
+ * copied to \p dst with the swizzle derived from first_component.
+ *
+ * \param base_offset      stored as the offset of the URB read
+ * \param indirect_offset  additional offset passed to the header setup
+ */
+void
+vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
+ unsigned base_offset,
+ unsigned first_component,
+ const src_reg &indirect_offset)
+{
+ vec4_instruction *inst;
+
+ /* Set up the message header to reference the proper parts of the URB */
+ dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+ inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header,
+ brw_imm_ud(dst.writemask << first_component), indirect_offset);
+ inst->force_writemask_all = true;
+
+ vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header));
+ read->offset = base_offset;
+ read->mlen = 1;
+ read->base_mrf = -1;
+
+ if (first_component) {
+ /* Read into a temporary and copy with a swizzle and writemask. */
+ read->dst = retype(dst_reg(this, glsl_type::ivec4_type), dst.type);
+ emit(MOV(dst, swizzle(src_reg(read->dst),
+ BRW_SWZ_COMP_INPUT(first_component))));
+ }
+}
+/**
+ * Emits a write of \p value to the output URB.
+ *
+ * Nothing is emitted when \p writemask is zero. Otherwise a two-register
+ * message is built: the first register is the header produced by
+ * TCS_OPCODE_SET_OUTPUT_URB_OFFSETS from \p writemask and
+ * \p indirect_offset, the second (REG_SIZE bytes further) holds \p value.
+ * \p base_offset is stored as the offset of the URB write instruction.
+ */
+void
+vec4_tcs_visitor::emit_urb_write(const src_reg &value,
+ unsigned writemask,
+ unsigned base_offset,
+ const src_reg &indirect_offset)
+{
+ if (writemask == 0)
+ return;
+
+ src_reg message(this, glsl_type::uvec4_type, 2);
+ vec4_instruction *inst;
+
+ inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message),
+ brw_imm_ud(writemask), indirect_offset);
+ inst->force_writemask_all = true;
+ inst = emit(MOV(byte_offset(dst_reg(retype(message, value.type)), REG_SIZE),
+ value));
+ inst->force_writemask_all = true;
+
+ inst = emit(TCS_OPCODE_URB_WRITE, dst_null_f(), message);
+ inst->offset = base_offset;
+ inst->mlen = 2;
+ inst->base_mrf = -1;
+}
+
+void
+vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{ /* Emit vec4 IR for the NIR intrinsics that need tessellation-control
+ * specific handling: invocation/primitive ID, patch vertex count, URB
+ * input and output loads, URB output stores, and barriers.  Any other
+ * intrinsic is forwarded to vec4_visitor::nir_emit_intrinsic().
+ */
+ switch (instr->intrinsic) {
+ case nir_intrinsic_load_invocation_id:
+ emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD),
+ invocation_id));
+ break;
+ case nir_intrinsic_load_primitive_id:
+ emit(TCS_OPCODE_GET_PRIMITIVE_ID,
+ get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
+ break;
+ case nir_intrinsic_load_patch_vertices_in: /* taken from the program key as an immediate */
+ emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D),
+ brw_imm_d(key->input_vertices)));
+ break;
+ case nir_intrinsic_load_per_vertex_input: {
+ src_reg indirect_offset = get_indirect_offset(instr);
+ unsigned imm_offset = instr->const_index[0]; /* passed on as the read's base_offset */
+
+ nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]);
+ src_reg vertex_index = /* immediate when the vertex index is constant, otherwise the NIR source */
+ vertex_const ? src_reg(brw_imm_ud(vertex_const->u32[0]))
+ : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
+
+ unsigned first_component = nir_intrinsic_component(instr);
+ if (nir_dest_bit_size(instr->dest) == 64) {
+ /* We need to emit up to two 32-bit URB reads, then shuffle
+ * the result into a temporary, then move to the destination
+ * honoring the writemask
+ *
+ * We don't need to divide first_component by 2 because
+ * emit_input_urb_read takes a 32-bit type.
+ */
+ dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
+ dst_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
+ emit_input_urb_read(tmp_d, vertex_index, imm_offset,
+ first_component, indirect_offset);
+ if (instr->num_components > 2) { /* more than two doubles: read the following slot into the second register of tmp */
+ emit_input_urb_read(byte_offset(tmp_d, REG_SIZE), vertex_index,
+ imm_offset + 1, 0, indirect_offset);
+ }
+
+ src_reg tmp_src = retype(src_reg(tmp_d), BRW_REGISTER_TYPE_DF);
+ dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
+ shuffle_64bit_data(shuffled, tmp_src, false);
+
+ dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF);
+ dst.writemask = brw_writemask_for_size(instr->num_components);
+ emit(MOV(dst, src_reg(shuffled)));
+ } else {
+ dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+ dst.writemask = brw_writemask_for_size(instr->num_components);
+ emit_input_urb_read(dst, vertex_index, imm_offset,
+ first_component, indirect_offset);
+ }
+ break;
+ }
+ case nir_intrinsic_load_input:
+ unreachable("nir_lower_io should use load_per_vertex_input intrinsics");
+ break;
+ case nir_intrinsic_load_output:
+ case nir_intrinsic_load_per_vertex_output: {
+ src_reg indirect_offset = get_indirect_offset(instr);
+ unsigned imm_offset = instr->const_index[0];
+
+ dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+ dst.writemask = brw_writemask_for_size(instr->num_components);
+
+ emit_output_urb_read(dst, imm_offset, nir_intrinsic_component(instr),
+ indirect_offset);
+ break;
+ }
+ case nir_intrinsic_store_output:
+ case nir_intrinsic_store_per_vertex_output: {
+ src_reg value = get_nir_src(instr->src[0]);
+ unsigned mask = instr->const_index[1]; /* ends up as the URB write mask (after the shifts below) */
+ unsigned swiz = BRW_SWIZZLE_XYZW;
+
+ src_reg indirect_offset = get_indirect_offset(instr);
+ unsigned imm_offset = instr->const_index[0];
+
+ unsigned first_component = nir_intrinsic_component(instr);
+ if (first_component) {
+ if (nir_src_bit_size(instr->src[0]) == 64)
+ first_component /= 2; /* presumably converts 32-bit component units to a 64-bit channel index -- TODO confirm */
+ assert(swiz == BRW_SWIZZLE_XYZW);
+ swiz = BRW_SWZ_COMP_OUTPUT(first_component);
+ mask = mask << first_component;
+ }
+
+ if (nir_src_bit_size(instr->src[0]) == 64) {
+ /* For 64-bit data we need to shuffle the data before we write and
+ * emit two messages. Also, since each channel is twice as large we
+ * need to fix the writemask in each 32-bit message to account for it.
+ */
+ value = swizzle(retype(value, BRW_REGISTER_TYPE_DF), swiz);
+ dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
+ shuffle_64bit_data(shuffled, value, true);
+ src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
+
+ for (int n = 0; n < 2; n++) { /* each pass consumes the low two mask bits, one register of shuffled data and one offset slot */
+ unsigned fixed_mask = 0;
+ if (mask & WRITEMASK_X)
+ fixed_mask |= WRITEMASK_XY;
+ if (mask & WRITEMASK_Y)
+ fixed_mask |= WRITEMASK_ZW;
+ emit_urb_write(shuffled_float, fixed_mask,
+ imm_offset, indirect_offset);
+
+ shuffled_float = byte_offset(shuffled_float, REG_SIZE);
+ mask >>= 2;
+ imm_offset++;
+ }
+ } else {
+ emit_urb_write(swizzle(value, swiz), mask,
+ imm_offset, indirect_offset);
+ }
+ break;
+ }
+
+ case nir_intrinsic_barrier: {
+ dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+ emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
+ emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
+ break;
+ }
+
+ default: /* not TCS-specific: let the generic vec4 visitor handle it */
+ vec4_visitor::nir_emit_intrinsic(instr);
+ }
+}
+
+
+/**
+ * Compile a tessellation control shader.
+ *
+ * Works on a clone of \p src_shader: applies the key's output masks, computes
+ * the input and output VUE maps, runs the TCS-specific NIR lowering, sizes
+ * the URB entry and then generates code with either the scalar (SIMD8) or
+ * the vec4 backend, depending on compiler->scalar_stage.
+ *
+ * \param final_assembly_size  forwarded to the code generator, which reports
+ *                             the assembly size through it
+ * \param error_str            if non-NULL, receives a message allocated out
+ *                             of \p mem_ctx on every failure path
+ * \return the generated assembly, or NULL on failure
+ */
+extern "C" const unsigned *
+brw_compile_tcs(const struct brw_compiler *compiler,
+                void *log_data,
+                void *mem_ctx,
+                const struct brw_tcs_prog_key *key,
+                struct brw_tcs_prog_data *prog_data,
+                const nir_shader *src_shader,
+                int shader_time_index,
+                unsigned *final_assembly_size,
+                char **error_str)
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+   struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
+   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
+
+   nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
+   nir->info->outputs_written = key->outputs_written;
+   nir->info->patch_outputs_written = key->patch_outputs_written;
+
+   struct brw_vue_map input_vue_map;
+   brw_compute_vue_map(devinfo, &input_vue_map, nir->info->inputs_read,
+                       nir->info->separate_shader);
+   brw_compute_tess_vue_map(&vue_prog_data->vue_map,
+                            nir->info->outputs_written,
+                            nir->info->patch_outputs_written);
+
+   nir = brw_nir_apply_sampler_key(nir, compiler, &key->tex, is_scalar);
+   brw_nir_lower_vue_inputs(nir, is_scalar, &input_vue_map);
+   brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map,
+                             key->tes_primitive_mode);
+   if (key->quads_workaround)
+      brw_nir_apply_tcs_quads_workaround(nir);
+
+   nir = brw_postprocess_nir(nir, compiler, is_scalar);
+
+   /* One instance covers 8 output vertices in scalar mode, 2 in vec4 mode. */
+   if (is_scalar)
+      prog_data->instances = DIV_ROUND_UP(nir->info->tess.tcs_vertices_out, 8);
+   else
+      prog_data->instances = DIV_ROUND_UP(nir->info->tess.tcs_vertices_out, 2);
+
+   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
+    * That divides up as follows:
+    *
+    *     32 bytes for the patch header (tessellation factors)
+    *    480 bytes for per-patch varyings (a varying component is 4 bytes and
+    *              gl_MaxTessPatchComponents = 120)
+    *  16384 bytes for per-vertex varyings (a varying component is 4 bytes,
+    *              gl_MaxPatchVertices = 32 and
+    *              gl_MaxTessControlOutputComponents = 128)
+    *
+    *  15808 bytes left for varying packing overhead
+    */
+   const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots;
+   const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots;
+   unsigned output_size_bytes = 0;
+   /* Note that the patch header is counted in num_per_patch_slots. */
+   output_size_bytes += num_per_patch_slots * 16;
+   output_size_bytes += nir->info->tess.tcs_vertices_out *
+                        num_per_vertex_slots * 16;
+
+   assert(output_size_bytes >= 1);
+   if (output_size_bytes > GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES) {
+      /* Report the reason like the other failure paths do, so a caller that
+       * prints *error_str after a NULL return gets a message.
+       */
+      if (error_str) {
+         *error_str = ralloc_asprintf(mem_ctx,
+                                      "TCS outputs need %u bytes, which "
+                                      "exceeds the maximum HS URB entry size",
+                                      output_size_bytes);
+      }
+      return NULL;
+   }
+
+   /* URB entry sizes are stored as a multiple of 64 bytes. */
+   vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+
+   /* HS does not use the usual payload pushing from URB to GRFs,
+    * because we don't have enough registers for a full-size payload, and
+    * the hardware is broken on Haswell anyway.
+    */
+   vue_prog_data->urb_read_length = 0;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
+      fprintf(stderr, "TCS Input ");
+      brw_print_vue_map(stderr, &input_vue_map);
+      fprintf(stderr, "TCS Output ");
+      brw_print_vue_map(stderr, &vue_prog_data->vue_map);
+   }
+
+   if (is_scalar) {
+      fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
+                   &prog_data->base.base, NULL, nir, 8,
+                   shader_time_index, &input_vue_map);
+      if (!v.run_tcs_single_patch()) {
+         if (error_str)
+            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+         return NULL;
+      }
+
+      prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
+      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+
+      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
+                     &prog_data->base.base, v.promoted_constants, false,
+                     MESA_SHADER_TESS_CTRL);
+      if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
+         g.enable_debug(ralloc_asprintf(mem_ctx,
+                                        "%s tessellation control shader %s",
+                                        nir->info->label ? nir->info->label
+                                                         : "unnamed",
+                                        nir->info->name));
+      }
+
+      g.generate_code(v.cfg, 8);
+
+      return g.get_assembly(final_assembly_size);
+   } else {
+      vec4_tcs_visitor v(compiler, log_data, key, prog_data,
+                         nir, mem_ctx, shader_time_index, &input_vue_map);
+      if (!v.run()) {
+         if (error_str)
+            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+         return NULL;
+      }
+
+      if (unlikely(INTEL_DEBUG & DEBUG_TCS))
+         v.dump_instructions();
+
+
+      return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
+                                        &prog_data->base, v.cfg,
+                                        final_assembly_size);
+   }
+}
+
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_tcs.h b/src/intel/compiler/brw_vec4_tcs.h
new file mode 100644
index 00000000000..030eb5e6603
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_tcs.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tcs.h
+ *
+ * The vec4-mode tessellation control shader compiler backend.
+ */
+
+#ifndef BRW_VEC4_TCS_H
+#define BRW_VEC4_TCS_H
+
+#include "brw_compiler.h"
+#include "brw_vec4.h"
+
+#ifdef __cplusplus
+namespace brw {
+
+class vec4_tcs_visitor : public vec4_visitor
+{ /* vec4_visitor specialization for tessellation control shaders.  It
+ * overrides the payload/prolog/thread-end hooks and the intrinsic emitter,
+ * and adds helpers that read and write the URB directly.
+ */
+public:
+ vec4_tcs_visitor(const struct brw_compiler *compiler,
+ void *log_data,
+ const struct brw_tcs_prog_key *key,
+ struct brw_tcs_prog_data *prog_data,
+ const nir_shader *nir,
+ void *mem_ctx,
+ int shader_time_index,
+ const struct brw_vue_map *input_vue_map);
+
+protected:
+ virtual dst_reg *make_reg_for_system_value(int location);
+ virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
+ virtual void setup_payload();
+ virtual void emit_prolog();
+ virtual void emit_thread_end();
+
+ virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
+
+ void emit_input_urb_read(const dst_reg &dst,
+ const src_reg &vertex_index,
+ unsigned base_offset,
+ unsigned first_component,
+ const src_reg &indirect_offset);
+ void emit_output_urb_read(const dst_reg &dst,
+ unsigned base_offset,
+ unsigned first_component,
+ const src_reg &indirect_offset);
+
+ void emit_urb_write(const src_reg &value, unsigned writemask,
+ unsigned base_offset, const src_reg &indirect_offset);
+
+ /* We do not use the normal end-of-shader URB write mechanism -- but every
+ * vec4 stage must provide implementations of these, so they are empty
+ * stubs here:
+ */
+ virtual void emit_urb_write_header(int mrf) {}
+ virtual vec4_instruction *emit_urb_write_opcode(bool complete) { return NULL; }
+
+ const struct brw_vue_map *input_vue_map;
+
+ const struct brw_tcs_prog_key *key; /* program key; nir_emit_intrinsic() reads input_vertices from it */
+ src_reg invocation_id; /* source copied out for nir_intrinsic_load_invocation_id */
+};
+
+} /* namespace brw */
+#endif /* __cplusplus */
+
+#endif /* BRW_VEC4_TCS_H */
diff --git a/src/intel/compiler/brw_vec4_tes.cpp b/src/intel/compiler/brw_vec4_tes.cpp
new file mode 100644
index 00000000000..bcf9a87eb01
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_tes.cpp
@@ -0,0 +1,296 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tes.cpp
+ *
+ * Tessellation evaluation shader specific code derived from the vec4_visitor class.
+ */
+
+#include "brw_vec4_tes.h"
+#include "brw_cfg.h"
+#include "common/gen_debug.h"
+
+namespace brw {
+
+vec4_tes_visitor::vec4_tes_visitor(const struct brw_compiler *compiler,
+ void *log_data,
+ const struct brw_tes_prog_key *key,
+ struct brw_tes_prog_data *prog_data,
+ const nir_shader *shader,
+ void *mem_ctx,
+ int shader_time_index)
+ : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base,
+ shader, mem_ctx, false, shader_time_index)
+{ /* Nothing TES-specific to initialize: the sampler key (key->tex) and the
+ * VUE part of prog_data are handed to the vec4_visitor base.
+ * NOTE(review): the literal `false` is a base-class flag whose meaning is
+ * not visible here -- confirm against vec4_visitor's constructor.
+ */
+}
+
+
+dst_reg *
+vec4_tes_visitor::make_reg_for_system_value(int location)
+{
+ return NULL; /* this stage never allocates a register here; location is ignored */
+}
+
+/**
+ * System-value setup hook for the TES.
+ *
+ * The tessellation level loads need no setup here; every other intrinsic is
+ * passed to the vec4_visitor implementation.
+ */
+void
+vec4_tes_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{
+   const bool is_tess_level_load =
+      instr->intrinsic == nir_intrinsic_load_tess_level_outer ||
+      instr->intrinsic == nir_intrinsic_load_tess_level_inner;
+
+   if (!is_tess_level_load)
+      vec4_visitor::nir_setup_system_value_intrinsic(instr);
+}
+
+
+/**
+ * Lay out the thread payload and rewrite every ATTR source to the fixed GRF
+ * that holds it.
+ *
+ * The layout is: two fixed payload registers, then the push constants placed
+ * by setup_uniforms(), then the pushed inputs (two vec4 slots per register).
+ * first_non_payload_grf is set to the first register after all of that.
+ */
+void
+vec4_tes_visitor::setup_payload()
+{
+   int reg = 0;
+
+   /* The payload always contains important data in r0 and r1, which contains
+    * the URB handles that are passed on to the URB write at the end
+    * of the thread.
+    */
+   reg += 2;
+
+   reg = setup_uniforms(reg);
+
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (int i = 0; i < 3; i++) {
+         if (inst->src[i].file != ATTR)
+            continue;
+
+         bool is_64bit = type_sz(inst->src[i].type) == 8;
+
+         /* Two vec4 slots fit in one register: pick the register and the
+          * half within it.
+          */
+         unsigned slot = inst->src[i].nr + inst->src[i].offset / 16;
+         struct brw_reg grf = brw_vec4_grf(reg + slot / 2, 4 * (slot % 2));
+         grf = stride(grf, 0, is_64bit ? 2 : 4, 1);
+         grf.swizzle = inst->src[i].swizzle;
+         grf.type = inst->src[i].type;
+         grf.abs = inst->src[i].abs;
+         grf.negate = inst->src[i].negate;
+
+         /* For 64-bit attributes we can end up with components XY in the
+          * second half of a register and components ZW in the first half
+          * of the next.  Fix it up here.
+          */
+         if (is_64bit && grf.subnr > 0) {
+            const unsigned used_channels = brw_mask_for_swizzle(grf.swizzle);
+
+            /* We can't do swizzles that mix XY and ZW channels in this case.
+             * Such cases should have been handled by the scalarization pass.
+             *
+             * Compare the two halves as booleans: XOR-ing the raw masked
+             * values is non-zero for every non-empty mask, so it could never
+             * catch a mixed swizzle.
+             */
+            assert(((used_channels & 0x3) != 0) !=
+                   ((used_channels & 0xc) != 0));
+            if (used_channels & 0xc) {
+               grf.subnr = 0;
+               grf.nr++;
+               grf.swizzle -= BRW_SWIZZLE_ZZZZ;
+            }
+         }
+
+         inst->src[i] = grf;
+      }
+   }
+
+   reg += 8 * prog_data->urb_read_length;
+
+   this->first_non_payload_grf = reg;
+}
+
+
+void
+vec4_tes_visitor::emit_prolog()
+{ /* Build the URB input-read header once, at the top of the program;
+ * nir_emit_intrinsic() uses input_read_header for its URB reads.
+ */
+ input_read_header = src_reg(this, glsl_type::uvec4_type);
+ emit(TES_OPCODE_CREATE_INPUT_READ_HEADER, dst_reg(input_read_header));
+
+ this->current_annotation = NULL;
+}
+
+
+void
+vec4_tes_visitor::emit_urb_write_header(int mrf)
+{
+ /* No need to do anything for DS; an implied write to this MRF will be
+ * performed by VS_OPCODE_URB_WRITE.  The parameter is intentionally
+ * unused.
+ */
+ (void) mrf;
+}
+
+
+/**
+ * Emit the URB write for this stage.
+ *
+ * \param complete  true for the last write; the write is then flagged
+ *                  EOT_COMPLETE and, when shader-time debugging is enabled,
+ *                  the shader-time epilogue is emitted first
+ * \return the emitted VS_OPCODE_URB_WRITE instruction
+ */
+vec4_instruction *
+vec4_tes_visitor::emit_urb_write_opcode(bool complete)
+{
+   /* For DS, the URB writes end the thread. */
+   if (complete && (INTEL_DEBUG & DEBUG_SHADER_TIME))
+      emit_shader_time_end();
+
+   vec4_instruction *const write = emit(VS_OPCODE_URB_WRITE);
+   if (complete)
+      write->urb_write_flags = BRW_URB_WRITE_EOT_COMPLETE;
+   else
+      write->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
+
+   return write;
+}
+
+void
+vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{ /* Emit vec4 IR for the NIR intrinsics that need tessellation-evaluation
+ * specific handling: tess coord, tess levels, primitive ID and input
+ * loads.  Inputs at a small constant offset are read from the pushed
+ * payload (ATTR file); all others use URB read messages.  Any other
+ * intrinsic is forwarded to vec4_visitor::nir_emit_intrinsic().
+ */
+ const struct brw_tes_prog_data *tes_prog_data =
+ (const struct brw_tes_prog_data *) prog_data;
+
+ switch (instr->intrinsic) {
+ case nir_intrinsic_load_tess_coord:
+ /* gl_TessCoord is part of the payload in g1 channels 0-2 and 4-6. */
+ emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+ src_reg(brw_vec8_grf(1, 0))));
+ break;
+ case nir_intrinsic_load_tess_level_outer: /* read from ATTR slot 1; the swizzle depends on the domain */
+ if (tes_prog_data->domain == BRW_TESS_DOMAIN_ISOLINE) {
+ emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+ swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
+ BRW_SWIZZLE_ZWZW)));
+ } else {
+ emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+ swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
+ BRW_SWIZZLE_WZYX)));
+ }
+ break;
+ case nir_intrinsic_load_tess_level_inner: /* quads read ATTR slot 0 reversed; other domains read slot 1 as a float */
+ if (tes_prog_data->domain == BRW_TESS_DOMAIN_QUAD) {
+ emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+ swizzle(src_reg(ATTR, 0, glsl_type::vec4_type),
+ BRW_SWIZZLE_WZYX)));
+ } else {
+ emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+ src_reg(ATTR, 1, glsl_type::float_type)));
+ }
+ break;
+ case nir_intrinsic_load_primitive_id:
+ emit(TES_OPCODE_GET_PRIMITIVE_ID,
+ get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
+ break;
+
+ case nir_intrinsic_load_input:
+ case nir_intrinsic_load_per_vertex_input: {
+ src_reg indirect_offset = get_indirect_offset(instr);
+ unsigned imm_offset = instr->const_index[0];
+ src_reg header = input_read_header;
+ bool is_64bit = nir_dest_bit_size(instr->dest) == 64;
+ unsigned first_component = nir_intrinsic_component(instr);
+ if (is_64bit)
+ first_component /= 2; /* presumably converts 32-bit component units to a 64-bit channel index -- TODO confirm */
+
+ if (indirect_offset.file != BAD_FILE) {
+ header = src_reg(this, glsl_type::uvec4_type);
+ emit(TES_OPCODE_ADD_INDIRECT_URB_OFFSET, dst_reg(header),
+ input_read_header, indirect_offset);
+ } else {
+ /* Arbitrarily only push up to 24 vec4 slots worth of data,
+ * which is 12 registers (since each holds 2 vec4 slots).
+ */
+ const unsigned max_push_slots = 24;
+ if (imm_offset < max_push_slots) {
+ const glsl_type *src_glsl_type =
+ is_64bit ? glsl_type::dvec4_type : glsl_type::ivec4_type;
+ src_reg src = src_reg(ATTR, imm_offset, src_glsl_type);
+ src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
+
+ const brw_reg_type dst_reg_type =
+ is_64bit ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_D;
+ emit(MOV(get_nir_dest(instr->dest, dst_reg_type), src));
+
+ prog_data->urb_read_length = /* grow the pushed length to cover this slot (two slots for 64-bit), two slots per register */
+ MAX2(prog_data->urb_read_length,
+ DIV_ROUND_UP(imm_offset + (is_64bit ? 2 : 1), 2));
+ break; /* served from the pushed payload; skip the URB read below */
+ }
+ }
+
+ if (!is_64bit) {
+ dst_reg temp(this, glsl_type::ivec4_type);
+ vec4_instruction *read =
+ emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
+ read->offset = imm_offset;
+ read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
+
+ src_reg src = src_reg(temp);
+ src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
+
+ /* Copy to target. We might end up with some funky writemasks landing
+ * in here, but we really don't want them in the above pseudo-ops.
+ */
+ dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+ dst.writemask = brw_writemask_for_size(instr->num_components);
+ emit(MOV(dst, src));
+ } else {
+ /* For 64-bit we need to load twice as many 32-bit components, and for
+ * dvec3/4 we need to emit 2 URB Read messages
+ */
+ dst_reg temp(this, glsl_type::dvec4_type);
+ dst_reg temp_d = retype(temp, BRW_REGISTER_TYPE_D);
+
+ vec4_instruction *read =
+ emit(VEC4_OPCODE_URB_READ, temp_d, src_reg(header));
+ read->offset = imm_offset;
+ read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
+
+ if (instr->num_components > 2) { /* second message fills the second register of temp from the next slot */
+ read = emit(VEC4_OPCODE_URB_READ, byte_offset(temp_d, REG_SIZE),
+ src_reg(header));
+ read->offset = imm_offset + 1;
+ read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
+ }
+
+ src_reg temp_as_src = src_reg(temp);
+ temp_as_src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
+
+ dst_reg shuffled(this, glsl_type::dvec4_type);
+ shuffle_64bit_data(shuffled, temp_as_src, false);
+
+ dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF);
+ dst.writemask = brw_writemask_for_size(instr->num_components);
+ emit(MOV(dst, src_reg(shuffled)));
+ }
+ break;
+ }
+ default: /* not TES-specific: let the generic vec4 visitor handle it */
+ vec4_visitor::nir_emit_intrinsic(instr);
+ }
+}
+
+
+void
+vec4_tes_visitor::emit_thread_end()
+{
+ /* For DS, we always end the thread by emitting a single vertex.
+ * emit_urb_write_opcode() will take care of setting the eot flag on the
+ * SEND instruction (it selects BRW_URB_WRITE_EOT_COMPLETE when called
+ * with complete == true).
+ */
+ emit_vertex();
+}
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_tes.h b/src/intel/compiler/brw_vec4_tes.h
new file mode 100644
index 00000000000..31a28f35974
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_tes.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tes.h
+ *
+ * The vec4 mode tessellation evaluation shader compiler backend.
+ */
+
+#ifndef BRW_VEC4_TES_H
+#define BRW_VEC4_TES_H
+
+#include "brw_vec4.h"
+
+#ifdef __cplusplus
+namespace brw {
+
+class vec4_tes_visitor : public vec4_visitor
+{ /* vec4_visitor specialization for tessellation evaluation shaders.  It
+ * overrides the system-value, payload, prolog, thread-end and URB-write
+ * hooks of the base class.
+ */
+public:
+ vec4_tes_visitor(const struct brw_compiler *compiler,
+ void *log_data,
+ const struct brw_tes_prog_key *key,
+ struct brw_tes_prog_data *prog_data,
+ const nir_shader *nir,
+ void *mem_ctx,
+ int shader_time_index);
+
+protected:
+ virtual dst_reg *make_reg_for_system_value(int location);
+ virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
+ virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
+
+ virtual void setup_payload();
+ virtual void emit_prolog();
+ virtual void emit_thread_end();
+
+ virtual void emit_urb_write_header(int mrf);
+ virtual vec4_instruction *emit_urb_write_opcode(bool complete);
+
+private:
+ src_reg input_read_header; /* URB read header created by emit_prolog() and used by nir_emit_intrinsic() */
+};
+
+} /* namespace brw */
+#endif /* __cplusplus */
+
+#endif /* BRW_VEC4_TES_H */
diff --git a/src/intel/compiler/brw_vec4_visitor.cpp b/src/intel/compiler/brw_vec4_visitor.cpp
new file mode 100644
index 00000000000..262a084ca87
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_visitor.cpp
@@ -0,0 +1,1917 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+#include "brw_cfg.h"
+#include "brw_eu.h"
+
+namespace brw {
+
+vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
+ const src_reg &src0, const src_reg &src1,
+ const src_reg &src2)
+{ /* Store the opcode, destination and three sources, and reset every
+ * other field to its neutral default (no predication, no conditional
+ * modifier, no message payload, no annotation).
+ */
+ this->opcode = opcode;
+ this->dst = dst;
+ this->src[0] = src0;
+ this->src[1] = src1;
+ this->src[2] = src2;
+ this->saturate = false;
+ this->force_writemask_all = false;
+ this->no_dd_clear = false;
+ this->no_dd_check = false;
+ this->writes_accumulator = false;
+ this->conditional_mod = BRW_CONDITIONAL_NONE;
+ this->predicate = BRW_PREDICATE_NONE;
+ this->predicate_inverse = false;
+ this->target = 0;
+ this->shadow_compare = false;
+ this->ir = NULL;
+ this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
+ this->header_size = 0;
+ this->flag_subreg = 0;
+ this->mlen = 0;
+ this->base_mrf = 0;
+ this->offset = 0;
+ this->exec_size = 8;
+ this->group = 0;
+ this->size_written = (dst.file == BAD_FILE ? /* nothing written without a destination; else exec_size elements of the dst type */
+ 0 : this->exec_size * type_sz(dst.type));
+ this->annotation = NULL;
+}
+
+vec4_instruction *
+vec4_visitor::emit(vec4_instruction *inst)
+{ /* Tag inst with the visitor's current IR pointer and annotation, append
+ * it to the end of the instruction list, and return it.
+ */
+ inst->ir = this->base_ir;
+ inst->annotation = this->current_annotation;
+
+ this->instructions.push_tail(inst);
+
+ return inst;
+}
+
+vec4_instruction *
+vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
+ vec4_instruction *new_inst)
+{ /* Insert new_inst immediately before inst in block, after copying inst's
+ * ir and annotation onto it.
+ */
+ new_inst->ir = inst->ir;
+ new_inst->annotation = inst->annotation;
+
+ inst->insert_before(block, new_inst);
+
+ return inst; /* NOTE(review): returns the anchor inst rather than new_inst, unlike emit() which returns what it emitted -- confirm this is intended */
+}
+
+/* Convenience overloads of emit(): each one allocates a vec4_instruction out
+ * of mem_ctx from the opcode and the operands given, then appends it through
+ * emit(vec4_instruction *) and returns it.
+ */
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+                   const src_reg &src1, const src_reg &src2)
+{
+   vec4_instruction *const inst =
+      new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2);
+   return emit(inst);
+}
+
+
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+                   const src_reg &src1)
+{
+   vec4_instruction *const inst =
+      new(mem_ctx) vec4_instruction(opcode, dst, src0, src1);
+   return emit(inst);
+}
+
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
+{
+   vec4_instruction *const inst =
+      new(mem_ctx) vec4_instruction(opcode, dst, src0);
+   return emit(inst);
+}
+
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
+{
+   vec4_instruction *const inst = new(mem_ctx) vec4_instruction(opcode, dst);
+   return emit(inst);
+}
+
+/* No destination at all: a default-constructed dst_reg is used. */
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode)
+{
+   vec4_instruction *const inst =
+      new(mem_ctx) vec4_instruction(opcode, dst_reg());
+   return emit(inst);
+}
+
+#define ALU1(op) \
+ vec4_instruction * \
+ vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
+ { \
+ return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
+ } /* ALU1: defines vec4_visitor::op() building a one-source BRW_OPCODE_op instruction; it is allocated but not emitted */
+
+#define ALU2(op) \
+ vec4_instruction * \
+ vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
+ const src_reg &src1) \
+ { \
+ return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
+ src0, src1); \
+ } /* ALU2: same as ALU1 with two sources */
+
+#define ALU2_ACC(op) \
+ vec4_instruction * \
+ vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
+ const src_reg &src1) \
+ { \
+ vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
+ BRW_OPCODE_##op, dst, src0, src1); \
+ inst->writes_accumulator = true; \
+ return inst; \
+ } /* ALU2_ACC: two sources, and the instruction is marked as writing the accumulator */
+
+#define ALU3(op) \
+ vec4_instruction * \
+ vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
+ const src_reg &src1, const src_reg &src2) \
+ { \
+ assert(devinfo->gen >= 6); \
+ return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
+ src0, src1, src2); \
+ } /* ALU3: three sources; asserts gen >= 6 */
+
+ALU1(NOT) /* instantiate one factory method per opcode */
+ALU1(MOV)
+ALU1(FRC)
+ALU1(RNDD)
+ALU1(RNDE)
+ALU1(RNDZ)
+ALU1(F32TO16)
+ALU1(F16TO32)
+ALU2(ADD)
+ALU2(MUL)
+ALU2_ACC(MACH)
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(DP3)
+ALU2(DP4)
+ALU2(DPH)
+ALU2(SHL)
+ALU2(SHR)
+ALU2(ASR)
+ALU3(LRP)
+ALU1(BFREV)
+ALU3(BFE)
+ALU2(BFI1)
+ALU3(BFI2)
+ALU1(FBH)
+ALU1(FBL)
+ALU1(CBIT)
+ALU3(MAD)
+ALU2_ACC(ADDC)
+ALU2_ACC(SUBB)
+ALU2(MAC)
+ALU1(DIM)
+
+/**
+ * Gen4 predicated IF.
+ *
+ * Builds (without emitting) an IF instruction controlled by \p predicate.
+ */
+vec4_instruction *
+vec4_visitor::IF(enum brw_predicate predicate)
+{
+   vec4_instruction *const branch =
+      new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
+   branch->predicate = predicate;
+   return branch;
+}
+
+/**
+ * Gen6 IF with embedded comparison.
+ *
+ * Builds (without emitting) an IF that compares \p src0 with \p src1 using
+ * \p condition.  Only valid on gen6.
+ */
+vec4_instruction *
+vec4_visitor::IF(src_reg src0, src_reg src1,
+                 enum brw_conditional_mod condition)
+{
+   assert(devinfo->gen == 6);
+
+   resolve_ud_negate(&src0);
+   resolve_ud_negate(&src1);
+
+   vec4_instruction *const branch =
+      new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(), src0, src1);
+   branch->conditional_mod = condition;
+   return branch;
+}
+
+/**
+ * CMP: Sets the low bit of the destination channels with the result
+ * of the comparison, while the upper bits are undefined, and updates
+ * the flag register with the packed 16 bits of the result.
+ *
+ * The instruction is built but not emitted.
+ */
+vec4_instruction *
+vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
+                  enum brw_conditional_mod condition)
+{
+   /* Consider
+    *
+    *    CMP null<d> src0<f> src1<f>
+    *
+    * Original gen4 converts to the destination type before comparing, which
+    * produces garbage for floating point comparisons.  Newer generations
+    * ignore the destination type, so making it match src0 is always safe
+    * and additionally lets the instruction be compacted.
+    */
+   dst.type = src0.type;
+
+   resolve_ud_negate(&src0);
+   resolve_ud_negate(&src1);
+
+   vec4_instruction *const cmp =
+      new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
+   cmp->conditional_mod = condition;
+   return cmp;
+}
+
+/**
+ * Build (without emitting) a gen4-style scratch read of \p index into
+ * \p dst.  The message is two registers long and starts one MRF after the
+ * first spill MRF.
+ */
+vec4_instruction *
+vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
+{
+   vec4_instruction *const read =
+      new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
+                                    dst, index);
+   read->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
+   read->mlen = 2;
+   return read;
+}
+
+/**
+ * Build (without emitting) a gen4-style scratch write of \p src at \p index.
+ * The message is three registers long and starts at the first spill MRF.
+ */
+vec4_instruction *
+vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
+                            const src_reg &index)
+{
+   vec4_instruction *const write =
+      new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
+                                    dst, src, index);
+   write->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
+   write->mlen = 3;
+   return write;
+}
+
+/**
+ * Make \p src usable as an operand of a three-source instruction.
+ *
+ * Returns \p src unchanged unless it is an immediate or a uniform with a
+ * multi-value swizzle; those are first unpacked into a fresh temporary.
+ */
+src_reg
+vec4_visitor::fix_3src_operand(const src_reg &src)
+{
+   /* Using vec4 uniforms in SIMD4x2 programs is difficult.  You'd like to be
+    * able to use vertical stride of zero to replicate the vec4 uniform, like
+    *
+    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
+    *
+    * But you can't, since vertical stride is always four in three-source
+    * instructions.  Instead, insert a MOV instruction to do the replication
+    * so that the three-source instruction can consume it.
+    */
+   const bool is_uniform = src.file == UNIFORM;
+
+   /* Only uniforms and immediates ever need the copy. */
+   if (!is_uniform && src.file != IMM)
+      return src;
+
+   /* A uniform that reads a single value can be used as it is. */
+   if (is_uniform && brw_is_single_value_swizzle(src.swizzle))
+      return src;
+
+   dst_reg unpacked(this, glsl_type::vec4_type);
+   unpacked.type = src.type;
+   emit(VEC4_OPCODE_UNPACK_UNIFORM, unpacked, src);
+
+   return src_reg(unpacked);
+}
+
+/**
+ * Return \p src with its abs/negate modifiers applied.
+ *
+ * A source without modifiers is returned unchanged; otherwise the value is
+ * copied with a MOV into a new temporary of the same register type and that
+ * temporary is returned.
+ */
+src_reg
+vec4_visitor::resolve_source_modifiers(const src_reg &src)
+{
+   const bool has_modifiers = src.abs || src.negate;
+   if (!has_modifiers)
+      return src;
+
+   dst_reg plain(this, glsl_type::ivec4_type);
+   plain.type = src.type;
+   emit(MOV(plain, src));
+
+   return src_reg(plain);
+}
+
+/**
+ * Return a form of \p src that is acceptable as a math instruction operand
+ * on the current generation, copying it to a temporary when required.
+ */
+src_reg
+vec4_visitor::fix_math_operand(const src_reg &src)
+{
+   /* An absent operand needs no fixing. */
+   if (src.file == BAD_FILE)
+      return src;
+
+   /* Only gen6 and gen7 need any treatment. */
+   if (devinfo->gen < 6 || devinfo->gen >= 8)
+      return src;
+
+   /* The gen6 math instruction ignores the source modifiers --
+    * swizzle, abs, negate, and at least some parts of the register
+    * region description.
+    *
+    * Rather than trying to enumerate all these cases, *always* expand the
+    * operand to a temp GRF for gen6.
+    *
+    * For gen7, keep the operand as-is, except if immediate, which gen7 still
+    * can't use.
+    */
+   if (devinfo->gen == 7 && src.file != IMM)
+      return src;
+
+   dst_reg temp(this, glsl_type::vec4_type);
+   temp.type = src.type;
+   emit(MOV(temp, src));
+
+   return src_reg(temp);
+}
+
+/**
+ * Emit the math instruction \p opcode, applying per-generation fixups.
+ *
+ * Both operands go through fix_math_operand(). Returns the last instruction
+ * emitted, i.e. the one that writes \p dst.
+ */
+vec4_instruction *
+vec4_visitor::emit_math(enum opcode opcode,
+                        const dst_reg &dst,
+                        const src_reg &src0, const src_reg &src1)
+{
+   vec4_instruction *result =
+      emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
+
+   if (devinfo->gen < 6) {
+      /* Before gen6 the instruction carries a message payload starting at
+       * MRF 1, one register long per operand that is present.
+       */
+      const bool unary = src1.file == BAD_FILE;
+
+      result->base_mrf = 1;
+      result->mlen = unary ? 1 : 2;
+      return result;
+   }
+
+   if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
+      /* MATH on Gen6 must be align1, so we can't do writemasks. Compute
+       * into a full temporary and copy it to the real destination.
+       */
+      result->dst = dst_reg(this, glsl_type::vec4_type);
+      result->dst.type = dst.type;
+      result = emit(MOV(dst, src_reg(result->dst)));
+   }
+
+   return result;
+}
+
+/**
+ * Emit the instruction sequence for packHalf2x16.
+ *
+ * \p src0 must be of type F and \p dst of type UD. The X and Y channels of
+ * \p src0 are converted with F32TO16 into a temporary, then combined so that
+ * each written channel of \p dst holds 0xhhhhllll (Y half in the upper word,
+ * X half in the lower word).
+ *
+ * Must not be reached on gen < 7.
+ */
+void
+vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
+{
+ if (devinfo->gen < 7) {
+ unreachable("ir_unop_pack_half_2x16 should be lowered");
+ }
+
+ assert(dst.type == BRW_REGISTER_TYPE_UD);
+ assert(src0.type == BRW_REGISTER_TYPE_F);
+
+ /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
+ *
+ * Because this instruction does not have a 16-bit floating-point type,
+ * the destination data type must be Word (W).
+ *
+ * The destination must be DWord-aligned and specify a horizontal stride
+ * (HorzStride) of 2. The 16-bit result is stored in the lower word of
+ * each destination channel and the upper word is not modified.
+ *
+ * The above restriction implies that the f32to16 instruction must use
+ * align1 mode, because only in align1 mode is it possible to specify
+ * horizontal stride. We choose here to defy the hardware docs and emit
+ * align16 instructions.
+ *
+ * (I [chadv] did attempt to emit align1 instructions for VS f32to16
+ * instructions. I was partially successful in that the code passed all
+ * tests. However, the code was dubiously correct and fragile, and the
+ * tests were not harsh enough to probe that frailty. Not trusting the
+ * code, I chose instead to remain in align16 mode in defiance of the hw
+ * docs).
+ *
+ * I've [chadv] experimentally confirmed that, on gen7 hardware and the
+ * simulator, emitting a f32to16 in align16 mode with UD as destination
+ * data type is safe. The behavior differs from that specified in the PRM
+ * in that the upper word of each destination channel is cleared to 0.
+ */
+
+ /* tmp_src is a source view of the same temporary that tmp_dst writes. */
+ dst_reg tmp_dst(this, glsl_type::uvec2_type);
+ src_reg tmp_src(tmp_dst);
+
+#if 0
+ /* Verify the undocumented behavior on which the following instructions
+ * rely. If f32to16 fails to clear the upper word of the X and Y channels,
+ * then the result of the bit-or instruction below will be incorrect.
+ *
+ * You should inspect the disasm output in order to verify that the MOV is
+ * not optimized away.
+ */
+ emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
+#endif
+
+ /* Give tmp the form below, where "." means untouched.
+ *
+ * w z y x w z y x
+ * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
+ *
+ * That the upper word of each write-channel be 0 is required for the
+ * following bit-shift and bit-or instructions to work. Note that this
+ * relies on the undocumented hardware behavior mentioned above.
+ */
+ tmp_dst.writemask = WRITEMASK_XY;
+ emit(F32TO16(tmp_dst, src0));
+
+ /* Give the write-channels of dst the form:
+ * 0xhhhh0000
+ */
+ tmp_src.swizzle = BRW_SWIZZLE_YYYY;
+ emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
+
+ /* Finally, give the write-channels of dst the form of packHalf2x16's
+ * output:
+ * 0xhhhhllll
+ */
+ tmp_src.swizzle = BRW_SWIZZLE_XXXX;
+ emit(OR(dst, src_reg(dst), tmp_src));
+}
+
+/**
+ * Emit the instruction sequence for unpackHalf2x16.
+ *
+ * \p src0 must be of type UD and \p dst of type F. The low and high words of
+ * \p src0 are split into the X and Y channels of a temporary, which is then
+ * converted with F16TO32 into the X and Y channels of \p dst.
+ *
+ * Must not be reached on gen < 7.
+ */
+void
+vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
+{
+ if (devinfo->gen < 7) {
+ unreachable("ir_unop_unpack_half_2x16 should be lowered");
+ }
+
+ assert(dst.type == BRW_REGISTER_TYPE_F);
+ assert(src0.type == BRW_REGISTER_TYPE_UD);
+
+ /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
+ *
+ * Because this instruction does not have a 16-bit floating-point type,
+ * the source data type must be Word (W). The destination type must be
+ * F (Float).
+ *
+ * To use W as the source data type, we must adjust horizontal strides,
+ * which is only possible in align1 mode. All my [chadv] attempts at
+ * emitting align1 instructions for unpackHalf2x16 failed to pass the
+ * Piglit tests, so I gave up.
+ *
+ * I've verified that, on gen7 hardware and the simulator, it is safe to
+ * emit f16to32 in align16 mode with UD as source data type.
+ */
+
+ /* tmp_src is a source view of the same temporary that tmp_dst writes. */
+ dst_reg tmp_dst(this, glsl_type::uvec2_type);
+ src_reg tmp_src(tmp_dst);
+
+ /* tmp.x = low 16 bits of src0 */
+ tmp_dst.writemask = WRITEMASK_X;
+ emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
+
+ /* tmp.y = high 16 bits of src0 */
+ tmp_dst.writemask = WRITEMASK_Y;
+ emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
+
+ /* Only X and Y of dst are written, whatever writemask the caller passed. */
+ dst.writemask = WRITEMASK_XY;
+ emit(F16TO32(dst, tmp_src));
+}
+
+/**
+ * Emit the instruction sequence for unpackUnorm4x8: each byte of the X
+ * channel of \p src0 becomes one float channel of \p dst, scaled by 1/255.
+ */
+void
+vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
+{
+   /* Rather than splitting the 32-bit integer, shifting, and ORing it back
+    * together, shift it by <0, 8, 16, 24> in one go. The packed integer
+    * immediate is not suitable to generate the shift values, but the packed
+    * vector float combined with a type-converting MOV is.
+    */
+   dst_reg shift_counts(this, glsl_type::uvec4_type);
+   emit(MOV(shift_counts, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
+
+   /* Every channel shifts the same packed value (src0.x). */
+   dst_reg per_channel(this, glsl_type::uvec4_type);
+   src0.swizzle = BRW_SWIZZLE_XXXX;
+   emit(SHR(per_channel, src0, src_reg(shift_counts)));
+
+   /* Reinterpret the shifted values as unsigned bytes and convert them. */
+   per_channel.type = BRW_REGISTER_TYPE_UB;
+   dst_reg as_float(this, glsl_type::vec4_type);
+   emit(VEC4_OPCODE_MOV_BYTES, as_float, src_reg(per_channel));
+
+   emit(MUL(dst, src_reg(as_float), brw_imm_f(1.0f / 255.0f)));
+}
+
+/**
+ * Emit the instruction sequence for unpackSnorm4x8: each signed byte of the
+ * X channel of \p src0 becomes one float channel of \p dst, scaled by 1/127
+ * and clamped to [-1, 1].
+ */
+void
+vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
+{
+   /* Rather than splitting the 32-bit integer, shifting, and ORing it back
+    * together, shift it by <0, 8, 16, 24> in one go. The packed integer
+    * immediate is not suitable to generate the shift values, but the packed
+    * vector float combined with a type-converting MOV is.
+    */
+   dst_reg shift_counts(this, glsl_type::uvec4_type);
+   emit(MOV(shift_counts, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
+
+   /* Every channel shifts the same packed value (src0.x). */
+   dst_reg per_channel(this, glsl_type::uvec4_type);
+   src0.swizzle = BRW_SWIZZLE_XXXX;
+   emit(SHR(per_channel, src0, src_reg(shift_counts)));
+
+   /* Reinterpret the shifted values as signed bytes and convert them. */
+   per_channel.type = BRW_REGISTER_TYPE_B;
+   dst_reg as_float(this, glsl_type::vec4_type);
+   emit(VEC4_OPCODE_MOV_BYTES, as_float, src_reg(per_channel));
+
+   dst_reg normalized(this, glsl_type::vec4_type);
+   emit(MUL(normalized, src_reg(as_float), brw_imm_f(1.0f / 127.0f)));
+
+   /* Clamp: first max(x, -1), then min(x, 1). */
+   dst_reg lower_bounded(this, glsl_type::vec4_type);
+   emit_minmax(BRW_CONDITIONAL_GE, lower_bounded, src_reg(normalized),
+               brw_imm_f(-1.0f));
+   emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(lower_bounded),
+               brw_imm_f(1.0f));
+}
+
+/**
+ * Emit the instruction sequence for packUnorm4x8: saturate \p src0, scale by
+ * 255, round to even, convert to unsigned integers and pack the resulting
+ * bytes into \p dst.
+ */
+void
+vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
+{
+   dst_reg clamped(this, glsl_type::vec4_type);
+   emit(MOV(clamped, src0))->saturate = true;
+
+   dst_reg in_byte_range(this, glsl_type::vec4_type);
+   emit(MUL(in_byte_range, src_reg(clamped), brw_imm_f(255.0f)));
+
+   dst_reg nearest(this, glsl_type::vec4_type);
+   emit(RNDE(nearest, src_reg(in_byte_range)));
+
+   /* Type-converting MOV: float -> unsigned integer. */
+   dst_reg as_uint(this, glsl_type::uvec4_type);
+   emit(MOV(as_uint, src_reg(nearest)));
+
+   emit(VEC4_OPCODE_PACK_BYTES, dst, src_reg(as_uint));
+}
+
+/**
+ * Emit the instruction sequence for packSnorm4x8: clamp \p src0 to [-1, 1],
+ * scale by 127, round to even, convert to signed integers and pack the
+ * resulting bytes into \p dst.
+ */
+void
+vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
+{
+   /* Clamp: first max(x, -1), then min(x, 1). */
+   dst_reg lower_bounded(this, glsl_type::vec4_type);
+   emit_minmax(BRW_CONDITIONAL_GE, lower_bounded, src0, brw_imm_f(-1.0f));
+
+   dst_reg clamped(this, glsl_type::vec4_type);
+   emit_minmax(BRW_CONDITIONAL_L, clamped, src_reg(lower_bounded),
+               brw_imm_f(1.0f));
+
+   dst_reg in_byte_range(this, glsl_type::vec4_type);
+   emit(MUL(in_byte_range, src_reg(clamped), brw_imm_f(127.0f)));
+
+   dst_reg nearest(this, glsl_type::vec4_type);
+   emit(RNDE(nearest, src_reg(in_byte_range)));
+
+   /* Type-converting MOV: float -> signed integer. */
+   dst_reg as_int(this, glsl_type::ivec4_type);
+   emit(MOV(as_int, src_reg(nearest)));
+
+   emit(VEC4_OPCODE_PACK_BYTES, dst, src_reg(as_int));
+}
+
+/*
+ * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
+ * false) elements needed to pack a type.
+ *
+ * When counting vec4s, a dual-slot scalar/vector (or matrix column) counts
+ * as two elements; when counting dvec4s it counts as one.
+ */
+static int
+type_size_xvec4(const struct glsl_type *type, bool as_vec4)
+{
+   switch (type->base_type) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_UINT64:
+   case GLSL_TYPE_INT64: {
+      if (!type->is_matrix()) {
+         /* Regardless of size of vector, it gets a vec4. This is bad
+          * packing for things like floats, but otherwise arrays become a
+          * mess. Hopefully a later pass over the code can pack scalars
+          * down if appropriate.
+          */
+         return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
+      }
+
+      /* Matrices take one element (or two, see above) per column. */
+      const glsl_type *column = type->column_type();
+      const unsigned slots_per_column =
+         (as_vec4 && column->is_dual_slot()) ? 2 : 1;
+      return type->matrix_columns * slots_per_column;
+   }
+
+   case GLSL_TYPE_ARRAY:
+      assert(type->length > 0);
+      return type_size_xvec4(type->fields.array, as_vec4) * type->length;
+
+   case GLSL_TYPE_STRUCT: {
+      int total = 0;
+      for (unsigned field = 0; field < type->length; field++)
+         total += type_size_xvec4(type->fields.structure[field].type,
+                                  as_vec4);
+      return total;
+   }
+
+   case GLSL_TYPE_SUBROUTINE:
+      return 1;
+
+   case GLSL_TYPE_SAMPLER:
+      /* Samplers take up no register space, since they're baked in at
+       * link time.
+       */
+   case GLSL_TYPE_ATOMIC_UINT:
+      return 0;
+
+   case GLSL_TYPE_IMAGE:
+      return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
+
+   case GLSL_TYPE_VOID:
+   case GLSL_TYPE_ERROR:
+   case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_FUNCTION:
+      unreachable("not reached");
+   }
+
+   return 0;
+}
+
+/**
+ * Returns the minimum number of vec4 elements needed to pack a type.
+ *
+ * For scalars and vectors it returns 1 (a single vec4), or 2 when the type
+ * is dual-slot; for matrices, the number of columns times the per-column
+ * count; for arrays and structs, the sum of the vec4 sizes of their
+ * elements; and for samplers and atomic counters, zero.
+ *
+ * This is useful to calculate how much register space is needed to store a
+ * particular type.
+ */
+extern "C" int
+type_size_vec4(const struct glsl_type *type)
+{
+ return type_size_xvec4(type, true);
+}
+
+/**
+ * Returns the minimum number of dvec4 elements needed to pack a type.
+ *
+ * For scalars and vectors it returns 1 (a single dvec4); for matrices, the
+ * number of columns; for arrays and structs, the sum of the dvec4 sizes of
+ * their elements; and for samplers and atomic counters, zero.
+ *
+ * This is useful to calculate how much register space is needed to store a
+ * particular type.
+ *
+ * Measuring double-precision vertex inputs as dvec4 is required because
+ * ARB_vertex_attrib_64bit states that these use the same number of locations
+ * as the single-precision version. That is, two consecutive dvec4s would be
+ * located in location "x" and location "x+1", not "x+2".
+ *
+ * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
+ * remap_vs_attrs() takes into account both the location and whether the
+ * type fits in one or two vec4 slots.
+ */
+extern "C" int
+type_size_dvec4(const struct glsl_type *type)
+{
+ return type_size_xvec4(type, false);
+}
+
+/**
+ * Construct a source register backed by a newly allocated VGRF large enough
+ * to hold \p type (as measured by type_size_vec4()).
+ *
+ * Arrays and records get the no-op swizzle; everything else gets the swizzle
+ * matching the type's vector element count.
+ */
+src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
+{
+   init();
+
+   const bool aggregate = type->is_array() || type->is_record();
+
+   this->file = VGRF;
+   this->nr = v->alloc.allocate(type_size_vec4(type));
+   this->type = brw_type_for_base_type(type);
+
+   if (aggregate)
+      this->swizzle = BRW_SWIZZLE_NOOP;
+   else
+      this->swizzle = brw_swizzle_for_size(type->vector_elements);
+}
+
+/**
+ * Construct a source register backed by a newly allocated VGRF large enough
+ * to hold \p size consecutive elements of \p type. The swizzle is always the
+ * no-op swizzle. \p size must be positive.
+ */
+src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
+{
+   assert(size > 0);
+
+   init();
+
+   this->file = VGRF;
+   this->nr = v->alloc.allocate(type_size_vec4(type) * size);
+   this->type = brw_type_for_base_type(type);
+   this->swizzle = BRW_SWIZZLE_NOOP;
+}
+
+/**
+ * Construct a destination register backed by a newly allocated VGRF large
+ * enough to hold \p type (as measured by type_size_vec4()).
+ *
+ * Arrays and records get a full XYZW writemask; everything else gets one bit
+ * per vector element of the type.
+ */
+dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
+{
+   init();
+
+   const bool aggregate = type->is_array() || type->is_record();
+
+   this->file = VGRF;
+   this->nr = v->alloc.allocate(type_size_vec4(type));
+   this->type = brw_type_for_base_type(type);
+
+   if (aggregate)
+      this->writemask = WRITEMASK_XYZW;
+   else
+      this->writemask = (1 << type->vector_elements) - 1;
+}
+
+/**
+ * Emit a SEL of \p src0 and \p src1 into \p dst carrying the given
+ * conditional modifier, and return the emitted instruction.
+ *
+ * Callers in this file pass BRW_CONDITIONAL_GE to get a maximum and
+ * BRW_CONDITIONAL_L to get a minimum.
+ */
+vec4_instruction *
+vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
+                          src_reg src0, src_reg src1)
+{
+   vec4_instruction *const sel = emit(BRW_OPCODE_SEL, dst, src0, src1);
+
+   sel->conditional_mod = conditionalmod;
+
+   return sel;
+}
+
+/**
+ * Emit a linear interpolation, dst = x*(1-a) + y*a, and return the
+ * instruction that writes \p dst.
+ */
+vec4_instruction *
+vec4_visitor::emit_lrp(const dst_reg &dst,
+                       const src_reg &x, const src_reg &y, const src_reg &a)
+{
+   if (devinfo->gen >= 6) {
+      /* Note that the instruction's argument order is reversed from GLSL
+       * and the IR.
+       */
+      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
+                      fix_3src_operand(x)));
+   }
+
+   /* Earlier generations don't support three source operations, so the
+    * expression x*(1-a) + y*a is built from MUL/ADD on temporaries that
+    * share the destination's writemask.
+    */
+   dst_reg weighted_y = dst_reg(this, glsl_type::vec4_type);
+   dst_reg inverse_a = dst_reg(this, glsl_type::vec4_type);
+   dst_reg weighted_x = dst_reg(this, glsl_type::vec4_type);
+
+   weighted_y.writemask = dst.writemask;
+   inverse_a.writemask = dst.writemask;
+   weighted_x.writemask = dst.writemask;
+
+   emit(MUL(weighted_y, y, a));
+   emit(ADD(inverse_a, negate(a), brw_imm_f(1.0f)));
+   emit(MUL(weighted_x, x, src_reg(inverse_a)));
+
+   return emit(ADD(dst, src_reg(weighted_x), src_reg(weighted_y)));
+}
+
+/**
+ * Emits the instructions needed to perform a pull constant load of the
+ * value at \p offset_reg in surface \p surf_index into \p dst.
+ *
+ * \p before_block and \p before_inst must either both be NULL, in which case
+ * the instructions are appended to the end of the instruction list, or both
+ * be non-NULL, in which case the instructions are inserted before
+ * \p before_inst.
+ *
+ * Three sequences exist:
+ * - gen9+: a two-register header is set up, the offset is written to the X
+ * channel REG_SIZE bytes into it, and the load uses that header;
+ * - gen7-8: the offset is copied to a temporary that the load reads;
+ * - earlier: the load reads \p offset_reg directly and uses an MRF.
+ */
+void
+vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
+ src_reg surf_index,
+ src_reg offset_reg,
+ bblock_t *before_block,
+ vec4_instruction *before_inst)
+{
+ assert((before_inst == NULL && before_block == NULL) ||
+ (before_inst && before_block));
+
+ /* Reused for every instruction built below; the last one assigned is the
+ * load itself, emitted at the bottom of the function.
+ */
+ vec4_instruction *pull;
+
+ if (devinfo->gen >= 9) {
+ /* Gen9+ needs a message header in order to use SIMD4x2 mode */
+ src_reg header(this, glsl_type::uvec4_type, 2);
+
+ pull = new(mem_ctx)
+ vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
+ dst_reg(header));
+
+ if (before_inst)
+ emit_before(before_block, before_inst, pull);
+ else
+ emit(pull);
+
+ /* The offset goes in the X channel, REG_SIZE bytes past the header. */
+ dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
+ offset_reg.type);
+ pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
+
+ if (before_inst)
+ emit_before(before_block, before_inst, pull);
+ else
+ emit(pull);
+
+ pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
+ dst,
+ surf_index,
+ header);
+ pull->mlen = 2;
+ pull->header_size = 1;
+ } else if (devinfo->gen >= 7) {
+ dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
+
+ grf_offset.type = offset_reg.type;
+
+ pull = MOV(grf_offset, offset_reg);
+
+ if (before_inst)
+ emit_before(before_block, before_inst, pull);
+ else
+ emit(pull);
+
+ pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
+ dst,
+ surf_index,
+ src_reg(grf_offset));
+ pull->mlen = 1;
+ } else {
+ pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
+ dst,
+ surf_index,
+ offset_reg);
+ pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
+ pull->mlen = 1;
+ }
+
+ /* Emit the load instruction built by whichever branch ran above. */
+ if (before_inst)
+ emit_before(before_block, before_inst, pull);
+ else
+ emit(pull);
+}
+
+/**
+ * Emit a FIND_LIVE_CHANNEL followed by a BROADCAST of \p src from that
+ * channel, both with force_writemask_all set, and return the register
+ * holding the broadcast value (retyped to \p src's type).
+ */
+src_reg
+vec4_visitor::emit_uniformize(const src_reg &src)
+{
+   const src_reg live_channel(this, glsl_type::uint_type);
+   const dst_reg uniform = retype(dst_reg(this, glsl_type::uint_type),
+                                  src.type);
+
+   vec4_instruction *const find =
+      emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(live_channel));
+   find->force_writemask_all = true;
+
+   vec4_instruction *const broadcast =
+      emit(SHADER_OPCODE_BROADCAST, uniform, src, live_channel);
+   broadcast->force_writemask_all = true;
+
+   return src_reg(uniform);
+}
+
+/**
+ * Emit a SHADER_OPCODE_TXF_MCS fetch for \p coordinate on \p surface and
+ * return the uvec4 register holding its result.
+ *
+ * The payload starts at MRF 2. On gen9+ a SIMD4x2 header precedes the
+ * parameters, making the message two registers long instead of one.
+ */
+src_reg
+vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
+                             src_reg coordinate, src_reg surface)
+{
+   vec4_instruction *const fetch =
+      new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
+                                    dst_reg(this, glsl_type::uvec4_type));
+   fetch->base_mrf = 2;
+   fetch->src[1] = surface;
+   fetch->src[2] = surface;
+
+   int first_param_mrf = fetch->base_mrf;
+
+   if (devinfo->gen < 9) {
+      fetch->mlen = 1;
+   } else {
+      /* Gen9+ needs a message header in order to use SIMD4x2 mode */
+      vec4_instruction *const set_header = new(mem_ctx)
+         vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
+                          dst_reg(MRF, fetch->base_mrf));
+
+      emit(set_header);
+
+      fetch->mlen = 2;
+      fetch->header_size = 1;
+      first_param_mrf++;
+   }
+
+   /* parameters are: u, v, r, lod; lod will always be zero due to api
+    * restrictions. Channels the coordinate does not cover are zeroed.
+    */
+   const int written_channels = (1 << coordinate_type->vector_elements) - 1;
+   const int zeroed_channels = 0xf & ~written_channels;
+
+   emit(MOV(dst_reg(MRF, first_param_mrf, coordinate_type, written_channels),
+            coordinate));
+
+   emit(MOV(dst_reg(MRF, first_param_mrf, coordinate_type, zeroed_channels),
+            brw_imm_d(0)));
+
+   emit(fetch);
+
+   return src_reg(fetch->dst);
+}
+
+/**
+ * Report whether \p sampler counts as a "high" sampler: on Haswell and
+ * gen8+, any sampler that is not an immediate, or an immediate index of 16
+ * or more. Always false on other hardware.
+ */
+bool
+vec4_visitor::is_high_sampler(src_reg sampler)
+{
+   const bool hw_has_high_samplers =
+      devinfo->gen >= 8 || devinfo->is_haswell;
+
+   return hw_has_high_samplers &&
+          (sampler.file != IMM || sampler.ud >= 16);
+}
+
+/**
+ * Emit the message payload and sampler instruction for texture operation
+ * \p op, writing the result to \p dest.
+ *
+ * The payload is built with MOVs into MRFs starting at base MRF 2 (one
+ * register later when a message header is used), and the instruction's mlen
+ * is grown as parameters are added. Optional operands (\p shadow_comparator,
+ * \p lod, \p offset_value, ...) are treated as absent when their file is
+ * BAD_FILE.
+ *
+ * Special cases visible below:
+ * - ir_tex and ir_query_levels force the LOD to 0.0;
+ * - ir_samples_identical just writes 0 to \p dest and returns;
+ * - ir_txb and ir_lod are not valid here.
+ *
+ * After the instruction is emitted, fixups are applied for ir_txs on
+ * gen < 7, for ir_tg4 on gen6, and for ir_query_levels.
+ */
+void
+vec4_visitor::emit_texture(ir_texture_opcode op,
+ dst_reg dest,
+ const glsl_type *dest_type,
+ src_reg coordinate,
+ int coord_components,
+ src_reg shadow_comparator,
+ src_reg lod, src_reg lod2,
+ src_reg sample_index,
+ uint32_t constant_offset,
+ src_reg offset_value,
+ src_reg mcs,
+ uint32_t surface,
+ src_reg surface_reg,
+ src_reg sampler_reg)
+{
+ /* The sampler can only meaningfully compute LOD for fragment shader
+ * messages. For all other stages, we change the opcode to TXL and hardcode
+ * the LOD to 0.
+ *
+ * textureQueryLevels() is implemented in terms of TXS so we need to pass a
+ * valid LOD argument.
+ */
+ if (op == ir_tex || op == ir_query_levels) {
+ assert(lod.file == BAD_FILE);
+ lod = brw_imm_f(0.0f);
+ }
+
+ enum opcode opcode;
+ switch (op) {
+ case ir_tex: opcode = SHADER_OPCODE_TXL; break;
+ case ir_txl: opcode = SHADER_OPCODE_TXL; break;
+ case ir_txd: opcode = SHADER_OPCODE_TXD; break;
+ case ir_txf: opcode = SHADER_OPCODE_TXF; break;
+ case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
+ SHADER_OPCODE_TXF_CMS); break;
+ case ir_txs: opcode = SHADER_OPCODE_TXS; break;
+ case ir_tg4: opcode = offset_value.file != BAD_FILE
+ ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
+ case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
+ case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
+ case ir_txb:
+ unreachable("TXB is not valid for vertex shaders.");
+ case ir_lod:
+ unreachable("LOD is not valid for vertex shaders.");
+ case ir_samples_identical: {
+ /* There are some challenges implementing this for vec4, and it seems
+ * unlikely to be used anyway. For now, just return false always.
+ */
+ emit(MOV(dest, brw_imm_ud(0u)));
+ return;
+ }
+ default:
+ unreachable("Unrecognized tex op");
+ }
+
+ /* The instruction is built here but only emitted after all the payload
+ * MOVs below have been emitted.
+ */
+ vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
+
+ inst->offset = constant_offset;
+
+ /* The message header is necessary for:
+ * - Gen4 (always)
+ * - Gen9+ for selecting SIMD4x2
+ * - Texel offsets
+ * - Gather channel selection
+ * - Sampler indices too large to fit in a 4-bit value.
+ * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
+ */
+ inst->header_size =
+ (devinfo->gen < 5 || devinfo->gen >= 9 ||
+ inst->offset != 0 || op == ir_tg4 ||
+ op == ir_texture_samples ||
+ is_high_sampler(sampler_reg)) ? 1 : 0;
+ inst->base_mrf = 2;
+ inst->mlen = inst->header_size;
+ inst->dst.writemask = WRITEMASK_XYZW;
+ inst->shadow_compare = shadow_comparator.file != BAD_FILE;
+
+ inst->src[1] = surface_reg;
+ inst->src[2] = sampler_reg;
+
+ /* MRF for the first parameter */
+ int param_base = inst->base_mrf + inst->header_size;
+
+ if (op == ir_txs || op == ir_query_levels) {
+ /* Size queries take only the LOD: in .w on gen4, in .x otherwise. */
+ int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
+ emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
+ inst->mlen++;
+ } else if (op == ir_texture_samples) {
+ inst->dst.writemask = WRITEMASK_X;
+ } else {
+ /* Load the coordinate */
+ /* FINISHME: gl_clamp_mask and saturate */
+ int coord_mask = (1 << coord_components) - 1;
+ int zero_mask = 0xf & ~coord_mask;
+
+ emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
+ coordinate));
+ inst->mlen++;
+
+ /* Zero whatever channels the coordinate does not cover. */
+ if (zero_mask != 0) {
+ emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
+ brw_imm_d(0)));
+ }
+ /* Load the shadow comparator */
+ if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
+ emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
+ WRITEMASK_X),
+ shadow_comparator));
+ inst->mlen++;
+ }
+
+ /* Load the LOD info */
+ if (op == ir_tex || op == ir_txl) {
+ int mrf, writemask;
+ if (devinfo->gen >= 5) {
+ mrf = param_base + 1;
+ if (shadow_comparator.file != BAD_FILE) {
+ writemask = WRITEMASK_Y;
+ /* mlen already incremented */
+ } else {
+ writemask = WRITEMASK_X;
+ inst->mlen++;
+ }
+ } else /* devinfo->gen == 4 */ {
+ mrf = param_base;
+ writemask = WRITEMASK_W;
+ }
+ emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
+ } else if (op == ir_txf) {
+ emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
+ } else if (op == ir_txf_ms) {
+ emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
+ sample_index));
+ if (opcode == SHADER_OPCODE_TXF_CMS_W) {
+ /* MCS data is stored in the first two channels of ‘mcs’, but we
+ * need to get it into the .y and .z channels of the second vec4
+ * of params.
+ */
+ mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
+ emit(MOV(dst_reg(MRF, param_base + 1,
+ glsl_type::uint_type, WRITEMASK_YZ),
+ mcs));
+ } else if (devinfo->gen >= 7) {
+ /* MCS data is in the first channel of `mcs`, but we need to get it into
+ * the .y channel of the second vec4 of params, so replicate .x across
+ * the whole vec4 and then mask off everything except .y
+ */
+ mcs.swizzle = BRW_SWIZZLE_XXXX;
+ emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
+ mcs));
+ }
+ inst->mlen++;
+ } else if (op == ir_txd) {
+ const brw_reg_type type = lod.type;
+
+ if (devinfo->gen >= 5) {
+ /* lod and lod2 are interleaved: lod.x/lod.y land in .x/.z and
+ * lod2.x/lod2.y in .y/.w of the second parameter register.
+ */
+ lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
+ lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
+ emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
+ emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
+ inst->mlen++;
+
+ if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
+ lod.swizzle = BRW_SWIZZLE_ZZZZ;
+ lod2.swizzle = BRW_SWIZZLE_ZZZZ;
+ emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
+ emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
+ inst->mlen++;
+
+ if (shadow_comparator.file != BAD_FILE) {
+ emit(MOV(dst_reg(MRF, param_base + 2,
+ shadow_comparator.type, WRITEMASK_Z),
+ shadow_comparator));
+ }
+ }
+ } else /* devinfo->gen == 4 */ {
+ emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
+ emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
+ inst->mlen += 2;
+ }
+ } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
+ /* Gather with offset: the comparator goes in .w of the coordinate
+ * register and the offset in .xy of the next one.
+ */
+ if (shadow_comparator.file != BAD_FILE) {
+ emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
+ shadow_comparator));
+ }
+
+ emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
+ offset_value));
+ inst->mlen++;
+ }
+ }
+
+ emit(inst);
+
+ /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
+ * spec requires layers.
+ */
+ if (op == ir_txs && devinfo->gen < 7) {
+ /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
+ emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
+ src_reg(inst->dst), brw_imm_d(1));
+ }
+
+ if (devinfo->gen == 6 && op == ir_tg4) {
+ emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
+ }
+
+ if (op == ir_query_levels) {
+ /* # levels is in .w */
+ src_reg swizzled(dest);
+ swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
+ SWIZZLE_W, SWIZZLE_W);
+ emit(MOV(dest, swizzled));
+ }
+}
+
+/**
+ * Apply workarounds for Gen6 gather with UINT/SINT.
+ *
+ * \p wa is a bitmask of WA_* flags; zero means no workaround is needed.
+ * The value in \p dst is rewritten in place: scaled from UNORM to an 8- or
+ * 16-bit unsigned integer and, when WA_SIGN is set, sign-extended.
+ */
+void
+vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
+{
+   if (wa == 0)
+      return;
+
+   const int bits = (wa & WA_8BIT) ? 8 : 16;
+   const float unorm_max = (float)((1 << bits) - 1);
+
+   /* Float-typed view of the same register. */
+   dst_reg dst_as_float = dst;
+   dst_as_float.type = BRW_REGISTER_TYPE_F;
+
+   /* Convert from UNORM to UINT */
+   emit(MUL(dst_as_float, src_reg(dst_as_float), brw_imm_f(unorm_max)));
+   emit(MOV(dst, src_reg(dst_as_float)));
+
+   if (wa & WA_SIGN) {
+      /* Reinterpret the UINT value as a signed INT value by
+       * shifting the sign bit into place, then shifting back
+       * preserving sign.
+       */
+      const int shift = 32 - bits;
+
+      emit(SHL(dst, src_reg(dst), brw_imm_d(shift)));
+      emit(ASR(dst, src_reg(dst), brw_imm_d(shift)));
+   }
+}
+
+/**
+ * Geometry-shader vertex emission hook. This implementation must never be
+ * called; the stream id argument is unused.
+ *
+ * NOTE(review): presumably a geometry-shader-specific visitor overrides
+ * this -- confirm against the class declaration.
+ */
+void
+vec4_visitor::gs_emit_vertex(int /* stream_id */)
+{
+ unreachable("not reached");
+}
+
+/**
+ * Geometry-shader end-of-primitive hook. This implementation must never be
+ * called.
+ *
+ * NOTE(review): presumably a geometry-shader-specific visitor overrides
+ * this -- confirm against the class declaration.
+ */
+void
+vec4_visitor::gs_end_primitive()
+{
+ unreachable("not reached");
+}
+
+/**
+ * Compute normalized device coordinates, (x/w, y/w, z/w, 1/w), from the
+ * position output and record the result as the BRW_VARYING_SLOT_NDC output.
+ *
+ * Does nothing when the position output register is unset.
+ */
+void
+vec4_visitor::emit_ndc_computation()
+{
+   if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
+      return;
+
+   src_reg position = src_reg(output_reg[VARYING_SLOT_POS][0]);
+
+   /* Allocate the NDC register and publish it as a four-component output. */
+   dst_reg ndc_reg = dst_reg(this, glsl_type::vec4_type);
+   output_reg[BRW_VARYING_SLOT_NDC][0] = ndc_reg;
+   output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
+
+   current_annotation = "NDC";
+
+   /* ndc.w = 1 / pos.w */
+   dst_reg inverse_w = ndc_reg;
+   inverse_w.writemask = WRITEMASK_W;
+
+   src_reg position_w = position;
+   position_w.swizzle =
+      BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
+
+   emit_math(SHADER_OPCODE_RCP, inverse_w, position_w);
+
+   /* ndc.xyz = pos.xyz * (1 / pos.w) */
+   dst_reg scaled_xyz = ndc_reg;
+   scaled_xyz.writemask = WRITEMASK_XYZ;
+
+   emit(MUL(scaled_xyz, position, src_reg(inverse_w)));
+}
+
+/**
+ * Write the VUE header slot that carries point size and related fields
+ * into \p reg.
+ *
+ * Three cases:
+ * - gen < 6 with point size, clip distances or the negative-rhw workaround
+ * in play: a header dword is assembled in the .w channel of a temporary
+ * (point size, clip flags, workaround bit) and copied to \p reg;
+ * - gen < 6 otherwise: \p reg is zeroed;
+ * - gen >= 6: \p reg is zeroed, then point size, layer and viewport are
+ * copied into .w, .y and .z respectively when their slots are valid.
+ */
+void
+vec4_visitor::emit_psiz_and_flags(dst_reg reg)
+{
+ if (devinfo->gen < 6 &&
+ ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
+ output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
+ devinfo->has_negative_rhw_bug)) {
+ dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
+ dst_reg header1_w = header1;
+ header1_w.writemask = WRITEMASK_W;
+
+ emit(MOV(header1, brw_imm_ud(0u)));
+
+ if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
+ src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
+
+ /* Scale the point size by 2^11 and keep bits 8..18 of the result. */
+ current_annotation = "Point size";
+ emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
+ emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
+ }
+
+ if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
+ current_annotation = "Clipping flags";
+ dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
+ dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
+
+ /* Flags for the first clip-distance register (distance < 0) are ORed
+ * straight into the header.
+ */
+ emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
+ emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
+ emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
+
+ /* Flags for the second register are shifted up by four bits first. */
+ emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
+ emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
+ emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
+ emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
+ }
+
+ /* i965 clipping workaround:
+ * 1) Test for -ve rhw
+ * 2) If set,
+ * set ndc = (0,0,0,0)
+ * set ucp[6] = 1
+ *
+ * Later, clipping will detect ucp[6] and ensure the primitive is
+ * clipped against all fixed planes.
+ */
+ if (devinfo->has_negative_rhw_bug &&
+ output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
+ src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
+ ndc_w.swizzle = BRW_SWIZZLE_WWWW;
+ emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
+ /* Both instructions below are predicated on the CMP above. */
+ vec4_instruction *inst;
+ inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
+ inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ }
+
+ emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
+ } else if (devinfo->gen < 6) {
+ emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
+ } else {
+ emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
+ if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
+ /* Point size goes in .w, read with the source retyped to reg's type. */
+ dst_reg reg_w = reg;
+ reg_w.writemask = WRITEMASK_W;
+ src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
+ reg_as_src.type = reg_w.type;
+ reg_as_src.swizzle = brw_swizzle_for_size(1);
+ emit(MOV(reg_w, reg_as_src));
+ }
+ if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
+ /* Layer goes in .y as a signed integer. */
+ dst_reg reg_y = reg;
+ reg_y.writemask = WRITEMASK_Y;
+ reg_y.type = BRW_REGISTER_TYPE_D;
+ output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
+ emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
+ }
+ if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
+ /* Viewport index goes in .z as a signed integer. */
+ dst_reg reg_z = reg;
+ reg_z.writemask = WRITEMASK_Z;
+ reg_z.type = BRW_REGISTER_TYPE_D;
+ output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
+ emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
+ }
+ }
+}
+
+/**
+ * Copy the output stored at [\p varying][\p component] into \p reg.
+ *
+ * Returns the emitted MOV, or NULL when that output has no components or
+ * no register assigned.
+ */
+vec4_instruction *
+vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
+{
+   assert(varying < VARYING_SLOT_MAX);
+
+   const unsigned component_count = output_num_components[varying][component];
+   if (component_count == 0)
+      return NULL;
+
+   assert(output_reg[varying][component].type == reg.type);
+   current_annotation = output_reg_annotation[varying];
+
+   if (output_reg[varying][component].file == BAD_FILE)
+      return NULL;
+
+   /* Read from the packed component position and write only the channels
+    * this output occupies.
+    */
+   src_reg value = src_reg(output_reg[varying][component]);
+   value.swizzle = BRW_SWZ_COMP_OUTPUT(component);
+   reg.writemask =
+      brw_writemask_for_component_packing(component_count, component);
+
+   return emit(MOV(reg, value));
+}
+
+/**
+ * Emit the instructions that fill \p reg with the contents of the VUE slot
+ * for \p varying.
+ *
+ * Both \p reg and the varying's first output register are forced to type F
+ * before dispatching on the slot kind.
+ */
+void
+vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
+{
+   output_reg[varying][0].type = BRW_REGISTER_TYPE_F;
+   reg.type = BRW_REGISTER_TYPE_F;
+
+   switch (varying) {
+   case BRW_VARYING_SLOT_PAD:
+      /* No need to write to this slot */
+      break;
+
+   case VARYING_SLOT_PSIZ:
+      /* PSIZ is always in slot 0, and is coupled with other flags. */
+      current_annotation = "indices, point width, clip flags";
+      emit_psiz_and_flags(reg);
+      break;
+
+   case BRW_VARYING_SLOT_NDC:
+      current_annotation = "NDC";
+      if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
+         emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
+      break;
+
+   case VARYING_SLOT_POS:
+      current_annotation = "gl_Position";
+      if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
+         emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
+      break;
+
+   case VARYING_SLOT_EDGE: {
+      /* This is present when doing unfilled polygons. We're supposed to copy
+       * the edge flag from the user-provided vertex array
+       * (glEdgeFlagPointer), or otherwise we'll copy from the current value
+       * of that attribute (starts as 1.0f). This is then used in clipping to
+       * determine which edges should be drawn as wireframe.
+       */
+      current_annotation = "edge flag";
+      const dst_reg edge_flag_attr(ATTR, VERT_ATTRIB_EDGEFLAG,
+                                   glsl_type::float_type, WRITEMASK_XYZW);
+      emit(MOV(reg, src_reg(edge_flag_attr)));
+      break;
+   }
+
+   default:
+      /* Generic varyings may be packed per component; emit each in turn. */
+      for (int component = 0; component < 4; ++component)
+         emit_generic_urb_slot(reg, varying, component);
+      break;
+   }
+}
+
+/**
+ * Return \p mlen padded as required for an interleaved URB write.
+ *
+ * On gen6+ an even \p mlen is bumped by one; an odd \p mlen, and any
+ * \p mlen on earlier generations, is returned unchanged.
+ */
+static int
+align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
+{
+   /* URB data written (does not include the message header reg) must
+    * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
+    * section 5.4.3.2.2: URB_INTERLEAVED.
+    *
+    * URB entries are allocated on a multiple of 1024 bits, so an
+    * extra 128 bits written here to make the end align to 256 is
+    * no problem.
+    */
+   const bool needs_padding = devinfo->gen >= 6 && (mlen % 2) != 1;
+
+   return needs_padding ? mlen + 1 : mlen;
+}
+
+
+/**
+ * Generates the VUE payload plus the necessary URB write instructions to
+ * output it.
+ *
+ * The payload is built in MRFs starting at base_mrf + 1 (base_mrf itself
+ * holds the message header). When the slots of the VUE map do not fit in
+ * one message, several URB writes are emitted; only the last one is
+ * created with complete == true.
+ *
+ * The VUE layout is documented in Volume 2a.
+ */
+void
+vec4_visitor::emit_vertex()
+{
+ /* MRF 0 is reserved for the debugger, so start with message header
+ * in MRF 1.
+ */
+ int base_mrf = 1;
+ int mrf = base_mrf;
+ /* In the process of generating our URB write message contents, we
+ * may need to unspill a register or load from an array. Those
+ * reads would use MRFs 14-15.
+ */
+ int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
+
+ /* The following assertion verifies that max_usable_mrf causes an
+ * even-numbered amount of URB write data, which will meet gen6's
+ * requirements for length alignment.
+ */
+ assert ((max_usable_mrf - base_mrf) % 2 == 0);
+
+ /* First mrf is the g0-based message header containing URB handles and
+ * such.
+ */
+ emit_urb_write_header(mrf++);
+
+ if (devinfo->gen < 6) {
+ emit_ndc_computation();
+ }
+
+ /* We may need to split this up into several URB writes, so do them in a
+ * loop.
+ */
+ int slot = 0;
+ bool complete = false;
+ do {
+ /* URB offset is in URB row increments, and each of our MRFs is half of
+ * one of those, since we're doing interleaved writes.
+ */
+ int offset = slot / 2;
+
+ mrf = base_mrf + 1; /* every message restarts right after the header register */
+ for (; slot < prog_data->vue_map.num_slots; ++slot) {
+ emit_urb_slot(dst_reg(MRF, mrf++),
+ prog_data->vue_map.slot_to_varying[slot]);
+
+ /* If this was max_usable_mrf, we can't fit anything more into this
+ * URB WRITE. Same thing if we reached the maximum length available.
+ */
+ if (mrf > max_usable_mrf ||
+ align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
+ slot++; /* the break skips the loop's own ++slot, so count this slot here */
+ break;
+ }
+ }
+
+ complete = slot >= prog_data->vue_map.num_slots;
+ current_annotation = "URB write";
+ vec4_instruction *inst = emit_urb_write_opcode(complete);
+ inst->base_mrf = base_mrf;
+ inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf); /* header plus payload registers, padded by the helper */
+ inst->offset += offset;
+ } while(!complete);
+}
+
+
+src_reg
+vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
+ src_reg *reladdr, int reg_offset)
+{
+ /* Returns the scratch message offset for @reg_offset, either as an
+ * immediate or, when @reladdr is given, as a new register computed by
+ * instructions inserted before @inst.
+ *
+ * Because we store the values to scratch interleaved like our
+ * vertex data, we need to scale the vec4 index by 2.
+ */
+ int message_header_scale = 2;
+
+ /* Pre-gen6, the message header uses byte offsets instead of vec4
+ * (16-byte) offset units.
+ */
+ if (devinfo->gen < 6)
+ message_header_scale *= 16;
+
+ if (reladdr) {
+ /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
+ * to multiply the reladdr by 2. Notice that the reg_offset part
+ * is in units of 16 bytes and is used to select the low/high 16-byte
+ * chunk of a full dvec4, so we don't want to multiply that part.
+ */
+ src_reg index = src_reg(this, glsl_type::int_type);
+ if (type_sz(inst->dst.type) < 8) {
+ emit_before(block, inst, ADD(dst_reg(index), *reladdr,
+ brw_imm_d(reg_offset))); /* index = reladdr + reg_offset */
+ emit_before(block, inst, MUL(dst_reg(index), index,
+ brw_imm_d(message_header_scale))); /* index *= scale */
+ } else {
+ emit_before(block, inst, MUL(dst_reg(index), *reladdr,
+ brw_imm_d(message_header_scale * 2))); /* index = reladdr * scale * 2 */
+ emit_before(block, inst, ADD(dst_reg(index), index,
+ brw_imm_d(reg_offset * message_header_scale))); /* index += reg_offset * scale */
+ }
+ return index;
+ } else {
+ return brw_imm_d(reg_offset * message_header_scale); /* no relative addressing: the offset is a compile-time constant */
+ }
+}
+
+/**
+ * Emits an instruction before @inst to load the value named by @orig_src
+ * from scratch space at @base_offset to @temp.
+ *
+ * @base_offset is measured in 32-byte units (the size of a register).
+ *
+ * Sources narrower than 8 bytes per component need a single scratch read.
+ * 64-bit sources are read as two consecutive registers into a temporary
+ * and then passed through shuffle_64bit_data() to produce @temp.
+ */
+void
+vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
+ dst_reg temp, src_reg orig_src,
+ int base_offset)
+{
+ assert(orig_src.offset % REG_SIZE == 0);
+ int reg_offset = base_offset + orig_src.offset / REG_SIZE;
+ src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
+ reg_offset);
+
+ if (type_sz(orig_src.type) < 8) {
+ emit_before(block, inst, SCRATCH_READ(temp, index));
+ } else {
+ dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
+ dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F); /* the reads themselves are done with a float type */
+ emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
+ index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1); /* second register of the dvec4 */
+ vec4_instruction *last_read =
+ SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
+ emit_before(block, inst, last_read);
+ shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
+ }
+}
+
+/**
+ * Emits the instruction(s) needed to store the value written by @inst to
+ * scratch space at @base_offset. @inst is redirected to write a newly
+ * allocated temporary, and the scratch write(s) read from that temporary.
+ *
+ * @base_offset is measured in 32-byte units (the size of a register).
+ */
+void
+vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
+ int base_offset)
+{
+ assert(inst->dst.offset % REG_SIZE == 0);
+ int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
+ src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
+ reg_offset);
+
+ /* Create a temporary register to store *inst's result in.
+ *
+ * We have to be careful in MOVing from our temporary result register in
+ * the scratch write. If we swizzle from channels of the temporary that
+ * weren't initialized, it will confuse live interval analysis, which will
+ * make spilling fail to make progress.
+ */
+ bool is_64bit = type_sz(inst->dst.type) == 8;
+ const glsl_type *alloc_type =
+ is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
+ const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
+ inst->dst.type),
+ brw_swizzle_for_mask(inst->dst.writemask));
+
+ if (!is_64bit) {
+ dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
+ inst->dst.writemask));
+ vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
+ if (inst->opcode != BRW_OPCODE_SEL) /* NOTE(review): presumably a SEL's predicate picks a source rather than masking the write - confirm */
+ write->predicate = inst->predicate;
+ write->ir = inst->ir;
+ write->annotation = inst->annotation;
+ inst->insert_after(block, write);
+ } else {
+ dst_reg shuffled = dst_reg(this, alloc_type);
+ vec4_instruction *last =
+ shuffle_64bit_data(shuffled, temp, true, block, inst);
+ src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
+
+ uint8_t mask = 0; /* 32-bit writemask for the first register: 64-bit X maps to XY, Y to ZW */
+ if (inst->dst.writemask & WRITEMASK_X)
+ mask |= WRITEMASK_XY;
+ if (inst->dst.writemask & WRITEMASK_Y)
+ mask |= WRITEMASK_ZW;
+ if (mask) {
+ dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
+
+ vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
+ if (inst->opcode != BRW_OPCODE_SEL)
+ write->predicate = inst->predicate;
+ write->ir = inst->ir;
+ write->annotation = inst->annotation;
+ last->insert_after(block, write);
+ }
+
+ mask = 0; /* same mapping for the second register: 64-bit Z maps to XY, W to ZW */
+ if (inst->dst.writemask & WRITEMASK_Z)
+ mask |= WRITEMASK_XY;
+ if (inst->dst.writemask & WRITEMASK_W)
+ mask |= WRITEMASK_ZW;
+ if (mask) {
+ dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
+
+ src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
+ reg_offset + 1); /* shadows the outer index: offset of the second register */
+ vec4_instruction *write =
+ SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
+ if (inst->opcode != BRW_OPCODE_SEL)
+ write->predicate = inst->predicate;
+ write->ir = inst->ir;
+ write->annotation = inst->annotation;
+ last->insert_after(block, write);
+ }
+ }
+
+ inst->dst.file = temp.file; /* from here on inst writes the temporary instead of the spilled register */
+ inst->dst.nr = temp.nr;
+ inst->dst.offset %= REG_SIZE;
+ inst->dst.reladdr = NULL;
+}
+
+/**
+ * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
+ * adds the scratch read(s) before \p inst. The function also checks for
+ * recursive reladdr scratch accesses, issuing the corresponding scratch
+ * loads and rewriting reladdr references accordingly.
+ *
+ * \p scratch_loc is indexed by VGRF number; an entry of -1 means that
+ * register is not accessed through scratch.
+ *
+ * \return \p src if it did not require a scratch load, otherwise, the
+ * register holding the result of the scratch load that the caller should
+ * use to rewrite src.
+ */
+src_reg
+vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
+ vec4_instruction *inst, src_reg src)
+{
+ /* Resolve recursive reladdr scratch access by calling ourselves
+ * with src.reladdr
+ */
+ if (src.reladdr)
+ *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
+ *src.reladdr);
+
+ /* Now handle scratch access on src */
+ if (src.file == VGRF && scratch_loc[src.nr] != -1) {
+ dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
+ glsl_type::dvec4_type : glsl_type::vec4_type);
+ emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
+ src.nr = temp.nr; /* read from the freshly loaded temporary instead */
+ src.offset %= REG_SIZE; /* emit_scratch_read() already applied offset / REG_SIZE */
+ src.reladdr = NULL; /* the relative address was folded into the scratch read */
+ }
+
+ return src;
+}
+
+/**
+ * We can't generally support array access in GRF space, because a
+ * single instruction's destination can only span 2 contiguous
+ * registers. So, we send all GRF arrays that get variable index
+ * access to scratch space.
+ *
+ * This runs in two passes over the program: the first assigns a scratch
+ * location to every VGRF that is accessed with a reladdr, the second
+ * rewrites each access to such a VGRF into scratch reads and writes.
+ */
+void
+vec4_visitor::move_grf_array_access_to_scratch()
+{
+ int scratch_loc[this->alloc.count];
+ memset(scratch_loc, -1, sizeof(scratch_loc)); /* -1 marks a VGRF that stays out of scratch */
+
+ /* First, calculate the set of virtual GRFs that need to be punted
+ * to scratch due to having any array access on them, and where in
+ * scratch.
+ */
+ foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+ if (inst->dst.file == VGRF && inst->dst.reladdr) {
+ if (scratch_loc[inst->dst.nr] == -1) {
+ scratch_loc[inst->dst.nr] = last_scratch;
+ last_scratch += this->alloc.sizes[inst->dst.nr];
+ }
+
+ for (src_reg *iter = inst->dst.reladdr;
+ iter->reladdr;
+ iter = iter->reladdr) { /* walk the reladdr chain; only links that themselves carry a reladdr are marked */
+ if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
+ scratch_loc[iter->nr] = last_scratch;
+ last_scratch += this->alloc.sizes[iter->nr];
+ }
+ }
+ }
+
+ for (int i = 0 ; i < 3; i++) {
+ for (src_reg *iter = &inst->src[i];
+ iter->reladdr;
+ iter = iter->reladdr) { /* same walk for each source, starting at the source itself */
+ if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
+ scratch_loc[iter->nr] = last_scratch;
+ last_scratch += this->alloc.sizes[iter->nr];
+ }
+ }
+ }
+ }
+
+ /* Now, for anything that will be accessed through scratch, rewrite
+ * it to load/store. Note that this is a _safe list walk, because
+ * we may generate a new scratch_write instruction after the one
+ * we're processing.
+ */
+ foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+ /* Set up the annotation tracking for new generated instructions. */
+ base_ir = inst->ir;
+ current_annotation = inst->annotation;
+
+ /* First handle scratch access on the dst. Notice we have to handle
+ * the case where the dst's reladdr also points to scratch space.
+ */
+ if (inst->dst.reladdr)
+ *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
+ *inst->dst.reladdr);
+
+ /* Now that we have handled any (possibly recursive) reladdr scratch
+ * accesses for dst we can safely do the scratch write for dst itself
+ */
+ if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
+ emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
+
+ /* Now handle scratch access on any src. In this case, since inst->src[i]
+ * already is a src_reg, we can just call emit_resolve_reladdr with
+ * inst->src[i] and it will take care of handling scratch loads for
+ * both src and src.reladdr (recursively).
+ */
+ for (int i = 0 ; i < 3; i++) {
+ inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
+ inst->src[i]);
+ }
+ }
+}
+
+/**
+ * Emits an instruction before @inst to load the value named by @orig_src
+ * from the pull constant buffer (surface) at @base_offset to @temp.
+ *
+ * @base_offset is combined with @orig_src.offset in 16-byte units. When
+ * @indirect is a valid register (its file is not BAD_FILE) it is added to
+ * the resulting byte offset.
+ */
+void
+vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
+ dst_reg temp, src_reg orig_src,
+ int base_offset, src_reg indirect)
+{
+ assert(orig_src.offset % 16 == 0);
+ const unsigned index = prog_data->base.binding_table.pull_constants_start;
+
+ /* For 64-bit loads we need to emit two 32-bit load messages and we also
+ * need to shuffle the 32-bit data result into proper 64-bit data. To do
+ * that we emit the 32-bit loads into a temporary and we shuffle the result
+ * into the original destination.
+ */
+ dst_reg orig_temp = temp;
+ bool is_64bit = type_sz(orig_src.type) == 8;
+ if (is_64bit) {
+ assert(type_sz(temp.type) == 8);
+ dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
+ temp = retype(temp_df, BRW_REGISTER_TYPE_F);
+ }
+
+ src_reg src = orig_src;
+ for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
+ int reg_offset = base_offset + src.offset / 16;
+
+ src_reg offset;
+ if (indirect.file != BAD_FILE) {
+ offset = src_reg(this, glsl_type::uint_type);
+ emit_before(block, inst, ADD(dst_reg(offset), indirect,
+ brw_imm_ud(reg_offset * 16))); /* offset = indirect + constant byte offset */
+ } else if (devinfo->gen >= 8) {
+ /* Store the offset in a GRF so we can send-from-GRF. */
+ offset = src_reg(this, glsl_type::uint_type);
+ emit_before(block, inst, MOV(dst_reg(offset),
+ brw_imm_ud(reg_offset * 16)));
+ } else {
+ offset = brw_imm_d(reg_offset * 16); /* older gens take the byte offset as an immediate */
+ }
+
+ emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
+ brw_imm_ud(index),
+ offset,
+ block, inst);
+
+ src = byte_offset(src, 16); /* advance to the next 16-byte chunk for a possible second load */
+ }
+
+ brw_mark_surface_used(&prog_data->base, index);
+
+ if (is_64bit) {
+ temp = retype(temp, BRW_REGISTER_TYPE_DF);
+ shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
+ }
+}
+
+/**
+ * Implements array access of uniforms by inserting a
+ * PULL_CONSTANT_LOAD instruction.
+ *
+ * Unlike temporary GRF array access (where we don't support it due to
+ * the difficulty of doing relative addressing on instruction
+ * destinations), we could potentially do array access of uniforms
+ * that were loaded in GRF space as push constants. In real-world
+ * usage we've seen, though, the arrays being used are always larger
+ * than we could load as push constants, so just always move all
+ * uniform array access out to a pull constant buffer.
+ */
+void
+vec4_visitor::move_uniform_array_access_to_pull_constants()
+{
+ /* The Vulkan driver doesn't support pull constants other than UBOs so
+ * everything has to be pushed regardless.
+ */
+ if (stage_prog_data->pull_param == NULL) {
+ split_uniform_registers();
+ return;
+ }
+
+ int pull_constant_loc[this->uniforms];
+ memset(pull_constant_loc, -1, sizeof(pull_constant_loc)); /* -1 means "not pulled" */
+
+ /* First, walk through the instructions and determine which things need to
+ * be pulled. We mark something as needing to be pulled by setting
+ * pull_constant_loc to 0.
+ */
+ foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+ /* We only care about MOV_INDIRECT of a uniform */
+ if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
+ inst->src[0].file != UNIFORM)
+ continue;
+
+ int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
+
+ for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++) /* NOTE(review): src[2] presumably holds the byte size of the addressed range - confirm */
+ pull_constant_loc[uniform_nr + j] = 0;
+ }
+
+ /* Next, we walk the list of uniforms and assign real pull constant
+ * locations and set their corresponding entries in pull_param.
+ */
+ for (int j = 0; j < this->uniforms; j++) {
+ if (pull_constant_loc[j] < 0)
+ continue;
+
+ pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4; /* location counted in groups of four params */
+
+ for (int i = 0; i < 4; i++) {
+ stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
+ = stage_prog_data->param[j * 4 + i];
+ }
+ }
+
+ /* Finally, we can walk through the instructions and lower MOV_INDIRECT
+ * instructions to actual uniform pulls.
+ */
+ foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+ /* We only care about MOV_INDIRECT of a uniform */
+ if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
+ inst->src[0].file != UNIFORM)
+ continue;
+
+ int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
+
+ assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
+
+ emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
+ pull_constant_loc[uniform_nr], inst->src[1]);
+ inst->remove(block); /* the MOV_INDIRECT is replaced by the load emitted before it */
+ }
+
+ /* Now there are no accesses of the UNIFORM file with a reladdr, so
+ * no need to track them as larger-than-vec4 objects. This will be
+ * relied on in cutting out unused uniform vectors from push
+ * constants.
+ */
+ split_uniform_registers();
+}
+
+void
+vec4_visitor::resolve_ud_negate(src_reg *reg)
+{
+   /* Only an unsigned-dword source carrying a negate modifier is rewritten:
+    * it is copied into a fresh uvec4 temporary with a MOV, and *reg is then
+    * replaced by that temporary.
+    */
+   const bool negated_ud =
+      reg->type == BRW_REGISTER_TYPE_UD && reg->negate;
+
+   if (negated_ud) {
+      src_reg resolved = src_reg(this, glsl_type::uvec4_type);
+      emit(BRW_OPCODE_MOV, dst_reg(resolved), *reg);
+      *reg = resolved;
+   }
+}
+
+vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
+ void *log_data,
+ const struct brw_sampler_prog_key_data *key_tex,
+ struct brw_vue_prog_data *prog_data,
+ const nir_shader *shader,
+ void *mem_ctx,
+ bool no_spills,
+ int shader_time_index)
+ : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
+ key_tex(key_tex),
+ prog_data(prog_data),
+ fail_msg(NULL),
+ first_non_payload_grf(0),
+ need_all_constants_in_pull_buffer(false),
+ no_spills(no_spills),
+ shader_time_index(shader_time_index),
+ last_scratch(0)
+{ /* Puts the visitor into its initial state: no failure recorded, no
+ * annotations, no outputs, no liveness data and no uniforms yet.
+ */
+ this->failed = false;
+
+ this->base_ir = NULL;
+ this->current_annotation = NULL;
+ memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
+
+ memset(this->output_num_components, 0, sizeof(this->output_num_components)); /* a component count of 0 means "no output" to emit_generic_urb_slot() */
+
+ this->virtual_grf_start = NULL;
+ this->virtual_grf_end = NULL;
+ this->live_intervals = NULL;
+
+ this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF; /* NOTE(review): presumably the GRFs from GEN7_MRF_HACK_START up stand in for MRFs on gen7+ - confirm */
+
+ this->uniforms = 0;
+}
+
+vec4_visitor::~vec4_visitor()
+{ /* Empty body: the destructor performs no explicit cleanup. */
+}
+
+
+void
+vec4_visitor::fail(const char *format, ...)
+{
+   /* Only the first failure is recorded; later calls are ignored. */
+   if (failed)
+      return;
+   failed = true;
+
+   /* Expand the caller's printf-style message first... */
+   va_list args;
+   va_start(args, format);
+   char *reason = ralloc_vasprintf(mem_ctx, format, args);
+   va_end(args);
+
+   /* ...then wrap it with the stage name and keep it as the failure text. */
+   char *full_msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n",
+                                    stage_abbrev, reason);
+   this->fail_msg = full_msg;
+
+   if (debug_enabled)
+      fprintf(stderr, "%s", full_msg);
+}
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_vs.h b/src/intel/compiler/brw_vec4_vs.h
new file mode 100644
index 00000000000..8c346d7636a
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_vs.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright © 2006 - 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_VEC4_VS_VISITOR_H
+#define BRW_VEC4_VS_VISITOR_H
+
+#include "brw_vec4.h"
+
+namespace brw {
+
+class vec4_vs_visitor : public vec4_visitor /* vec4_visitor specialization for vertex shaders */
+{
+public:
+ vec4_vs_visitor(const struct brw_compiler *compiler,
+ void *log_data,
+ const struct brw_vs_prog_key *key,
+ struct brw_vs_prog_data *vs_prog_data,
+ const nir_shader *shader,
+ gl_clip_plane *clip_planes,
+ void *mem_ctx,
+ int shader_time_index,
+ bool use_legacy_snorm_formula);
+
+protected:
+ virtual dst_reg *make_reg_for_system_value(int location); /* maps a SYSTEM_VALUE_* to an ATTR register */
+ virtual void setup_payload();
+ virtual void emit_prolog();
+ virtual void emit_thread_end(); /* lowers user clipping, then emits the vertex */
+ virtual void emit_urb_write_header(int mrf);
+ virtual void emit_urb_slot(dst_reg reg, int varying); /* adds color clamping on top of the base class */
+ virtual vec4_instruction *emit_urb_write_opcode(bool complete);
+
+private:
+ int setup_attributes(int payload_reg);
+ void setup_uniform_clipplane_values();
+ void emit_clip_distances(dst_reg reg, int offset);
+
+ const struct brw_vs_prog_key *const key;
+ struct brw_vs_prog_data * const vs_prog_data;
+
+ gl_clip_plane *clip_planes; /* user clip plane values that the clip-plane uniforms point at */
+
+ bool use_legacy_snorm_formula; /* stored by the constructor */
+};
+
+} /* namespace brw */
+
+#endif /* BRW_VEC4_VS_VISITOR_H */
diff --git a/src/intel/compiler/brw_vec4_vs_visitor.cpp b/src/intel/compiler/brw_vec4_vs_visitor.cpp
new file mode 100644
index 00000000000..0cec77990d6
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_vs_visitor.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+#include "brw_vec4_vs.h"
+#include "common/gen_debug.h"
+
+namespace brw {
+
+void
+vec4_vs_visitor::emit_prolog()
+{ /* Empty body: no prolog instructions are emitted for the vertex shader. */
+}
+
+
+dst_reg *
+vec4_vs_visitor::make_reg_for_system_value(int location)
+{
+ /* VertexID is stored by the VF as the last vertex element, but
+ * we don't represent it with a flag in inputs_read, so we call
+ * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
+ *
+ * Each system value below selects one channel of that attribute through
+ * the writemask and records its use in vs_prog_data. DRAW_ID is the
+ * exception: it gets a separate register at VERT_ATTRIB_MAX + 1.
+ */
+ dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
+
+ switch (location) {
+ case SYSTEM_VALUE_BASE_VERTEX:
+ reg->writemask = WRITEMASK_X;
+ vs_prog_data->uses_basevertex = true;
+ break;
+ case SYSTEM_VALUE_BASE_INSTANCE:
+ reg->writemask = WRITEMASK_Y;
+ vs_prog_data->uses_baseinstance = true;
+ break;
+ case SYSTEM_VALUE_VERTEX_ID:
+ case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE: /* both vertex-ID flavours share the Z channel */
+ reg->writemask = WRITEMASK_Z;
+ vs_prog_data->uses_vertexid = true;
+ break;
+ case SYSTEM_VALUE_INSTANCE_ID:
+ reg->writemask = WRITEMASK_W;
+ vs_prog_data->uses_instanceid = true;
+ break;
+ case SYSTEM_VALUE_DRAW_ID:
+ reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX + 1); /* replaces the register allocated above */
+ reg->writemask = WRITEMASK_X;
+ vs_prog_data->uses_drawid = true;
+ break;
+ default:
+ unreachable("not reached");
+ }
+
+ return reg;
+}
+
+
+void
+vec4_vs_visitor::emit_urb_write_header(int mrf)
+{
+ /* No need to do anything for VS; an implied write to this MRF will be
+ * performed by VS_OPCODE_URB_WRITE.
+ */
+ (void) mrf; /* the parameter is deliberately unused */
+}
+
+
+vec4_instruction *
+vec4_vs_visitor::emit_urb_write_opcode(bool complete)
+{
+   /* For VS, the URB writes end the thread, so the shader-time epilogue
+    * (when that debug option is on) has to go in before the final write.
+    */
+   if (complete && (INTEL_DEBUG & DEBUG_SHADER_TIME))
+      emit_shader_time_end();
+
+   vec4_instruction *write = emit(VS_OPCODE_URB_WRITE);
+
+   /* Only the last write of the vertex carries the EOT/complete flags. */
+   if (complete)
+      write->urb_write_flags = BRW_URB_WRITE_EOT_COMPLETE;
+   else
+      write->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
+
+   return write;
+}
+
+
+void
+vec4_vs_visitor::emit_urb_slot(dst_reg reg, int varying)
+{ /* Handles the legacy color varyings here so they can be clamped when
+ * the key requests it, and defers every other slot to the base class.
+ */
+ reg.type = BRW_REGISTER_TYPE_F;
+ output_reg[varying][0].type = reg.type;
+
+ switch (varying) {
+ case VARYING_SLOT_COL0:
+ case VARYING_SLOT_COL1:
+ case VARYING_SLOT_BFC0:
+ case VARYING_SLOT_BFC1: {
+ /* These built-in varyings are only supported in compatibility mode,
+ * and we only support GS in core profile. So, this must be a vertex
+ * shader.
+ */
+ vec4_instruction *inst = emit_generic_urb_slot(reg, varying, 0); /* only component 0 is emitted for these slots */
+ if (inst && key->clamp_vertex_color) /* inst may be NULL, in which case there is nothing to saturate */
+ inst->saturate = true;
+ break;
+ }
+ default:
+ return vec4_visitor::emit_urb_slot(reg, varying); /* everything else uses the base-class handling */
+ }
+}
+
+
+void
+vec4_vs_visitor::emit_clip_distances(dst_reg reg, int offset)
+{
+   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
+    *
+    * "If a linked set of shaders forming the vertex stage contains no
+    * static write to gl_ClipVertex or gl_ClipDistance, but the
+    * application has requested clipping against user clip planes through
+    * the API, then the coordinate written to gl_Position is used for
+    * comparison against the user clip planes."
+    *
+    * This function is only called if the shader didn't write to
+    * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
+    * if the user wrote to it; otherwise we use gl_Position.
+    */
+   const bool wrote_clip_vertex =
+      (prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX) != 0;
+   const gl_varying_slot clip_vertex =
+      wrote_clip_vertex ? VARYING_SLOT_CLIP_VERTEX : VARYING_SLOT_POS;
+
+   /* One DP4 per plane; each result lands in its own channel of reg.  At
+    * most four planes, starting at index offset, are handled per call.
+    */
+   for (int chan = 0; chan < 4; ++chan) {
+      const int plane = chan + offset;
+      if (plane >= key->nr_userclip_plane_consts)
+         break;
+
+      reg.writemask = 1 << chan;
+      emit(DP4(reg,
+               src_reg(output_reg[clip_vertex][0]),
+               src_reg(this->userplane[plane])));
+   }
+}
+
+
+void
+vec4_vs_visitor::setup_uniform_clipplane_values()
+{
+   /* Each user clip plane takes the next free uniform register; its four
+    * params point straight at the corresponding clip_planes[] entries.
+    */
+   const int num_planes = key->nr_userclip_plane_consts;
+
+   for (int plane = 0; plane < num_planes; ++plane) {
+      dst_reg &plane_reg = this->userplane[plane];
+      plane_reg = dst_reg(UNIFORM, this->uniforms);
+      plane_reg.type = BRW_REGISTER_TYPE_F;
+
+      const int first_param = this->uniforms * 4;
+      for (int comp = 0; comp < 4; ++comp) {
+         stage_prog_data->param[first_param + comp] =
+            (gl_constant_value *) &clip_planes[plane][comp];
+      }
+
+      ++this->uniforms;
+   }
+}
+
+
+void
+vec4_vs_visitor::emit_thread_end()
+{
+ setup_uniform_clipplane_values(); /* allocate the clip-plane uniforms before they are read below */
+
+ /* Lower legacy ff and ClipVertex clipping to clip distances */
+ if (key->nr_userclip_plane_consts > 0) {
+ current_annotation = "user clip distances";
+
+ output_reg[VARYING_SLOT_CLIP_DIST0][0] =
+ dst_reg(this, glsl_type::vec4_type);
+ output_reg[VARYING_SLOT_CLIP_DIST1][0] =
+ dst_reg(this, glsl_type::vec4_type);
+ output_num_components[VARYING_SLOT_CLIP_DIST0][0] = 4;
+ output_num_components[VARYING_SLOT_CLIP_DIST1][0] = 4;
+
+ emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0][0], 0); /* planes 0-3 */
+ emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1][0], 4); /* planes 4-7 */
+ }
+
+ /* For VS, we always end the thread by emitting a single vertex.
+ * emit_urb_write_opcode() will take care of setting the eot flag on the
+ * SEND instruction.
+ */
+ emit_vertex();
+}
+
+
+vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler,
+ void *log_data,
+ const struct brw_vs_prog_key *key,
+ struct brw_vs_prog_data *vs_prog_data,
+ const nir_shader *shader,
+ gl_clip_plane *clip_planes,
+ void *mem_ctx,
+ int shader_time_index,
+ bool use_legacy_snorm_formula)
+ : vec4_visitor(compiler, log_data, &key->tex, &vs_prog_data->base, shader,
+ mem_ctx, false /* no_spills */, shader_time_index),
+ key(key),
+ vs_prog_data(vs_prog_data),
+ clip_planes(clip_planes),
+ use_legacy_snorm_formula(use_legacy_snorm_formula)
+{ /* Everything is set up by the initializer list; the body is empty. */
+}
+
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vue_map.c b/src/intel/compiler/brw_vue_map.c
new file mode 100644
index 00000000000..e14cba8f67d
--- /dev/null
+++ b/src/intel/compiler/brw_vue_map.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file brw_vue_map.c
+ *
+ * This file computes the "VUE map" for a (non-fragment) shader stage, which
+ * describes the layout of its output varyings. The VUE map is used to match
+ * outputs from one stage with the inputs of the next.
+ *
+ * Largely, varyings can be placed however we like - producers/consumers simply
+ * have to agree on the layout. However, there is also a "VUE Header" that
+ * prescribes a fixed-layout for items that interact with fixed function
+ * hardware, such as the clipper and rasterizer.
+ *
+ * Authors:
+ * Paul Berry <[email protected]>
+ * Chris Forbes <[email protected]>
+ * Eric Anholt <[email protected]>
+ */
+
+
+#include "brw_compiler.h"
+#include "common/gen_debug.h"
+
+/**
+ * Bind \p varying and \p slot to each other in both lookup tables of
+ * \p vue_map.
+ */
+static inline void
+assign_vue_slot(struct brw_vue_map *vue_map, int varying, int slot)
+{
+   /* A varying may be given a slot at most once. */
+   assert(vue_map->varying_to_slot[varying] == -1);
+
+   vue_map->slot_to_varying[slot] = varying;
+   vue_map->varying_to_slot[varying] = slot;
+}
+
+/**
+ * Compute the VUE map for a shader stage.
+ *
+ * \param devinfo      device info; only the hardware generation is consulted
+ * \param vue_map      map to fill in
+ * \param slots_valid  bitfield of the varying slots used by the stage
+ * \param separate     whether the separate shader object (SSO) layout is
+ *                     requested; ignored before Gen6
+ */
+void
+brw_compute_vue_map(const struct gen_device_info *devinfo,
+                    struct brw_vue_map *vue_map,
+                    uint64_t slots_valid,
+                    bool separate)
+{
+   /* Old hardware sticks with the packed/contiguous layout: the SSO layout
+    * is only required for geometry/tessellation shaders or 32 FS input
+    * varyings, and those only exist on Gen >= 6.  The packed layout is also
+    * a bit more efficient.
+    */
+   const bool use_sso = separate && devinfo->gen >= 6;
+
+   if (use_sso) {
+      /* With SSO we cannot tell whether the adjacent stage reads or writes
+       * gl_ClipDistance, which lives at a fixed slot location.  Assume the
+       * worst and reserve slots for it; otherwise all remaining varyings
+       * would be off by a slot.
+       *
+       * COL/BFC need no such treatment: those built-ins only exist in
+       * legacy GL, which only supports VS and FS.
+       */
+      slots_valid |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) |
+                     BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
+   }
+
+   vue_map->separate = use_sso;
+   vue_map->slots_valid = slots_valid;
+
+   /* gl_Layer and gl_ViewportIndex have no varying slot of their own; they
+    * are stored in the first VUE slot (VARYING_SLOT_PSIZ).
+    */
+   slots_valid &= ~(VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
+
+   /* varying_to_slot and slot_to_varying are arrays of signed chars, so
+    * every value stored in them must fit.  slot_to_varying sometimes holds
+    * BRW_VARYING_SLOT_COUNT itself, hence the limit is 127 rather than 128.
+    */
+   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 127);
+
+   for (int v = 0; v < BRW_VARYING_SLOT_COUNT; v++) {
+      vue_map->slot_to_varying[v] = BRW_VARYING_SLOT_PAD;
+      vue_map->varying_to_slot[v] = -1;
+   }
+
+   int slot = 0;
+
+   /* VUE header: its format depends on the chip generation and on whether
+    * clipping is enabled.
+    *
+    * See the Sandybridge PRM, Volume 2 Part 1, section 1.5.1 (page 30),
+    * "Vertex URB Entry (VUE) Formats" which describes the VUE header layout.
+    *
+    * On every generation dwords 0-3 (indices, point width, clip flags) come
+    * first.
+    */
+   assign_vue_slot(vue_map, VARYING_SLOT_PSIZ, slot++);
+
+   if (devinfo->gen >= 6) {
+      /* There are 8 or 16 DWs (D0-D15) in the VUE header on Sandybridge:
+       * dword 4-7 is the 4D space position
+       * dword 8-15 of the vertex header is the user clip distance if
+       * enabled.
+       * dword 8-11 or 16-19 is the first vertex element data we fill.
+       *
+       * Front and back colors have to be consecutive so that
+       * ATTRIBUTE_SWIZZLE_INPUTATTR_FACING can swizzle them when doing
+       * two-sided color; the order of the table below takes care of that.
+       */
+      static const int fixed_order_varyings[] = {
+         VARYING_SLOT_CLIP_DIST0,
+         VARYING_SLOT_CLIP_DIST1,
+         VARYING_SLOT_COL0,
+         VARYING_SLOT_BFC0,
+         VARYING_SLOT_COL1,
+         VARYING_SLOT_BFC1,
+      };
+      const unsigned num_fixed_order_varyings =
+         sizeof(fixed_order_varyings) / sizeof(fixed_order_varyings[0]);
+
+      assign_vue_slot(vue_map, VARYING_SLOT_POS, slot++);
+
+      for (unsigned i = 0; i < num_fixed_order_varyings; i++) {
+         if (slots_valid & BITFIELD64_BIT(fixed_order_varyings[i]))
+            assign_vue_slot(vue_map, fixed_order_varyings[i], slot++);
+      }
+   } else {
+      /* There are 8 dwords in the VUE header pre-Ironlake:
+       * dword 4-7 is ndc position
+       * dword 8-11 is the first vertex data.
+       *
+       * On Ironlake the VUE header is nominally 20 dwords, but the hardware
+       * will accept the same header layout as Gen4 [and should be a bit
+       * faster]
+       */
+      assign_vue_slot(vue_map, BRW_VARYING_SLOT_NDC, slot++);
+      assign_vue_slot(vue_map, VARYING_SLOT_POS, slot++);
+   }
+
+   /* The hardware does not care about the remaining vertex outputs, so they
+    * can be placed however we like.  Normal programs simply get them laid
+    * out contiguously.
+    *
+    * For separate shader pipelines the built-in varyings are given
+    * contiguous slots first.  That works because ARB_separate_shader_objects
+    * requires all shaders to have matching built-in varying interface
+    * blocks.  Generic varyings then follow, placed by their location (either
+    * explicit or linker assigned), which guarantees a fixed layout.
+    *
+    * VARYING_SLOT_CLIP_VERTEX generally would not need a slot, since it is
+    * encoded as the clip distances by emit_clip_distances().  It may be
+    * output by transform feedback though, and we would rather not recompute
+    * state when TF changes, so it is always included.
+    */
+   for (uint64_t pending = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0);
+        pending != 0;
+        pending &= pending - 1 /* drop the bit just handled */) {
+      const int varying = ffsll(pending) - 1;
+
+      /* Header varyings were already placed above. */
+      if (vue_map->varying_to_slot[varying] == -1)
+         assign_vue_slot(vue_map, varying, slot++);
+   }
+
+   const int generic_base = slot;
+   for (uint64_t pending = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0);
+        pending != 0;
+        pending &= pending - 1 /* drop the bit just handled */) {
+      const int varying = ffsll(pending) - 1;
+
+      if (use_sso)
+         slot = generic_base + (varying - VARYING_SLOT_VAR0);
+
+      assign_vue_slot(vue_map, varying, slot);
+      slot++;
+   }
+
+   vue_map->num_slots = slot;
+   vue_map->num_per_vertex_slots = 0;
+   vue_map->num_per_patch_slots = 0;
+}
+
+/**
+ * Compute the VUE map for tessellation control shader outputs and
+ * tessellation evaluation shader inputs.
+ *
+ * \param vue_map       map to fill in
+ * \param vertex_slots  bitfield of per-vertex varying slots
+ * \param patch_slots   bitfield of per-patch varyings, where bit N stands
+ *                      for VARYING_SLOT_PATCH0 + N
+ */
+void
+brw_compute_tess_vue_map(struct brw_vue_map *vue_map,
+                         uint64_t vertex_slots,
+                         uint32_t patch_slots)
+{
+   /* Probably nothing actually reads this... */
+   vue_map->slots_valid = vertex_slots;
+
+   /* "separate" is not really meaningful here, but keep it initialized. */
+   vue_map->separate = false;
+
+   /* The tessellation levels get dedicated patch header slots below, so
+    * they must not be handled again as per-vertex varyings.
+    */
+   vertex_slots &= ~(VARYING_BIT_TESS_LEVEL_OUTER |
+                     VARYING_BIT_TESS_LEVEL_INNER);
+
+   /* varying_to_slot and slot_to_varying are arrays of signed chars, so
+    * every value stored in them must fit.  slot_to_varying sometimes holds
+    * VARYING_SLOT_TESS_MAX itself, hence the limit is 127 rather than 128.
+    */
+   STATIC_ASSERT(VARYING_SLOT_TESS_MAX <= 127);
+
+   for (int v = 0; v < VARYING_SLOT_TESS_MAX; v++) {
+      vue_map->slot_to_varying[v] = BRW_VARYING_SLOT_PAD;
+      vue_map->varying_to_slot[v] = -1;
+   }
+
+   int slot = 0;
+
+   /* The first 8 DWords are reserved for the "Patch Header".
+    *
+    * VARYING_SLOT_TESS_LEVEL_OUTER / INNER live here, but the exact layout
+    * depends on the domain type.  They might not be in slots 0 and 1 as
+    * described here, but pretending they are separate lets us identify them
+    * uniquely by distinct slot locations.
+    */
+   assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_INNER, slot++);
+   assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_OUTER, slot++);
+
+   /* Per-patch varyings come first. */
+   for (; patch_slots != 0; patch_slots &= patch_slots - 1) {
+      const int patch_varying = ffsll(patch_slots) - 1 + VARYING_SLOT_PATCH0;
+
+      if (vue_map->varying_to_slot[patch_varying] == -1)
+         assign_vue_slot(vue_map, patch_varying, slot++);
+   }
+
+   /* Apparently this count includes the patch header... */
+   vue_map->num_per_patch_slots = slot;
+
+   /* Per-vertex varyings, for each vertex in our patch, come next. */
+   for (; vertex_slots != 0; vertex_slots &= vertex_slots - 1) {
+      const int varying = ffsll(vertex_slots) - 1;
+
+      if (vue_map->varying_to_slot[varying] == -1)
+         assign_vue_slot(vue_map, varying, slot++);
+   }
+
+   vue_map->num_slots = slot;
+   vue_map->num_per_vertex_slots = slot - vue_map->num_per_patch_slots;
+}
+
+/**
+ * Return a printable name for \p slot.
+ *
+ * Slots below VARYING_SLOT_MAX are named by gl_varying_slot_name(); the
+ * remaining, driver-specific slots are looked up in a local table.
+ */
+static const char *
+varying_name(brw_varying_slot slot)
+{
+   /* Names of the driver-specific slots, indexed relative to
+    * VARYING_SLOT_MAX.
+    */
+   static const char *brw_names[] = {
+      [BRW_VARYING_SLOT_NDC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_NDC",
+      [BRW_VARYING_SLOT_PAD - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PAD",
+      [BRW_VARYING_SLOT_PNTC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PNTC",
+   };
+
+   assume(slot < BRW_VARYING_SLOT_COUNT);
+
+   if (slot >= VARYING_SLOT_MAX)
+      return brw_names[slot - VARYING_SLOT_MAX];
+
+   return gl_varying_slot_name(slot);
+}
+
+/**
+ * Print a human readable dump of \p vue_map to \p fp.
+ *
+ * A map with per-vertex or per-patch slots is printed as a "PUE map", with
+ * slots at or above VARYING_SLOT_PATCH0 shown as VARYING_SLOT_PATCH<n>.
+ * Any other map is printed as a "VUE map".
+ */
+void
+brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map)
+{
+   const bool is_patch_map = vue_map->num_per_vertex_slots > 0 ||
+                             vue_map->num_per_patch_slots > 0;
+
+   /* Header line. */
+   if (is_patch_map) {
+      fprintf(fp, "PUE map (%d slots, %d/patch, %d/vertex, %s)\n",
+              vue_map->num_slots,
+              vue_map->num_per_patch_slots,
+              vue_map->num_per_vertex_slots,
+              vue_map->separate ? "SSO" : "non-SSO");
+   } else {
+      fprintf(fp, "VUE map (%d slots, %s)\n",
+              vue_map->num_slots, vue_map->separate ? "SSO" : "non-SSO");
+   }
+
+   /* One line per slot. */
+   for (int i = 0; i < vue_map->num_slots; i++) {
+      const int varying = vue_map->slot_to_varying[i];
+
+      if (is_patch_map && varying >= VARYING_SLOT_PATCH0) {
+         fprintf(fp, " [%d] VARYING_SLOT_PATCH%d\n", i,
+                 varying - VARYING_SLOT_PATCH0);
+      } else {
+         fprintf(fp, " [%d] %s\n", i, varying_name(varying));
+      }
+   }
+
+   fprintf(fp, "\n");
+}
diff --git a/src/intel/compiler/brw_wm_iz.cpp b/src/intel/compiler/brw_wm_iz.cpp
new file mode 100644
index 00000000000..5162a369765
--- /dev/null
+++ b/src/intel/compiler/brw_wm_iz.cpp
@@ -0,0 +1,169 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <[email protected]>
+ */
+
+
+#include "brw_fs.h"
+
+
+/* Values used for the "mode" field of the wm_iz_table rows below. */
+#undef P /* promoted depth */
+#undef C /* computed */
+#undef N /* non-promoted? */
+
+#define P 0
+#define C 1
+#define N 2
+
+/* Table with BRW_WM_IZ_BIT_MAX rows, indexed by the key's iz_lookup value
+ * in fs_visitor::setup_fs_payload_gen4().
+ *
+ * Fields, as consumed by setup_fs_payload_gen4():
+ * mode: one of P, C or N
+ * sd_present: a source depth register is allocated in the payload
+ * sd_to_rt: source_depth_to_render_target gets set
+ * dd_present: a destination depth register is allocated in the payload
+ * ds_present: an AA/destination stencil register is allocated in the payload
+ */
+static const struct {
+ GLuint mode:2;
+ GLuint sd_present:1;
+ GLuint sd_to_rt:1;
+ GLuint dd_present:1;
+ GLuint ds_present:1;
+} wm_iz_table[BRW_WM_IZ_BIT_MAX] =
+{
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 0 },
+ { N, 0, 1, 0, 0 },
+ { N, 0, 1, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { C, 0, 1, 1, 0 },
+ { C, 0, 1, 1, 0 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 0 },
+ { C, 0, 1, 1, 0 },
+ { C, 0, 1, 1, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 0 },
+ { N, 0, 1, 0, 0 },
+ { N, 0, 1, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { C, 0, 1, 1, 0 },
+ { C, 0, 1, 1, 0 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 0 },
+ { C, 0, 1, 1, 0 },
+ { C, 0, 1, 1, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 1 },
+ { N, 0, 1, 0, 1 },
+ { N, 0, 1, 0, 1 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { C, 0, 1, 1, 1 },
+ { C, 0, 1, 1, 1 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 1 },
+ { C, 0, 1, 1, 1 },
+ { C, 0, 1, 1, 1 },
+ { P, 0, 0, 0, 0 },
+ { C, 0, 0, 0, 1 },
+ { P, 0, 0, 0, 0 },
+ { C, 0, 1, 0, 1 },
+ { P, 0, 0, 0, 0 },
+ { C, 1, 1, 0, 1 },
+ { C, 0, 1, 0, 1 },
+ { C, 0, 1, 0, 1 },
+ { P, 0, 0, 0, 0 },
+ { C, 1, 1, 1, 1 },
+ { C, 0, 1, 1, 1 },
+ { C, 0, 1, 1, 1 },
+ { P, 0, 0, 0, 0 },
+ { C, 1, 1, 1, 1 },
+ { C, 0, 1, 1, 1 },
+ { C, 0, 1, 1, 1 }
+};
+
+/**
+ * Lay out the Gen4 fragment shader thread payload.
+ *
+ * The function takes no arguments; everything comes from the program key:
+ * - key->iz_lookup, a bitmask of BRW_WM_IZ_* flags, selects a row of
+ *   wm_iz_table.
+ * - key->line_aa is BRW_WM_AA_NEVER, BRW_WM_AA_ALWAYS or
+ *   BRW_WM_AA_SOMETIMES.
+ * - key->stats_wm takes part in the windowizer workaround below.
+ *
+ * Payload registers are handed out starting at register 2, and the final
+ * count is stored in payload.num_regs.
+ */
+void fs_visitor::setup_fs_payload_gen4()
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
+   brw_wm_prog_key *wm_key = (brw_wm_prog_key*) this->key;
+   const int iz_index = wm_key->iz_lookup;
+
+   assert(iz_index < BRW_WM_IZ_BIT_MAX);
+
+   /* Crazy workaround in the windowizer, which we need to track in
+    * our register allocation and render target writes.  See the "If
+    * statistics are enabled..." paragraph of 11.5.3.2: Early Depth
+    * Test Cases [Pre-DevGT] of the 3D Pipeline - Windower B-Spec.
+    */
+   const bool kill_stats_promoted_workaround =
+      wm_key->stats_wm &&
+      (iz_index & BRW_WM_IZ_PS_KILL_ALPHATEST_BIT) &&
+      wm_iz_table[iz_index].mode == P;
+
+   const bool source_depth_present = wm_iz_table[iz_index].sd_present;
+   const bool source_depth_to_rt = wm_iz_table[iz_index].sd_to_rt;
+   const bool dest_depth_present = wm_iz_table[iz_index].dd_present;
+   const bool dest_stencil_present = wm_iz_table[iz_index].ds_present;
+
+   /* Next free payload register. */
+   GLuint next_reg = 2;
+
+   /* Source depth occupies two registers. */
+   wm_prog_data->uses_src_depth =
+      (nir->info->inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+   if (source_depth_present || wm_prog_data->uses_src_depth ||
+       kill_stats_promoted_workaround) {
+      payload.source_depth_reg = next_reg;
+      next_reg += 2;
+   }
+
+   if (source_depth_to_rt || kill_stats_promoted_workaround)
+      source_depth_to_render_target = true;
+
+   /* AA/destination stencil occupies a single register. */
+   if (dest_stencil_present || wm_key->line_aa != BRW_WM_AA_NEVER) {
+      payload.aa_dest_stencil_reg = next_reg;
+      runtime_check_aads_emit =
+         !dest_stencil_present && wm_key->line_aa == BRW_WM_AA_SOMETIMES;
+      next_reg += 1;
+   }
+
+   /* Destination depth occupies two registers. */
+   if (dest_depth_present) {
+      payload.dest_depth_reg = next_reg;
+      next_reg += 2;
+   }
+
+   payload.num_regs = next_reg;
+}
+
diff --git a/src/intel/compiler/gen6_gs_visitor.cpp b/src/intel/compiler/gen6_gs_visitor.cpp
new file mode 100644
index 00000000000..075bc4ad487
--- /dev/null
+++ b/src/intel/compiler/gen6_gs_visitor.cpp
@@ -0,0 +1,753 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * This code is based on original work by Ilia Mirkin.
+ */
+
+/**
+ * \file gen6_gs_visitor.cpp
+ *
+ * Gen6 geometry shader implementation
+ */
+
+#include "gen6_gs_visitor.h"
+#include "brw_eu.h"
+
+namespace brw {
+
+/**
+ * Emit the Gen6 geometry shader prolog.
+ *
+ * Runs the vec4_gs_visitor prolog and then allocates and initializes the
+ * registers used for buffering vertex output until thread end: the
+ * vertex_output array and its offset, the message header in MRF 1, the
+ * writeback temporary, the first_vertex flag and the primitive counter.
+ * When the program has transform feedback varyings the SOL registers are set
+ * up as well, and when the program data requests it the PrimitiveID is
+ * mapped to r1.
+ */
+void
+gen6_gs_visitor::emit_prolog()
+{
+ vec4_gs_visitor::emit_prolog();
+
+ /* Gen6 geometry shaders require to allocate an initial VUE handle via
+ * FF_SYNC message, however the documentation remarks that only one thread
+ * can write to the URB simultaneously and the FF_SYNC message provides the
+ * synchronization mechanism for this, so using this message effectively
+ * stalls the thread until it is its turn to write to the URB. Because of
+ * this, the best way to implement geometry shader algorithms in gen6 is to
+ * execute the algorithm before the FF_SYNC message to maximize parallelism.
+ *
+ * To achieve this we buffer the geometry shader outputs for each emitted
+ * vertex in vertex_output during operation. Then, when we have processed
+ * the last vertex (that is, at thread end time), we send the FF_SYNC
+ * message to allocate the initial VUE handle and write all buffered vertex
+ * data to the URB in one go.
+ *
+ * For each emitted vertex, vertex_output will hold vue_map.num_slots
+ * data items plus one additional item to hold required flags
+ * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
+ * which come right after the data items for that vertex. Vertex data and
+ * flags for the next vertex come right after the data items and flags for
+ * the previous vertex.
+ */
+ this->current_annotation = "gen6 prolog";
+ this->vertex_output = src_reg(this,
+ glsl_type::uint_type,
+ (prog_data->vue_map.num_slots + 1) *
+ nir->info->gs.vertices_out);
+ this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
+ emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
+
+ /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
+ * so initialize it once to R0.
+ */
+ vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
+ retype(brw_vec8_grf(0, 0),
+ BRW_REGISTER_TYPE_UD)));
+ inst->force_writemask_all = true;
+
+ /* This will be used as a temporary to store writeback data of FF_SYNC
+ * and URB_WRITE messages.
+ */
+ this->temp = src_reg(this, glsl_type::uint_type);
+
+ /* This will be used to know when we are processing the first vertex of
+ * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
+ * that we are processing the first vertex in the primitive and to zero
+ * otherwise. This way we can use its value directly in the URB write
+ * headers.
+ */
+ this->first_vertex = src_reg(this, glsl_type::uint_type);
+ emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(URB_WRITE_PRIM_START)));
+
+ /* The FF_SYNC message requires to know the number of primitives generated,
+ * so keep a counter for this.
+ */
+ this->prim_count = src_reg(this, glsl_type::uint_type);
+ emit(MOV(dst_reg(this->prim_count), brw_imm_ud(0u)));
+
+ if (prog->info.has_transform_feedback_varyings) {
+ /* Create a virtual register to hold destination indices in SOL */
+ this->destination_indices = src_reg(this, glsl_type::uvec4_type);
+ /* Create a virtual register to hold number of written primitives */
+ this->sol_prim_written = src_reg(this, glsl_type::uint_type);
+ /* Create a virtual register to hold Streamed Vertex Buffer Indices */
+ this->svbi = src_reg(this, glsl_type::uvec4_type);
+ /* Create a virtual register to hold max values of SVBI */
+ this->max_svbi = src_reg(this, glsl_type::uvec4_type);
+ emit(MOV(dst_reg(this->max_svbi),
+ src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));
+
+ xfb_setup();
+ }
+
+ /* PrimitiveID is delivered in r0.1 of the thread payload. If the program
+ * needs it we have to move it to a separate register where we can map
+ * the attribute.
+ *
+ * Notice that we cannot use a virtual register for this, because we need to
+ * map all input attributes to hardware registers in setup_payload(),
+ * which happens before virtual registers are mapped to hardware registers.
+ * We could work around that issue if we were able to compute the first
+ * non-payload register here and move the PrimitiveID information to that
+ * register, but we can't because at this point we don't know the final
+ * number of uniforms that will be included in the payload.
+ *
+ * So, what we do is to place PrimitiveID information in r1, which is always
+ * delivered as part of the payload, but it's only populated with data
+ * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE
+ * in the 3DSTATE_GS state packet. That information can be obtained by other
+ * means though, so we can safely use r1 for this purpose.
+ */
+ if (gs_prog_data->include_primitive_id) {
+ this->primitive_id =
+ src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+ emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
+ }
+}
+
+/**
+ * Buffer one emitted vertex in vertex_output.
+ *
+ * Every VUE slot of the vertex is written to vertex_output at
+ * vertex_output_offset, followed by one item holding the URB write flags
+ * for the vertex. vertex_output_offset is advanced by one after each item.
+ *
+ * \param stream_id not referenced in this function body
+ */
+void
+gen6_gs_visitor::gs_emit_vertex(int stream_id)
+{
+ this->current_annotation = "gen6 emit vertex";
+
+ /* Buffer all output slots for this vertex in vertex_output */
+ for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
+ int varying = prog_data->vue_map.slot_to_varying[slot];
+ if (varying != VARYING_SLOT_PSIZ) {
+ /* Address vertex_output indirectly through a copy of the current
+ * offset register.
+ */
+ dst_reg dst(this->vertex_output);
+ dst.reladdr = ralloc(mem_ctx, src_reg);
+ memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
+ emit_urb_slot(dst, varying);
+ } else {
+ /* The PSIZ slot can pack multiple varyings in different channels
+ * and emit_urb_slot() will produce a MOV instruction for each of
+ * them. Since we are writing to an array, that will translate to
+ * possibly multiple MOV instructions with an array destination and
+ * each will generate a scratch write with the same offset into
+ * scratch space (thus, each one overwriting the previous). This is
+ * not what we want. What we will do instead is emit PSIZ to a
+ * regular temporary register, then move that register into the
+ * array. This way we only have one instruction with an array
+ * destination and we only produce a single scratch write.
+ */
+ dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
+ emit_urb_slot(tmp, varying);
+ dst_reg dst(this->vertex_output);
+ dst.reladdr = ralloc(mem_ctx, src_reg);
+ memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
+ vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
+ inst->force_writemask_all = true;
+ }
+
+ /* Advance to the next item of vertex_output. */
+ emit(ADD(dst_reg(this->vertex_output_offset),
+ this->vertex_output_offset, brw_imm_ud(1u)));
+ }
+
+ /* Now buffer flags for this vertex */
+ dst_reg dst(this->vertex_output);
+ dst.reladdr = ralloc(mem_ctx, src_reg);
+ memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
+ if (nir->info->gs.output_primitive == GL_POINTS) {
+ /* If we are outputting points, then every vertex has PrimStart and
+ * PrimEnd set.
+ */
+ emit(MOV(dst, brw_imm_d((_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
+ URB_WRITE_PRIM_START | URB_WRITE_PRIM_END)));
+ emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
+ } else {
+ /* Otherwise, we can only set the PrimStart flag, which we have stored
+ * in the first_vertex register. We will have to wait until we execute
+ * EndPrimitive() or we end the thread to set the PrimEnd flag on a
+ * vertex.
+ */
+ emit(OR(dst, this->first_vertex,
+ brw_imm_ud(gs_prog_data->output_topology <<
+ URB_WRITE_PRIM_TYPE_SHIFT)));
+ emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(0u)));
+ }
+ /* Skip over the flags item just written. */
+ emit(ADD(dst_reg(this->vertex_output_offset),
+ this->vertex_output_offset, brw_imm_ud(1u)));
+}
+
+/**
+ * End the current primitive.
+ *
+ * Does nothing for point output. Otherwise it emits code that sets the
+ * PrimEnd flag on the most recently buffered vertex, increments prim_count
+ * and re-arms first_vertex so that the next vertex starts a new primitive.
+ */
+void
+gen6_gs_visitor::gs_end_primitive()
+{
+ this->current_annotation = "gen6 end primitive";
+ /* Calling EndPrimitive() is optional for point output. In this case we set
+ * the PrimEnd flag when we process EmitVertex().
+ */
+ if (nir->info->gs.output_primitive == GL_POINTS)
+ return;
+
+ /* Otherwise we know that the last vertex we have processed was the last
+ * vertex in the primitive and we need to set its PrimEnd flag, so do this
+ * unless we haven't emitted that vertex at all (vertex_count != 0).
+ *
+ * Notice that we have already incremented vertex_count when we processed
+ * the last emit_vertex, so we need to take that into account in the
+ * comparison below (hence the num_output_vertices + 1 in the comparison
+ * below).
+ */
+ unsigned num_output_vertices = nir->info->gs.vertices_out;
+ emit(CMP(dst_null_ud(), this->vertex_count,
+ brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L));
+ /* The second comparison is predicated on the first one, so the IF below
+ * presumably only runs when both conditions hold -- confirm against the
+ * predication semantics of CMP.
+ */
+ vec4_instruction *inst = emit(CMP(dst_null_ud(),
+ this->vertex_count, brw_imm_ud(0u),
+ BRW_CONDITIONAL_NEQ));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ emit(IF(BRW_PREDICATE_NORMAL));
+ {
+ /* vertex_output_offset is already pointing at the first entry of the
+ * next vertex. So subtract 1 to modify the flags for the previous
+ * vertex.
+ */
+ src_reg offset(this, glsl_type::uint_type);
+ emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));
+
+ src_reg dst(this->vertex_output);
+ dst.reladdr = ralloc(mem_ctx, src_reg);
+ memcpy(dst.reladdr, &offset, sizeof(src_reg));
+
+ emit(OR(dst_reg(dst), dst, brw_imm_d(URB_WRITE_PRIM_END)));
+ emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
+
+ /* Set the first vertex flag to indicate that the next vertex will start
+ * a primitive.
+ */
+ emit(MOV(dst_reg(this->first_vertex), brw_imm_d(URB_WRITE_PRIM_START)));
+ }
+ emit(BRW_OPCODE_ENDIF);
+}
+
+/**
+ * Write the flags of the current vertex into the URB write message header.
+ *
+ * \param mrf message register whose dword 2 receives the flags
+ */
+void
+gen6_gs_visitor::emit_urb_write_header(int mrf)
+{
+ this->current_annotation = "gen6 urb header";
+ /* Compute offset of the flags for the current vertex in vertex_output and
+ * write them in dw2 of the message header.
+ *
+ * Notice that by the time that emit_thread_end() calls here
+ * vertex_output_offset should point to the first data item of the current
+ * vertex in vertex_output, thus we only need to add the number of output
+ * slots per vertex to that offset to obtain the flags data offset.
+ */
+ src_reg flags_offset(this, glsl_type::uint_type);
+ emit(ADD(dst_reg(flags_offset),
+ this->vertex_output_offset,
+ brw_imm_d(prog_data->vue_map.num_slots)));
+
+ /* Read vertex_output indirectly through a copy of flags_offset. */
+ src_reg flags_data(this->vertex_output);
+ flags_data.reladdr = ralloc(mem_ctx, src_reg);
+ memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
+
+ emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
+}
+
+/**
+ * Round a URB write message length up to the next odd value.
+ *
+ * \param mlen message length, including the message header register
+ * \return \p mlen if it is already odd, otherwise \p mlen + 1
+ */
+static int
+align_interleaved_urb_mlen(int mlen)
+{
+   /* The URB data written, which excludes the message header register, has
+    * to be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
+    * section 5.4.3.2.2: URB_INTERLEAVED.  With the header register counted
+    * in, that makes an odd total length the aligned case.
+    */
+   const bool already_aligned = (mlen % 2) == 1;
+
+   return already_aligned ? mlen : mlen + 1;
+}
+
+/**
+ * Emit the URB write instruction for one chunk of vertex data.
+ *
+ * \param complete   true when this write finishes the vertex
+ * \param base_mrf   first message register of the message
+ * \param last_mrf   used with \p base_mrf to compute the message length
+ * \param urb_offset offset stored in the instruction
+ */
+void
+gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
+                                       int last_mrf, int urb_offset)
+{
+   vec4_instruction *urb_write;
+
+   if (complete) {
+      /* A completed vertex always requests the allocation of a new VUE
+       * handle.  If this is the last write before the EOT message and the
+       * new handle never gets used, it will be dereferenced when the EOT
+       * message is sent.  Doing it this way avoids having different setups
+       * for the EOT message (one for the case with no output and another
+       * for the case with output), which would require ending the program
+       * with an IF/ELSE/ENDIF block, something we do not want.
+       */
+      urb_write = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
+      urb_write->urb_write_flags = BRW_URB_WRITE_COMPLETE;
+      urb_write->dst = dst_reg(MRF, base_mrf);
+      urb_write->src[0] = this->temp;
+   } else {
+      /* Nothing special is needed while the vertex is still incomplete. */
+      urb_write = emit(GS_OPCODE_URB_WRITE);
+      urb_write->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
+   }
+
+   urb_write->offset = urb_offset;
+   urb_write->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf);
+   urb_write->base_mrf = base_mrf;
+}
+
+void
+gen6_gs_visitor::emit_thread_end()
+{
+ /* Make sure the current primitive is ended: we know it is not ended when
+ * first_vertex is not zero. This is only relevant for outputs other than
+ * points because in the point case we set PrimEnd on all vertices.
+ */
+ if (nir->info->gs.output_primitive != GL_POINTS) {
+ emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z));
+ emit(IF(BRW_PREDICATE_NORMAL));
+ gs_end_primitive();
+ emit(BRW_OPCODE_ENDIF);
+ }
+
+ /* Here we have to:
+ * 1) Emit an FF_SYNC messsage to obtain an initial VUE handle.
+ * 2) Loop over all buffered vertex data and write it to corresponding
+ * URB entries.
+ * 3) Allocate new VUE handles for all vertices other than the first.
+ * 4) Send a final EOT message.
+ */
+
+ /* MRF 0 is reserved for the debugger, so start with message header
+ * in MRF 1.
+ */
+ int base_mrf = 1;
+
+ /* In the process of generating our URB write message contents, we
+ * may need to unspill a register or load from an array. Those
+ * reads would use MRFs 21..23
+ */
+ int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
+
+ /* Issue the FF_SYNC message and obtain the initial VUE handle. */
+ emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_G));
+ emit(IF(BRW_PREDICATE_NORMAL));
+ {
+ this->current_annotation = "gen6 thread end: ff_sync";
+
+ vec4_instruction *inst;
+ if (prog->info.has_transform_feedback_varyings) {
+ src_reg sol_temp(this, glsl_type::uvec4_type);
+ emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
+ dst_reg(this->svbi),
+ this->vertex_count,
+ this->prim_count,
+ sol_temp);
+ inst = emit(GS_OPCODE_FF_SYNC,
+ dst_reg(this->temp), this->prim_count, this->svbi);
+ } else {
+ inst = emit(GS_OPCODE_FF_SYNC,
+ dst_reg(this->temp), this->prim_count, brw_imm_ud(0u));
+ }
+ inst->base_mrf = base_mrf;
+
+ /* Loop over all buffered vertices and emit URB write messages */
+ this->current_annotation = "gen6 thread end: urb writes init";
+ src_reg vertex(this, glsl_type::uint_type);
+ emit(MOV(dst_reg(vertex), brw_imm_ud(0u)));
+ emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
+
+ this->current_annotation = "gen6 thread end: urb writes";
+ emit(BRW_OPCODE_DO);
+ {
+ emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
+ inst = emit(BRW_OPCODE_BREAK);
+ inst->predicate = BRW_PREDICATE_NORMAL;
+
+ /* First we prepare the message header */
+ emit_urb_write_header(base_mrf);
+
+ /* Then add vertex data to the message in interleaved fashion */
+ int slot = 0;
+ bool complete = false;
+ do {
+ int mrf = base_mrf + 1;
+
+ /* URB offset is in URB row increments, and each of our MRFs is half
+ * of one of those, since we're doing interleaved writes.
+ */
+ int urb_offset = slot / 2;
+
+ for (; slot < prog_data->vue_map.num_slots; ++slot) {
+ int varying = prog_data->vue_map.slot_to_varying[slot];
+ current_annotation = output_reg_annotation[varying];
+
+ /* Compute offset of this slot for the current vertex
+ * in vertex_output
+ */
+ src_reg data(this->vertex_output);
+ data.reladdr = ralloc(mem_ctx, src_reg);
+ memcpy(data.reladdr, &this->vertex_output_offset,
+ sizeof(src_reg));
+
+ /* Copy this slot to the appropriate message register */
+ dst_reg reg = dst_reg(MRF, mrf);
+ reg.type = output_reg[varying][0].type;
+ data.type = reg.type;
+ vec4_instruction *inst = emit(MOV(reg, data));
+ inst->force_writemask_all = true;
+
+ mrf++;
+ emit(ADD(dst_reg(this->vertex_output_offset),
+ this->vertex_output_offset, brw_imm_ud(1u)));
+
+ /* If this was max_usable_mrf, we can't fit anything more into
+ * this URB WRITE. Same if we reached the max. message length.
+ */
+ if (mrf > max_usable_mrf ||
+ align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
+ slot++;
+ break;
+ }
+ }
+
+ complete = slot >= prog_data->vue_map.num_slots;
+ emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
+ } while (!complete);
+
+ /* Skip over the flags data item so that vertex_output_offset points
+ * to the first data item of the next vertex, so that we can start
+ * writing the next vertex.
+ */
+ emit(ADD(dst_reg(this->vertex_output_offset),
+ this->vertex_output_offset, brw_imm_ud(1u)));
+
+ emit(ADD(dst_reg(vertex), vertex, brw_imm_ud(1u)));
+ }
+ emit(BRW_OPCODE_WHILE);
+
+ if (prog->info.has_transform_feedback_varyings)
+ xfb_write();
+ }
+ emit(BRW_OPCODE_ENDIF);
+
+ /* Finally, emit EOT message.
+ *
+ * In gen6 we need to end the thread differently depending on whether we have
+ * emitted at least one vertex or not. In case we did, the EOT message must
+ * always include the COMPLETE flag or else the GPU hangs. If we have not
+ * produced any output we can't use the COMPLETE flag.
+ *
+ * However, this would lead us to end the program with an ENDIF opcode,
+ * which we want to avoid, so what we do is that we always request a new
+ * VUE handle every time we do a URB WRITE, even for the last vertex we emit.
+ * With this we make sure that whether we have emitted at least one vertex
+ * or none at all, we have to finish the thread without writing to the URB,
+ * which works for both cases by setting the COMPLETE and UNUSED flags in
+ * the EOT message.
+ */
+ this->current_annotation = "gen6 thread end: EOT";
+
+ if (prog->info.has_transform_feedback_varyings) {
+ /* When emitting EOT, set SONumPrimsWritten Increment Value. */
+ src_reg data(this, glsl_type::uint_type);
+ emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu)));
+ emit(SHL(dst_reg(data), data, brw_imm_ud(16u)));
+ emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
+ }
+
+ vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
+ inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
+ inst->base_mrf = base_mrf;
+ inst->mlen = 1;
+}
+
+/**
+ * Lay out the thread payload registers and record where each input
+ * attribute lives.
+ *
+ * Registers are handed out in order: r0, r1, then the uniforms, then the
+ * varying inputs.  The first register after all of those is stored in
+ * first_non_payload_grf.
+ */
+void
+gen6_gs_visitor::setup_payload()
+{
+   /* Inputs are interleaved, so each register carries two attribute slots. */
+   const int attributes_per_reg = 2;
+
+   /* Reading an input the vertex shader never wrote gives undefined results
+    * but must not crash.  A zero-filled map makes such reads land in r0.
+    */
+   int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
+   memset(attribute_map, 0, sizeof(attribute_map));
+
+   /* r0 always holds important payload data, so numbering continues at r1. */
+   int next_reg = 1;
+
+   /* r1 is always part of the payload; it carries transform feedback
+    * information when GEN6_GS_SVBI_PAYLOAD_ENABLE is set in 3DSTATE_GS.  It
+    * gets overwritten with the PrimitiveID (the original value is moved to a
+    * virtual register if necessary).
+    */
+   if (gs_prog_data->include_primitive_id)
+      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * next_reg;
+   next_reg++;
+
+   next_reg = setup_uniforms(next_reg);
+   next_reg = setup_varying_inputs(next_reg, attribute_map,
+                                   attributes_per_reg);
+
+   lower_attributes_to_hw_regs(attribute_map, true);
+
+   this->first_non_payload_grf = next_reg;
+}
+
+/**
+ * Copy the linked transform feedback outputs into gs_prog_data.
+ *
+ * For every output this records the output register it comes from and the
+ * swizzle that selects its components starting at ComponentOffset.
+ */
+void
+gen6_gs_visitor::xfb_setup()
+{
+   /* Indexed by ComponentOffset: the swizzle starts at that channel and
+    * repeats the last channel (W) to fill the remaining positions.
+    */
+   static const unsigned swizzle_for_offset[4] = {
+      BRW_SWIZZLE4(0, 1, 2, 3),
+      BRW_SWIZZLE4(1, 2, 3, 3),
+      BRW_SWIZZLE4(2, 3, 3, 3),
+      BRW_SWIZZLE4(3, 3, 3, 3)
+   };
+
+   const struct gl_transform_feedback_info *xfb_info =
+      this->prog->sh.LinkedTransformFeedback;
+
+   /* VUE slot numbers are stored in the unsigned chars of
+    * prog_data->transform_feedback_bindings[], so they must fit.
+    */
+   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);
+
+   /* One binding table entry per component was set aside for transform
+    * feedback, so the number of outputs should never exceed that budget.
+    */
+   assert(xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
+
+   gs_prog_data->num_transform_feedback_bindings = xfb_info->NumOutputs;
+
+   int binding = 0;
+   while (binding < gs_prog_data->num_transform_feedback_bindings) {
+      gs_prog_data->transform_feedback_bindings[binding] =
+         xfb_info->Outputs[binding].OutputRegister;
+      gs_prog_data->transform_feedback_swizzles[binding] =
+         swizzle_for_offset[xfb_info->Outputs[binding].ComponentOffset];
+      binding++;
+   }
+}
+
+/**
+ * Emit the instructions that write transform feedback data for the
+ * buffered vertices.
+ *
+ * Nothing is emitted when the program has no transform feedback bindings.
+ * Otherwise the vertex count per primitive is derived from the output
+ * topology, the output offset and primitive counter are reset, the
+ * destination indices are initialised (guarded by an SVBI overflow check),
+ * and one predicated xfb_program() block is emitted for every possible
+ * output vertex.
+ */
+void
+gen6_gs_visitor::xfb_write()
+{
+ unsigned num_verts;
+
+ if (!gs_prog_data->num_transform_feedback_bindings)
+ return;
+
+ /* Number of vertices that make up one streamed-out primitive. */
+ switch (gs_prog_data->output_topology) {
+ case _3DPRIM_POINTLIST:
+ num_verts = 1;
+ break;
+ case _3DPRIM_LINELIST:
+ case _3DPRIM_LINESTRIP:
+ case _3DPRIM_LINELOOP:
+ num_verts = 2;
+ break;
+ case _3DPRIM_TRILIST:
+ case _3DPRIM_TRIFAN:
+ case _3DPRIM_TRISTRIP:
+ case _3DPRIM_RECTLIST:
+ num_verts = 3;
+ break;
+ /* NOTE(review): quads and polygons also use 3 vertices here, presumably
+ * because they are streamed out as triangles -- confirm.
+ */
+ case _3DPRIM_QUADLIST:
+ case _3DPRIM_QUADSTRIP:
+ case _3DPRIM_POLYGON:
+ num_verts = 3;
+ break;
+ default:
+ unreachable("Unexpected primitive type in Gen6 SOL program.");
+ }
+
+ this->current_annotation = "gen6 thread end: svb writes init";
+
+ emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
+ emit(MOV(dst_reg(this->sol_prim_written), brw_imm_ud(0u)));
+
+ /* Check that at least one primitive can be written
+ *
+ * Note: since we use the binding table to keep track of buffer offsets
+ * and stride, the GS doesn't need to keep track of a separate pointer
+ * into each buffer; it uses a single pointer which increments by 1 for
+ * each vertex. So we use SVBI0 for this pointer, regardless of whether
+ * transform feedback is in interleaved or separate attribs mode.
+ */
+ src_reg sol_temp(this, glsl_type::uvec4_type);
+ emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts)));
+
+ /* Compare SVBI calculated number with the maximum value, which is
+ * in R1.4 (previously saved in this->max_svbi) for gen6.
+ */
+ emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
+ emit(IF(BRW_PREDICATE_NORMAL));
+ {
+ /* Start the destination indices at (0, 1, 2, 0) and offset them by the
+ * current SVBI value.
+ */
+ vec4_instruction *inst = emit(MOV(dst_reg(destination_indices),
+ brw_imm_vf4(brw_float_to_vf(0.0),
+ brw_float_to_vf(1.0),
+ brw_float_to_vf(2.0),
+ brw_float_to_vf(0.0))));
+ inst->force_writemask_all = true;
+
+ emit(ADD(dst_reg(this->destination_indices),
+ this->destination_indices,
+ this->svbi));
+ }
+ emit(BRW_OPCODE_ENDIF);
+
+ /* Write transform feedback data for all processed vertices. */
+ for (int i = 0; i < (int)nir->info->gs.vertices_out; i++) {
+ /* The loop is unrolled at compile time; each copy only runs when
+ * vertex i was actually emitted (i < vertex_count).
+ */
+ emit(MOV(dst_reg(sol_temp), brw_imm_d(i)));
+ emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
+ BRW_CONDITIONAL_L));
+ emit(IF(BRW_PREDICATE_NORMAL));
+ {
+ xfb_program(i, num_verts);
+ }
+ emit(BRW_OPCODE_ENDIF);
+ }
+}
+
+/**
+ * Emit the streamed-vertex-buffer writes for one buffered vertex.
+ *
+ * \param vertex     index of the vertex within the buffered output
+ * \param num_verts  number of vertices per primitive for the output topology
+ *
+ * The writes are wrapped in a check that the whole primitive still fits
+ * below max_svbi.  Within that guard, one SVB write is emitted per transform
+ * feedback binding.  After the last binding of the last vertex of a
+ * primitive, the destination indices and sol_prim_written are advanced.
+ */
+void
+gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
+{
+ unsigned binding;
+ unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings;
+ src_reg sol_temp(this, glsl_type::uvec4_type);
+
+ /* Check for buffer overflow: we need room to write the complete primitive
+ * (all vertices). Otherwise, avoid writing any vertices for it
+ */
+ /* sol_temp = (sol_prim_written + 1) * num_verts + svbi */
+ emit(ADD(dst_reg(sol_temp), this->sol_prim_written, brw_imm_ud(1u)));
+ emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts)));
+ emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
+ emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
+ emit(IF(BRW_PREDICATE_NORMAL));
+ {
+ /* Avoid overwriting MRF 1 as it is used as URB write message header */
+ dst_reg mrf_reg(MRF, 2);
+
+ this->current_annotation = "gen6: emit SOL vertex data";
+ /* For each vertex, generate code to output each varying using the
+ * appropriate binding table entry.
+ */
+ for (binding = 0; binding < num_bindings; ++binding) {
+ unsigned char varying =
+ gs_prog_data->transform_feedback_bindings[binding];
+
+ /* Set up the correct destination index for this vertex */
+ vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
+ mrf_reg,
+ this->destination_indices);
+ inst->sol_vertex = vertex % num_verts;
+
+ /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
+ *
+ * "Prior to End of Thread with a URB_WRITE, the kernel must
+ * ensure that all writes are complete by sending the final
+ * write as a committed write."
+ */
+ /* True only for the last binding of the last vertex of a primitive. */
+ bool final_write = binding == (unsigned) num_bindings - 1 &&
+ inst->sol_vertex == num_verts - 1;
+
+ /* Compute offset of this varying for the current vertex
+ * in vertex_output
+ */
+ this->current_annotation = output_reg_annotation[varying];
+ src_reg data(this->vertex_output);
+ data.reladdr = ralloc(mem_ctx, src_reg);
+ int offset = get_vertex_output_offset_for_varying(vertex, varying);
+ emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset)));
+ /* reladdr gets its own copy of the offset register description. */
+ memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
+ data.type = output_reg[varying][0].type;
+
+ /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
+ * same slot, so make sure we write the appropriate channel
+ */
+ if (varying == VARYING_SLOT_PSIZ)
+ data.swizzle = BRW_SWIZZLE_WWWW;
+ else if (varying == VARYING_SLOT_LAYER)
+ data.swizzle = BRW_SWIZZLE_YYYY;
+ else if (varying == VARYING_SLOT_VIEWPORT)
+ data.swizzle = BRW_SWIZZLE_ZZZZ;
+ else
+ data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];
+
+ /* Write data */
+ inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
+ inst->sol_binding = binding;
+ inst->sol_final_write = final_write;
+
+ if (final_write) {
+ /* This is the last vertex of the primitive, then increment
+ * SO num primitive counter and destination indices.
+ */
+ emit(ADD(dst_reg(this->destination_indices),
+ this->destination_indices,
+ brw_imm_ud(num_verts)));
+ emit(ADD(dst_reg(this->sol_prim_written),
+ this->sol_prim_written, brw_imm_ud(1u)));
+ }
+
+ }
+ this->current_annotation = NULL;
+ }
+ emit(BRW_OPCODE_ENDIF);
+}
+
+/**
+ * Return the index into vertex_output of the given varying for the given
+ * buffered vertex.
+ *
+ * \param vertex   index of the buffered vertex
+ * \param varying  varying slot to look up
+ */
+int
+gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
+{
+   /* VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same
+    * slot as VARYING_SLOT_PSIZ, so look them up through PSIZ.
+    */
+   const bool packed_with_psiz = varying == VARYING_SLOT_LAYER ||
+                                 varying == VARYING_SLOT_VIEWPORT;
+   const int lookup_varying = packed_with_psiz ? (int) VARYING_SLOT_PSIZ
+                                               : varying;
+
+   int slot = prog_data->vue_map.varying_to_slot[lookup_varying];
+
+   /* A negative slot means the varying is not in the VUE: it is never
+    * written and its value is undefined.  Any in-bounds offset is acceptable
+    * in that case; slot 0 keeps the access inside vertex_output.
+    */
+   if (slot < 0)
+      slot = 0;
+
+   /* Each vertex occupies num_slots entries plus one extra entry. */
+   const int entries_per_vertex = prog_data->vue_map.num_slots + 1;
+
+   return vertex * entries_per_vertex + slot;
+}
+
+} /* namespace brw */
diff --git a/src/intel/compiler/gen6_gs_visitor.h b/src/intel/compiler/gen6_gs_visitor.h
new file mode 100644
index 00000000000..1bdcf925880
--- /dev/null
+++ b/src/intel/compiler/gen6_gs_visitor.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef GEN6_GS_VISITOR_H
+#define GEN6_GS_VISITOR_H
+
+#include "brw_vec4.h"
+#include "brw_vec4_gs_visitor.h"
+
+#ifdef __cplusplus
+
+namespace brw {
+
+/**
+ * Geometry shader visitor for gen6, built on top of vec4_gs_visitor.
+ *
+ * It overrides the vertex emission, thread-end and URB write hooks of the
+ * base class and adds the transform feedback (xfb_*) helpers.
+ */
+class gen6_gs_visitor : public vec4_gs_visitor
+{
+public:
+ /* All arguments except \p prog are forwarded unchanged to the
+ * vec4_gs_visitor constructor; \p prog is kept for transform feedback
+ * queries.
+ */
+ gen6_gs_visitor(const struct brw_compiler *comp,
+ void *log_data,
+ struct brw_gs_compile *c,
+ struct brw_gs_prog_data *prog_data,
+ struct gl_program *prog,
+ const nir_shader *shader,
+ void *mem_ctx,
+ bool no_spills,
+ int shader_time_index) :
+ vec4_gs_visitor(comp, log_data, c, prog_data, shader, mem_ctx, no_spills,
+ shader_time_index),
+ prog(prog)
+ {
+ }
+
+protected:
+ virtual void emit_prolog();
+ virtual void emit_thread_end();
+ virtual void gs_emit_vertex(int stream_id);
+ virtual void gs_end_primitive();
+ virtual void emit_urb_write_header(int mrf);
+ virtual void emit_urb_write_opcode(bool complete,
+ int base_mrf,
+ int last_mrf,
+ int urb_offset);
+ virtual void setup_payload();
+
+private:
+ /* Transform feedback helpers. */
+ void xfb_write();
+ void xfb_program(unsigned vertex, unsigned num_verts);
+ void xfb_setup();
+ /* Index into vertex_output of a varying for a given buffered vertex. */
+ int get_vertex_output_offset_for_varying(int vertex, int varying);
+
+ /* Program whose has_transform_feedback_varyings flag and
+ * LinkedTransformFeedback info are consulted.
+ */
+ const struct gl_program *prog;
+
+ /* Buffered vertex data, read through a relative address. */
+ src_reg vertex_output;
+ /* Relative address (index) used when reading vertex_output. */
+ src_reg vertex_output_offset;
+ /* Destination of the FF_SYNC message. */
+ src_reg temp;
+ src_reg first_vertex;
+ /* Primitive count passed to FF_SYNC. */
+ src_reg prim_count;
+ src_reg primitive_id;
+
+ /* Transform Feedback members */
+ /* Number of primitives written so far by the SOL code. */
+ src_reg sol_prim_written;
+ /* Current SVBI value and the maximum it is compared against. */
+ src_reg svbi;
+ src_reg max_svbi;
+ /* Destination indices for the SVB writes of the current primitive. */
+ src_reg destination_indices;
+};
+
+} /* namespace brw */
+
+#endif /* __cplusplus */
+
+#endif /* GEN6_GS_VISITOR_H */
diff --git a/src/intel/compiler/intel_asm_annotation.c b/src/intel/compiler/intel_asm_annotation.c
new file mode 100644
index 00000000000..1f3b78476e3
--- /dev/null
+++ b/src/intel/compiler/intel_asm_annotation.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_cfg.h"
+#include "brw_eu.h"
+#include "common/gen_debug.h"
+#include "intel_asm_annotation.h"
+#include "compiler/nir/nir.h"
+
+/**
+ * Print the annotated disassembly of a program to stderr.
+ *
+ * \param assembly         start of the assembled program
+ * \param num_annotations  number of annotation records to walk
+ * \param annotation       annotation array; entry num_annotations must also
+ *                         be readable because its offset ends the last group
+ * \param devinfo          device description passed to the disassembler
+ *
+ * For every record this prints the basic block start (with predecessors and
+ * cycle count), the IR and annotation string when they change, the
+ * disassembled instructions, any attached error text, and the basic block
+ * end (with successors).
+ */
+void
+dump_assembly(void *assembly, int num_annotations, struct annotation *annotation,
+              const struct gen_device_info *devinfo)
+{
+   const void *prev_ir = NULL;
+   const char *prev_string = NULL;
+
+   for (int i = 0; i < num_annotations; i++) {
+      struct annotation *group = &annotation[i];
+
+      /* A group runs from its own offset to the next record's offset. */
+      const int start_offset = group->offset;
+      const int end_offset = annotation[i + 1].offset;
+
+      if (group->block_start) {
+         fprintf(stderr, " START B%d", group->block_start->num);
+         foreach_list_typed(struct bblock_link, parent_link, link,
+                            &group->block_start->parents) {
+            struct bblock_t *parent = parent_link->block;
+            fprintf(stderr, " <-B%d", parent->num);
+         }
+         fprintf(stderr, " (%u cycles)\n", group->block_start->cycle_count);
+      }
+
+      /* Only print the IR instruction when it differs from the last one. */
+      if (prev_ir != group->ir) {
+         prev_ir = group->ir;
+         if (prev_ir) {
+            fprintf(stderr, " ");
+            nir_print_instr(group->ir, stderr);
+            fprintf(stderr, "\n");
+         }
+      }
+
+      /* Likewise for the free-form annotation string. */
+      if (prev_string != group->annotation) {
+         prev_string = group->annotation;
+         if (prev_string)
+            fprintf(stderr, " %s\n", prev_string);
+      }
+
+      brw_disassemble(devinfo, assembly, start_offset, end_offset, stderr);
+
+      if (group->error)
+         fputs(group->error, stderr);
+
+      if (group->block_end) {
+         fprintf(stderr, " END B%d", group->block_end->num);
+         foreach_list_typed(struct bblock_link, child_link, link,
+                            &group->block_end->children) {
+            struct bblock_t *child = child_link->block;
+            fprintf(stderr, " ->B%d", child->num);
+         }
+         fprintf(stderr, "\n");
+      }
+   }
+   fprintf(stderr, "\n");
+}
+
+/**
+ * Make sure the annotation array has room for at least one more record.
+ *
+ * When the array is full it is grown to max(1024, 2 * current size) and the
+ * newly added tail is zeroed.
+ *
+ * \return false if the reallocation failed.  In that case the existing
+ *         array and its recorded size are left untouched, so the records
+ *         already stored stay valid.
+ */
+static bool
+annotation_array_ensure_space(struct annotation_info *annotation)
+{
+   if (annotation->ann_size <= annotation->ann_count) {
+      const int old_size = annotation->ann_size;
+      const int new_size = MAX2(1024, old_size * 2);
+
+      /* Commit the new pointer and size only once the allocation succeeded;
+       * otherwise a later call would see the enlarged size and dereference
+       * a NULL array.
+       */
+      struct annotation *grown = reralloc(annotation->mem_ctx,
+                                          annotation->ann,
+                                          struct annotation, new_size);
+      if (!grown)
+         return false;
+
+      memset(grown + old_size, 0,
+             (new_size - old_size) * sizeof(struct annotation));
+
+      annotation->ann = grown;
+      annotation->ann_size = new_size;
+   }
+
+   return true;
+}
+
+/**
+ * Append an annotation record for \p inst, which is about to be assembled
+ * at byte \p offset.
+ *
+ * The record's IR pointer and annotation string are only filled in when
+ * DEBUG_ANNOTATION is enabled.  Basic block start/end pointers are set when
+ * \p inst is the first/last instruction of the current block of \p cfg.
+ * The function returns silently if the array cannot be grown.
+ */
+void annotate(const struct gen_device_info *devinfo,
+ struct annotation_info *annotation, const struct cfg_t *cfg,
+ struct backend_instruction *inst, unsigned offset)
+{
+ /* The ralloc context is created lazily on first use. */
+ if (annotation->mem_ctx == NULL)
+ annotation->mem_ctx = ralloc_context(NULL);
+
+ if (!annotation_array_ensure_space(annotation))
+ return;
+
+ struct annotation *ann = &annotation->ann[annotation->ann_count++];
+ ann->offset = offset;
+ if ((INTEL_DEBUG & DEBUG_ANNOTATION) != 0) {
+ ann->ir = inst->ir;
+ ann->annotation = inst->annotation;
+ }
+
+ if (bblock_start(cfg->blocks[annotation->cur_block]) == inst) {
+ ann->block_start = cfg->blocks[annotation->cur_block];
+ }
+
+ /* There is no hardware DO instruction on Gen6+, so since DO always
+ * starts a basic block, we need to set the .block_start of the next
+ * instruction's annotation with a pointer to the bblock started by
+ * the DO.
+ *
+ * There's also the complication of emitting an annotation without
+ * a corresponding hardware instruction to disassemble.  Dropping the
+ * count again makes the next call reuse this slot, which keeps the
+ * .block_start set above.
+ */
+ if (devinfo->gen >= 6 && inst->opcode == BRW_OPCODE_DO) {
+ annotation->ann_count--;
+ }
+
+ /* Leaving the current block: later instructions belong to the next one. */
+ if (bblock_end(cfg->blocks[annotation->cur_block]) == inst) {
+ ann->block_end = cfg->blocks[annotation->cur_block];
+ annotation->cur_block++;
+ }
+}
+
+/**
+ * Terminate the annotation array with a sentinel record.
+ *
+ * The sentinel lives at index ann_count and only carries the offset just
+ * past the last instruction, so that every real record can find its end
+ * offset in the record that follows it.
+ *
+ * \param next_inst_offset  byte offset following the last instruction
+ *
+ * Nothing is done when there are no annotations.  If the array is full and
+ * cannot be grown, it is left unchanged and no sentinel is written.
+ */
+void
+annotation_finalize(struct annotation_info *annotation,
+                    unsigned next_inst_offset)
+{
+   if (!annotation->ann_count)
+      return;
+
+   if (annotation->ann_count == annotation->ann_size) {
+      struct annotation *grown = reralloc(annotation->mem_ctx,
+                                          annotation->ann,
+                                          struct annotation,
+                                          annotation->ann_size + 1);
+      if (!grown)
+         return;
+
+      /* Zero the new slot and record the new capacity.  Without the size
+       * update a later grow would treat the sentinel slot as fresh space
+       * and wipe it.
+       */
+      memset(grown + annotation->ann_size, 0, sizeof(struct annotation));
+      annotation->ann = grown;
+      annotation->ann_size++;
+   }
+
+   annotation->ann[annotation->ann_count].offset = next_inst_offset;
+}
+
+/**
+ * Attach an error message to the instruction at byte \p offset.
+ *
+ * The record whose range contains \p offset is located.  If the erroneous
+ * instruction is not the last one in that record, the record is split right
+ * after the instruction so the message is printed directly below it.  The
+ * message is appended to any error text already present.
+ *
+ * The function returns without doing anything when there are no annotations
+ * or when the array cannot be grown.
+ */
+void
+annotation_insert_error(struct annotation_info *annotation, unsigned offset,
+                        const char *error)
+{
+   struct annotation *ann;
+
+   if (!annotation->ann_count)
+      return;
+
+   /* We may have to split an annotation, so ensure we have enough space
+    * allocated for that case up front.
+    */
+   if (!annotation_array_ensure_space(annotation))
+      return;
+
+   /* A split shifts records [i, ann_count] up by one slot; the record at
+    * ann_count is the end sentinel.  That needs ann_count + 2 slots, one
+    * more than the call above guarantees.
+    */
+   if (annotation->ann_size < annotation->ann_count + 2) {
+      const int old_size = annotation->ann_size;
+      const int new_size = MAX2(annotation->ann_count + 2, old_size * 2);
+      struct annotation *grown = reralloc(annotation->mem_ctx,
+                                          annotation->ann,
+                                          struct annotation, new_size);
+      if (!grown)
+         return;
+
+      memset(grown + old_size, 0,
+             (new_size - old_size) * sizeof(struct annotation));
+      annotation->ann = grown;
+      annotation->ann_size = new_size;
+   }
+
+   assume(annotation->ann_count > 0);
+
+   for (int i = 0; i < annotation->ann_count; i++) {
+      struct annotation *cur = &annotation->ann[i];
+      struct annotation *next = &annotation->ann[i + 1];
+      ann = cur;
+
+      /* The offset lies beyond this record; keep looking. */
+      if (next->offset <= offset)
+         continue;
+
+      if (offset + sizeof(brw_inst) != next->offset) {
+         /* Move records i..ann_count (sentinel included) up by one. */
+         memmove(next, cur,
+                 (annotation->ann_count - i + 1) * sizeof(struct annotation));
+         cur->error = NULL;
+         cur->error_length = 0;
+         cur->block_end = NULL;
+         next->offset = offset + sizeof(brw_inst);
+         next->block_start = NULL;
+         annotation->ann_count++;
+      }
+      break;
+   }
+
+   if (ann->error)
+      ralloc_strcat(&ann->error, error);
+   else
+      ann->error = ralloc_strdup(annotation->mem_ctx, error);
+}
diff --git a/src/intel/compiler/intel_asm_annotation.h b/src/intel/compiler/intel_asm_annotation.h
new file mode 100644
index 00000000000..2d905b10a96
--- /dev/null
+++ b/src/intel/compiler/intel_asm_annotation.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _INTEL_ASM_ANNOTATION_H
+#define _INTEL_ASM_ANNOTATION_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct backend_instruction;
+struct cfg_t;
+
+/* One record of the annotated disassembly: a group of consecutive
+ * instructions that share the same annotation data.
+ */
+struct annotation {
+ /* Byte offset of the first instruction of the group; the group ends where
+ * the next record's offset begins.
+ */
+ int offset;
+
+ /* Error text attached to the group (NULL when there is none). */
+ size_t error_length;
+ char *error;
+
+ /* Pointers to the basic block in the CFG if the instruction group starts
+ * or ends a basic block.
+ */
+ struct bblock_t *block_start;
+ struct bblock_t *block_end;
+
+ /* Annotation for the generated IR. One of the two can be set. */
+ const void *ir;
+ const char *annotation;
+};
+
+/* Growable array of annotation records plus bookkeeping. */
+struct annotation_info {
+ /* ralloc context that owns the array and the error strings. */
+ void *mem_ctx;
+ /* The records; ann_count are in use, ann_size are allocated. */
+ struct annotation *ann;
+ int ann_count;
+ int ann_size;
+
+ /** Block index in the cfg. */
+ int cur_block;
+};
+
+/* Print the annotated disassembly of a program to stderr. */
+void
+dump_assembly(void *assembly, int num_annotations, struct annotation *annotation,
+ const struct gen_device_info *devinfo);
+
+/* Append an annotation record for an instruction located at \p offset. */
+void
+annotate(const struct gen_device_info *devinfo,
+ struct annotation_info *annotation, const struct cfg_t *cfg,
+ struct backend_instruction *inst, unsigned offset);
+/* Store \p offset in the sentinel record that follows the last annotation. */
+void
+annotation_finalize(struct annotation_info *annotation, unsigned offset);
+
+/* Attach an error message to the instruction located at \p offset. */
+void
+annotation_insert_error(struct annotation_info *annotation, unsigned offset,
+ const char *error);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* _INTEL_ASM_ANNOTATION_H */
diff --git a/src/intel/compiler/test_eu_compact.c b/src/intel/compiler/test_eu_compact.c
new file mode 100644
index 00000000000..77a57f4aa65
--- /dev/null
+++ b/src/intel/compiler/test_eu_compact.c
@@ -0,0 +1,300 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include "util/ralloc.h"
+#include "brw_eu.h"
+
+/**
+ * Check that compaction of \p src is consistent.
+ *
+ * If the instruction compacts, uncompacting it must reproduce \p src
+ * exactly.  If it does not compact, the destination buffer must be left
+ * untouched.
+ *
+ * \return true when the check passes.
+ */
+static bool
+test_compact_instruction(struct brw_codegen *p, brw_inst src)
+{
+   /* Pre-fill the destination with a known pattern so stray writes show. */
+   brw_compact_inst dst;
+   memset(&dst, 0xd0, sizeof(dst));
+
+   if (!brw_try_compact_instruction(p->devinfo, &dst, &src)) {
+      /* It's not supposed to change dst unless it compacted. */
+      brw_compact_inst pristine;
+      memset(&pristine, 0xd0, sizeof(pristine));
+
+      if (memcmp(&pristine, &dst, sizeof(dst)) == 0)
+         return true;
+
+      fprintf(stderr, "Failed to compact, but dst changed\n");
+      fprintf(stderr, " Instruction: ");
+      brw_disassemble_inst(stderr, p->devinfo, &src, false);
+      return false;
+   }
+
+   /* Compaction succeeded: the round trip must give back the original. */
+   brw_inst round_trip;
+   brw_uncompact_instruction(p->devinfo, &round_trip, &dst);
+
+   if (memcmp(&round_trip, &src, sizeof(src)) == 0)
+      return true;
+
+   brw_debug_compact_uncompact(p->devinfo, &src, &round_trip);
+   return false;
+}
+
+/**
+ * When doing fuzz testing, pad bits won't round-trip.
+ *
+ * This is a sort of superset of skip_bit(), which filters out bits that are
+ * not worth flipping during fuzzing.  Here we additionally clear bits that
+ * become meaningless once fuzzing has twiddled a related bit.
+ */
+static void
+clear_pad_bits(const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   /* These opcodes use the upper bits, so leave them alone. */
+   switch (brw_inst_opcode(devinfo, inst)) {
+   case BRW_OPCODE_SEND:
+   case BRW_OPCODE_SENDC:
+   case BRW_OPCODE_BREAK:
+   case BRW_OPCODE_CONTINUE:
+      return;
+   default:
+      break;
+   }
+
+   /* The same goes for instructions with an immediate source. */
+   if (brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE ||
+       brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE)
+      return;
+
+   brw_inst_set_bits(inst, 127, 111, 0);
+}
+
+/**
+ * Tell whether flipping \p bit of \p src should be skipped during fuzzing.
+ *
+ * \return true for pad bits and for the compaction control bit.
+ */
+static bool
+skip_bit(const struct gen_device_info *devinfo, brw_inst *src, int bit)
+{
+   switch (bit) {
+   case 7:  /* pad bit */
+   case 29: /* The compact bit -- uncompacted can't have it set. */
+   case 47: /* pad bit */
+      return true;
+   default:
+      break;
+   }
+
+   /* pad bits */
+   if (bit >= 90 && bit <= 95)
+      return true;
+
+   /* Bits from 121 upwards are sometimes pad bits; below that we are done. */
+   if (bit < 121)
+      return false;
+
+   switch (brw_inst_opcode(devinfo, src)) {
+   case BRW_OPCODE_SEND:
+   case BRW_OPCODE_SENDC:
+   case BRW_OPCODE_BREAK:
+   case BRW_OPCODE_CONTINUE:
+      return false;
+   default:
+      break;
+   }
+
+   return brw_inst_src0_reg_file(devinfo, src) != BRW_IMMEDIATE_VALUE &&
+          brw_inst_src1_reg_file(devinfo, src) != BRW_IMMEDIATE_VALUE;
+}
+
+/**
+ * Fuzz the compaction of \p src by flipping every pair of non-skipped bits.
+ *
+ * Each mutated instruction has its pad bits cleared and is then run through
+ * test_compact_instruction().
+ *
+ * \return false as soon as one mutation fails, true otherwise.
+ */
+static bool
+test_fuzz_compact_instruction(struct brw_codegen *p, brw_inst src)
+{
+   for (int bit0 = 0; bit0 < 128; bit0++) {
+      if (skip_bit(p->devinfo, &src, bit0))
+         continue;
+
+      for (int bit1 = 0; bit1 < 128; bit1++) {
+         brw_inst instr = src;
+         uint32_t *bits = (uint32_t *)&instr;
+
+         if (skip_bit(p->devinfo, &src, bit1))
+            continue;
+
+         /* Use an unsigned one: shifting a signed 1 into bit 31 is
+          * undefined behaviour.
+          */
+         bits[bit0 / 32] ^= (1u << (bit0 & 31));
+         bits[bit1 / 32] ^= (1u << (bit1 & 31));
+
+         clear_pad_bits(p->devinfo, &instr);
+
+         if (!test_compact_instruction(p, instr)) {
+            printf(" twiddled bits for fuzzing %d, %d\n", bit0, bit1);
+            return false;
+         }
+      }
+   }
+
+   return true;
+}
+
+/* Emit "ADD g0, g2, g4" on 8-wide GRF registers. */
+static void
+gen_ADD_GRF_GRF_GRF(struct brw_codegen *p)
+{
+   struct brw_reg dest = brw_vec8_grf(0, 0);
+   struct brw_reg src0 = brw_vec8_grf(2, 0);
+   struct brw_reg src1 = brw_vec8_grf(4, 0);
+
+   brw_ADD(p, dest, src0, src1);
+}
+
+/* Emit "ADD g0, g2, 1.0f" with a float immediate as second source. */
+static void
+gen_ADD_GRF_GRF_IMM(struct brw_codegen *p)
+{
+   struct brw_reg dest = brw_vec8_grf(0, 0);
+   struct brw_reg src0 = brw_vec8_grf(2, 0);
+
+   brw_ADD(p, dest, src0, brw_imm_f(1.0));
+}
+
+/* Emit "ADD g0, g2, 1" on D-typed registers with a D immediate. */
+static void
+gen_ADD_GRF_GRF_IMM_d(struct brw_codegen *p)
+{
+   struct brw_reg dest = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_D);
+   struct brw_reg src0 = retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_D);
+
+   brw_ADD(p, dest, src0, brw_imm_d(1));
+}
+
+/* Emit "MOV g0, g2" on 8-wide GRF registers. */
+static void
+gen_MOV_GRF_GRF(struct brw_codegen *p)
+{
+   struct brw_reg dest = brw_vec8_grf(0, 0);
+   struct brw_reg source = brw_vec8_grf(2, 0);
+
+   brw_MOV(p, dest, source);
+}
+
+/* Emit "ADD m6, g2, g4": message register destination, GRF sources. */
+static void
+gen_ADD_MRF_GRF_GRF(struct brw_codegen *p)
+{
+   struct brw_reg dest = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 6, 0);
+   struct brw_reg src0 = brw_vec8_grf(2, 0);
+   struct brw_reg src1 = brw_vec8_grf(4, 0);
+
+   brw_ADD(p, dest, src0, src1);
+}
+
+/* Emit "ADD g0, g2, g4" using single-element (vec1) GRF regions. */
+static void
+gen_ADD_vec1_GRF_GRF_GRF(struct brw_codegen *p)
+{
+   struct brw_reg dest = brw_vec1_grf(0, 0);
+   struct brw_reg src0 = brw_vec1_grf(2, 0);
+   struct brw_reg src1 = brw_vec1_grf(4, 0);
+
+   brw_ADD(p, dest, src0, src1);
+}
+
+/* Emit "PLN m6, g2<vec1>, g4": MRF destination, vec1 and vec8 sources. */
+static void
+gen_PLN_MRF_GRF_GRF(struct brw_codegen *p)
+{
+   struct brw_reg dest = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 6, 0);
+   struct brw_reg plane = brw_vec1_grf(2, 0);
+   struct brw_reg coords = brw_vec8_grf(4, 0);
+
+   brw_PLN(p, dest, plane, coords);
+}
+
+/* Emit a predicated "MOV g0, g2"; the flag subregister is left at its
+ * default.  The instruction state is saved and restored around it.
+ */
+static void
+gen_f0_0_MOV_GRF_GRF(struct brw_codegen *p)
+{
+   struct brw_reg dest = brw_vec8_grf(0, 0);
+   struct brw_reg source = brw_vec8_grf(2, 0);
+
+   brw_push_insn_state(p);
+   brw_set_default_predicate_control(p, true);
+   brw_MOV(p, dest, source);
+   brw_pop_insn_state(p);
+}
+
+/* The handling of f0.1 vs f0.0 changes between gen6 and gen7.  Test it
+ * explicitly, so that the fuzzing can run over all the other bits that
+ * might interact with it.
+ */
+static void
+gen_f0_1_MOV_GRF_GRF(struct brw_codegen *p)
+{
+   struct brw_reg dest = brw_vec8_grf(0, 0);
+   struct brw_reg source = brw_vec8_grf(2, 0);
+
+   brw_push_insn_state(p);
+   brw_set_default_predicate_control(p, true);
+
+   /* Same predicated MOV as the f0.0 test, but using flag subregister 1. */
+   brw_inst *insn = brw_MOV(p, dest, source);
+   brw_inst_set_flag_subreg_nr(p->devinfo, insn, 1);
+
+   brw_pop_insn_state(p);
+}
+
+/* Table of instruction generators exercised by run_tests().  Each entry
+ * is expected to emit exactly one instruction into the codegen context.
+ */
+struct {
+ void (*func)(struct brw_codegen *p);
+} tests[] = {
+ { gen_MOV_GRF_GRF },
+ { gen_ADD_GRF_GRF_GRF },
+ { gen_ADD_GRF_GRF_IMM },
+ { gen_ADD_GRF_GRF_IMM_d },
+ { gen_ADD_MRF_GRF_GRF },
+ { gen_ADD_vec1_GRF_GRF_GRF },
+ { gen_PLN_MRF_GRF_GRF },
+ { gen_f0_0_MOV_GRF_GRF },
+ { gen_f0_1_MOV_GRF_GRF },
+};
+
+/**
+ * Run every generator in tests[] in both Align1 and Align16 access mode.
+ *
+ * Each generated instruction goes through the plain compaction check and,
+ * if that passes, through the fuzzing check.
+ *
+ * \return true if any test failed.
+ */
+static bool
+run_tests(const struct gen_device_info *devinfo)
+{
+   brw_init_compaction_tables(devinfo);
+   bool fail = false;
+
+   for (int i = 0; i < ARRAY_SIZE(tests); i++) {
+      for (int align_16 = 0; align_16 <= 1; align_16++) {
+         struct brw_codegen *p = rzalloc(NULL, struct brw_codegen);
+         brw_init_codegen(devinfo, p, p);
+
+         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+         if (align_16)
+            brw_set_default_access_mode(p, BRW_ALIGN_16);
+         else
+            brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         tests[i].func(p);
+         assert(p->nr_insn == 1);
+
+         /* The fuzz pass only runs when the plain check succeeded. */
+         if (!test_compact_instruction(p, p->store[0]) ||
+             !test_fuzz_compact_instruction(p, p->store[0]))
+            fail = true;
+
+         /* Release the context on every path, including failures. */
+         ralloc_free(p);
+      }
+   }
+
+   return fail;
+}
+
+/**
+ * Run the compaction tests for gen6 and gen7.
+ *
+ * \return 0 when every test passed, 1 on any failure (including failure to
+ *         allocate the device info).
+ */
+int
+main(int argc, char **argv)
+{
+   struct gen_device_info *devinfo = calloc(1, sizeof(*devinfo));
+   if (!devinfo) {
+      fprintf(stderr, "Failed to allocate device info\n");
+      return 1;
+   }
+
+   bool fail = false;
+
+   for (devinfo->gen = 6; devinfo->gen <= 7; devinfo->gen++) {
+      fail |= run_tests(devinfo);
+   }
+
+   free(devinfo);
+
+   return fail;
+}
diff --git a/src/intel/compiler/test_eu_validate.cpp b/src/intel/compiler/test_eu_validate.cpp
new file mode 100644
index 00000000000..76652dc43d0
--- /dev/null
+++ b/src/intel/compiler/test_eu_validate.cpp
@@ -0,0 +1,847 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+#include "brw_eu.h"
+#include "util/ralloc.h"
+
+enum subgen { /* Platform variant within a generation; validation_test::SetUp() maps each value to one devinfo.is_* flag. */
+ IS_G45 = 1, /* numbering starts at 1, so a zero-initialized gen_info::subgen matches no variant */
+ IS_BYT,
+ IS_HSW,
+ IS_CHV,
+ IS_BXT,
+ IS_KBL,
+};
+
+static const struct gen_info { /* One test parameter: a platform that every validation_test case is instantiated for. */
+ const char *name; /* short platform name; gen_name returns it as the generated test-name suffix */
+ int gen; /* copied into devinfo.gen by validation_test::SetUp() */
+ enum subgen subgen; /* omitted (zero-initialized) for entries without a variant flag */
+} gens[] = {
+ { "brw", 4 },
+ { "g45", 4, IS_G45 },
+ { "ilk", 5 },
+ { "snb", 6 },
+ { "ivb", 7 },
+ { "byt", 7, IS_BYT },
+ { "hsw", 7, IS_HSW },
+ { "bdw", 8 },
+ { "chv", 8, IS_CHV },
+ { "skl", 9 },
+ { "bxt", 9, IS_BXT },
+ { "kbl", 9, IS_KBL },
+};
+
+/**
+ * Fixture for the EU validator tests, parameterized over a gen_info entry.
+ *
+ * The constructor allocates the codegen context and zeroes the device
+ * description; SetUp() fills the description in from the test parameter and
+ * initialises the codegen context with it; the destructor frees the context.
+ */
+class validation_test: public ::testing::TestWithParam<struct gen_info> {
+   virtual void SetUp();
+
+public:
+   validation_test();
+   virtual ~validation_test();
+
+   struct brw_codegen *p;
+   struct gen_device_info devinfo;
+};
+
+validation_test::validation_test()
+   : p(rzalloc(NULL, struct brw_codegen))
+{
+   memset(&devinfo, 0, sizeof(devinfo));
+}
+
+validation_test::~validation_test()
+{
+   ralloc_free(p);
+}
+
+void validation_test::SetUp()
+{
+   const struct gen_info &param = GetParam();
+
+   devinfo.gen = param.gen;
+
+   /* At most one of the variant flags ends up set. */
+   devinfo.is_g4x        = (param.subgen == IS_G45);
+   devinfo.is_baytrail   = (param.subgen == IS_BYT);
+   devinfo.is_haswell    = (param.subgen == IS_HSW);
+   devinfo.is_cherryview = (param.subgen == IS_CHV);
+   devinfo.is_broxton    = (param.subgen == IS_BXT);
+   devinfo.is_kabylake   = (param.subgen == IS_KBL);
+
+   /* The context is also passed as the third argument, as in the original. */
+   brw_init_codegen(&devinfo, p, p);
+}
+
+/* Name generator for the parameterized tests: returns the platform's short
+ * name from the test parameter.
+ */
+struct gen_name {
+   template <class ParamType>
+   std::string
+   operator()(const ::testing::TestParamInfo<ParamType>& test_info) const {
+      return std::string(test_info.param.name);
+   }
+};
+
+/* Instantiate every validation_test case once per entry in gens[]. */
+INSTANTIATE_TEST_CASE_P(eu_assembly, validation_test,
+                        ::testing::ValuesIn(gens), gen_name());
+
+/**
+ * Run brw_validate_instructions() over the instructions emitted into p.
+ *
+ * When the TEST_DEBUG environment variable is set, a small annotation array
+ * is allocated first and the assembly is dumped after validation.
+ *
+ * \return the result of brw_validate_instructions().
+ */
+static bool
+validate(struct brw_codegen *p)
+{
+   const bool verbose = getenv("TEST_DEBUG") != NULL;
+   struct annotation_info ann_info;
+   memset(&ann_info, 0, sizeof(ann_info));
+
+   if (verbose) {
+      ann_info.mem_ctx = ralloc_context(NULL);
+      ann_info.ann_count = 1;
+      ann_info.ann_size = 2;
+      ann_info.ann = rzalloc_array(ann_info.mem_ctx, struct annotation,
+                                   ann_info.ann_size);
+      /* The entry just past the counted ones records the end of the code. */
+      ann_info.ann[ann_info.ann_count].offset = p->next_insn_offset;
+   }
+
+   const bool valid = brw_validate_instructions(p, 0, &ann_info);
+
+   if (verbose) {
+      dump_assembly(p->store, ann_info.ann_count, ann_info.ann, p->devinfo);
+      ralloc_free(ann_info.mem_ctx);
+   }
+
+   return valid;
+}
+
+#define last_inst (&p->store[p->nr_insn - 1]) /* most recently emitted instruction; needs a codegen pointer named p in scope */
+#define g0 brw_vec8_grf(0, 0) /* shorthand register operand used by most tests */
+#define null brw_null_reg() /* shorthand for the null register operand */
+
+/* Reset the instruction count and the next-instruction offset of p to zero,
+ * so that a test can emit a fresh instruction for its next case.
+ */
+static void
+clear_instructions(struct brw_codegen *p)
+{
+   p->nr_insn = 0;
+   p->next_insn_offset = 0;
+}
+
+/* Baseline: an unmodified ADD g0, g0, g0 is expected to pass validation. */
+TEST_P(validation_test, sanity)
+{
+   brw_ADD(p, g0, g0, g0);
+
+   const bool valid = validate(p);
+   EXPECT_TRUE(valid);
+}
+
+/* A MOV whose only source is the null register is expected to be rejected. */
+TEST_P(validation_test, src0_null_reg)
+{
+   brw_MOV(p, g0, null);
+
+   const bool valid = validate(p);
+   EXPECT_FALSE(valid);
+}
+
+/* An ADD whose second source is the null register is expected to be
+ * rejected.
+ */
+TEST_P(validation_test, src1_null_reg)
+{
+   brw_ADD(p, g0, g0, null);
+
+   const bool valid = validate(p);
+   EXPECT_FALSE(valid);
+}
+
+/* A SIN math operation with a null first source is expected to be rejected,
+ * whether it is emitted through gen4_math() or gen6_math().
+ */
+TEST_P(validation_test, math_src0_null_reg)
+{
+   if (devinfo.gen < 6) {
+      gen4_math(p, g0, BRW_MATH_FUNCTION_SIN, 0, null, BRW_MATH_PRECISION_FULL);
+   } else {
+      gen6_math(p, g0, BRW_MATH_FUNCTION_SIN, null, null);
+   }
+
+   EXPECT_FALSE(validate(p));
+}
+
+/* A POW math operation with a null second source is expected to be rejected
+ * on Gen6+.
+ */
+TEST_P(validation_test, math_src1_null_reg)
+{
+   /* Math instructions on Gen4/5 are actually SEND messages with payloads.
+    * src1 is an immediate message descriptor set by gen4_math, so there is
+    * nothing to check there.
+    */
+   if (devinfo.gen < 6)
+      return;
+
+   gen6_math(p, g0, BRW_MATH_FUNCTION_POW, g0, null);
+   EXPECT_FALSE(validate(p));
+}
+
+/* Opcode 46 means different things on different generations:
+ *   - "push" on Gen 4 and 5
+ *   - "fork" on Gen 6
+ *   - reserved on Gen 7
+ *   - "goto" on Gen8+
+ * so Gen 7 is the only generation expected to reject it.
+ */
+TEST_P(validation_test, opcode46)
+{
+   brw_next_insn(p, 46);
+
+   const bool opcode_is_defined = devinfo.gen != 7;
+   EXPECT_EQ(opcode_is_defined, validate(p));
+}
+
+/* When the Execution Data Type is wider than the destination data type, the
+ * destination must [...] specify a HorzStride equal to the ratio in sizes of
+ * the two data types.
+ *
+ * Case 1 retypes the destination to W and both sources to D without touching
+ * the destination stride, and expects a validation failure. Case 2 does the
+ * same but also sets the destination HorzStride to 2, and expects success.
+ */
+TEST_P(validation_test, dest_stride_must_be_equal_to_the_ratio_of_exec_size_to_dest_size)
+{
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+ brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+ brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+ brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+
+ EXPECT_TRUE(validate(p));
+}
+
+/* When the Execution Data Type is wider than the destination data type, the
+ * destination must be aligned as required by the wider execution data type
+ * [...]
+ *
+ * Case 1: W destination at subregister offset 2 with HorzStride 2 and D
+ * sources, expected to fail. Case 2: ExecSize 4, W destination at
+ * subregister offset 8 with HorzStride 2 and <4;4,1> D sources, expected to
+ * pass.
+ */
+TEST_P(validation_test, dst_subreg_must_be_aligned_to_exec_type_size)
+{
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 2);
+ brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+ brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+ brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_4);
+ brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 8);
+ brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+ brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+ brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+ brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
+ brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+ brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+ brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+ brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+ brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+ EXPECT_TRUE(validate(p));
+}
+
+/* ExecSize must be greater than or equal to Width.
+ *
+ * Sets Width to 16 on src0, and then on src1, of an otherwise unmodified
+ * ADD; both cases are expected to fail validation.
+ */
+TEST_P(validation_test, exec_size_less_than_width)
+{
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_16);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_16);
+
+ EXPECT_FALSE(validate(p));
+}
+
+/* If ExecSize = Width and HorzStride ≠ 0,
+ * VertStride must be set to Width * HorzStride.
+ *
+ * Sets VertStride to 4 on src0, and then on src1, of an otherwise unmodified
+ * ADD; both cases are expected to fail validation.
+ */
+TEST_P(validation_test, vertical_stride_is_width_by_horizontal_stride)
+{
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+
+ EXPECT_FALSE(validate(p));
+}
+
+/* If Width = 1, HorzStride must be 0 regardless of the values
+ * of ExecSize and VertStride.
+ *
+ * Gives src0, and then src1, a <0;1,1> region; both cases are expected to
+ * fail validation.
+ */
+TEST_P(validation_test, horizontal_stride_must_be_0_if_width_is_1)
+{
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
+ brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1);
+ brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
+ brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_1);
+ brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+ EXPECT_FALSE(validate(p));
+}
+
+/* If ExecSize = Width = 1, both VertStride and HorzStride must be 0.
+ *
+ * With ExecSize 1, gives src0, and then src1, a <1;1,0> region; both cases
+ * are expected to fail validation.
+ */
+TEST_P(validation_test, scalar_region_must_be_0_1_0)
+{
+ struct brw_reg g0_0 = brw_vec1_grf(0, 0);
+
+ brw_ADD(p, g0, g0, g0_0);
+ brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_1);
+ brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_1);
+ brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1);
+ brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+ brw_ADD(p, g0, g0_0, g0);
+ brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_1);
+ brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_1);
+ brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_1);
+ brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+
+ EXPECT_FALSE(validate(p));
+}
+
+/* If VertStride = HorzStride = 0, Width must be 1 regardless of the value
+ * of ExecSize.
+ *
+ * Gives src0, and then src1, a <0;2,0> region; both cases are expected to
+ * fail validation.
+ */
+TEST_P(validation_test, zero_stride_implies_0_1_0)
+{
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
+ brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_2);
+ brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
+ brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_2);
+ brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+
+ EXPECT_FALSE(validate(p));
+}
+
+/* Dst.HorzStride must not be 0.
+ *
+ * Checked twice: once before and once after switching the default access
+ * mode to Align16. Both cases are expected to fail validation.
+ */
+TEST_P(validation_test, dst_horizontal_stride_0)
+{
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+
+ EXPECT_FALSE(validate(p));
+}
+
+/* VertStride must be used to cross GRF register boundaries. This rule implies
+ * that elements within a 'Width' cannot cross GRF boundaries.
+ *
+ * Four cases, all expected to fail: a subregister offset of 4 on src0 and
+ * then on src1, followed by a <4;4,2> region on src0 and then on src1.
+ */
+TEST_P(validation_test, must_not_cross_grf_boundary_in_a_width)
+{
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_src0_da1_subreg_nr(&devinfo, last_inst, 4);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_src1_da1_subreg_nr(&devinfo, last_inst, 4);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+ brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
+ brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+ brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+ brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+ EXPECT_FALSE(validate(p));
+}
+
+/* Destination Horizontal Stride must be 1 in Align16.
+ *
+ * With the default access mode set to Align16, a destination HorzStride of 2
+ * is expected to fail and a destination HorzStride of 1 is expected to pass.
+ */
+TEST_P(validation_test, dst_hstride_on_align16_must_be_1)
+{
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+ EXPECT_TRUE(validate(p));
+}
+
+/* VertStride must be 0 or 4 in Align16. The expectation table below also
+ * accepts a VertStride of 2 on Haswell and on Gen8+.
+ *
+ * Every table entry is tried on src0 first and then on src1.
+ */
+TEST_P(validation_test, vstride_on_align16_must_be_0_or_4)
+{
+   const bool vstride_2_ok = devinfo.is_haswell || devinfo.gen >= 8;
+
+   const struct {
+      enum brw_vertical_stride vstride;
+      bool expected_result;
+   } cases[] = {
+      { BRW_VERTICAL_STRIDE_0,               true },
+      { BRW_VERTICAL_STRIDE_1,               false },
+      { BRW_VERTICAL_STRIDE_2,               vstride_2_ok },
+      { BRW_VERTICAL_STRIDE_4,               true },
+      { BRW_VERTICAL_STRIDE_8,               false },
+      { BRW_VERTICAL_STRIDE_16,              false },
+      { BRW_VERTICAL_STRIDE_32,              false },
+      { BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL, false },
+   };
+   const unsigned num_cases = sizeof(cases) / sizeof(cases[0]);
+
+   brw_set_default_access_mode(p, BRW_ALIGN_16);
+
+   /* First pass: vary the vertical stride of src0. */
+   for (unsigned c = 0; c < num_cases; c++) {
+      brw_ADD(p, g0, g0, g0);
+      brw_inst_set_src0_vstride(&devinfo, last_inst, cases[c].vstride);
+
+      EXPECT_EQ(cases[c].expected_result, validate(p));
+
+      clear_instructions(p);
+   }
+
+   /* Second pass: vary the vertical stride of src1. */
+   for (unsigned c = 0; c < num_cases; c++) {
+      brw_ADD(p, g0, g0, g0);
+      brw_inst_set_src1_vstride(&devinfo, last_inst, cases[c].vstride);
+
+      EXPECT_EQ(cases[c].expected_result, validate(p));
+
+      clear_instructions(p);
+   }
+}
+
+/* In Direct Addressing mode, a source cannot span more than 2 adjacent GRF
+ * registers.
+ *
+ * Case 1: ExecSize 32 with W types and a <16;8,2> src1 region, expected to
+ * fail. Case 2: ExecSize 16 with the same src1 region at subregister offset
+ * 2, expected to pass. Case 3: an otherwise unmodified ExecSize 16 ADD,
+ * expected to pass.
+ */
+TEST_P(validation_test, source_cannot_span_more_than_2_registers)
+{
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_32);
+ brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+ brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_8);
+ brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+ brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+ brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_8);
+ brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+ brw_inst_set_src1_da1_subreg_nr(&devinfo, last_inst, 2);
+
+ EXPECT_TRUE(validate(p));
+
+ clear_instructions(p);
+
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+
+ EXPECT_TRUE(validate(p));
+}
+
+/* A destination cannot span more than 2 adjacent GRF registers.
+ *
+ * Case 1: ExecSize 32 with a W destination at HorzStride 2, expected to
+ * fail. Case 2: ExecSize 8 with a W destination at subregister offset 6 and
+ * HorzStride 4, and <16;4,1> W sources, expected to pass.
+ */
+TEST_P(validation_test, destination_cannot_span_more_than_2_registers)
+{
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_32);
+ brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+ brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_8);
+ brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 6);
+ brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4);
+ brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+ brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
+ brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+ brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+ brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+ brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+ EXPECT_TRUE(validate(p));
+}
+
+/* Exercises instructions with a source region set to span two registers and
+ * a destination confined to one. The first three layouts are expected to
+ * pass on every generation; the uneven split is expected to pass only on
+ * Gen9+.
+ */
+TEST_P(validation_test, src_region_spans_two_regs_dst_region_spans_one)
+{
+   /* Writes to dest are to the lower OWord */
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_TRUE(validate(p));
+
+   clear_instructions(p);
+
+   /* Writes to dest are to the upper OWord */
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 16);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_TRUE(validate(p));
+
+   clear_instructions(p);
+
+   /* Writes to dest are evenly split between OWords */
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_8);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_TRUE(validate(p));
+
+   clear_instructions(p);
+
+   /* Writes to dest are uneven between OWords */
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_4);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 10);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_2);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   const bool uneven_split_allowed = devinfo.gen >= 9;
+   EXPECT_EQ(uneven_split_allowed, validate(p));
+}
+
+/* An ADD with destination subregister offset 4 is expected to pass only on
+ * Gen9+; an otherwise unmodified ExecSize 16 ADD is expected to pass
+ * everywhere. On Gen6+ the same subregister offset applied to a gen6_math()
+ * SIN is expected to fail, while the unmodified SIN passes.
+ */
+TEST_P(validation_test, dst_elements_must_be_evenly_split_between_registers)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 4);
+
+   const bool uneven_split_allowed = devinfo.gen >= 9;
+   EXPECT_EQ(uneven_split_allowed, validate(p));
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+
+   EXPECT_TRUE(validate(p));
+
+   clear_instructions(p);
+
+   /* The remaining cases use gen6_math(), which is only exercised on Gen6+. */
+   if (devinfo.gen < 6)
+      return;
+
+   gen6_math(p, g0, BRW_MATH_FUNCTION_SIN, g0, null);
+
+   EXPECT_TRUE(validate(p));
+
+   clear_instructions(p);
+
+   gen6_math(p, g0, BRW_MATH_FUNCTION_SIN, g0, null);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 4);
+
+   EXPECT_FALSE(validate(p));
+}
+
+/* Case 1 places src0 at subregister offset 16 while src1 stays at offset 0;
+ * it is expected to fail on Gen7 and earlier and to pass on Gen8+. Case 2
+ * leaves both source offsets untouched and is expected to pass everywhere.
+ */
+TEST_P(validation_test, two_src_two_dst_source_offsets_must_be_same)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_4);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4);
+   brw_inst_set_src0_da1_subreg_nr(&devinfo, last_inst, 16);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_2);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   const bool differing_offsets_allowed = devinfo.gen > 7;
+   EXPECT_EQ(differing_offsets_allowed, validate(p));
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_4);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_8);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_2);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   EXPECT_TRUE(validate(p));
+}
+
+#if 0 /* Disabled test. NOTE(review): the region in the comment below has a source HorzStride of 1, but the code sets BRW_HORIZONTAL_STRIDE_4 on src0; confirm which is intended before re-enabling. */
+TEST_P(validation_test, two_src_two_dst_each_dst_must_be_derived_from_one_src)
+{
+ // mov (16) r10.0<2>:w r12.4<4;4,1>:w
+
+ brw_MOV(p, g0, g0);
+ brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+ brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+ brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+ brw_inst_set_src0_da1_subreg_nr(&devinfo, last_inst, 8);
+ brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+ brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
+ brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+#if 0
+ brw_ADD(p, g0, g0, g0);
+ brw_inst_set_src1_da1_subreg_nr(&devinfo, last_inst, 16);
+ brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+ brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+ brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+ EXPECT_FALSE(validate(p));
+ #endif
+}
+#endif
+
+/* ExecSize 16 instructions with a mix of scalar and full-width sources. The
+ * first two layouts are expected to pass everywhere. The last two, which
+ * give one source of a strided W destination a <0;1,0> region, are expected
+ * to pass only on Gen8+.
+ */
+TEST_P(validation_test, one_src_two_dst)
+{
+   struct brw_reg g0_0 = brw_vec1_grf(0, 0);
+   const bool scalar_src_allowed = devinfo.gen >= 8;
+
+   brw_ADD(p, g0, g0_0, g0_0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+
+   EXPECT_TRUE(validate(p));
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+
+   EXPECT_TRUE(validate(p));
+
+   clear_instructions(p);
+
+   /* src1 is the <0;1,0> source. */
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+
+   EXPECT_EQ(scalar_src_allowed, validate(p));
+
+   clear_instructions(p);
+
+   /* src0 is the <0;1,0> source. */
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+
+   EXPECT_EQ(scalar_src_allowed, validate(p));
+}
+
+TEST_P(validation_test, packed_byte_destination) /* Per the table, a MOV to a byte destination is expected to validate only from a byte source with no negate, abs or saturate; the two predicated byte SELs are expected to fail. */
+{
+ static const struct {
+ enum brw_reg_type dst_type;
+ enum brw_reg_type src_type;
+ bool neg, abs, sat; /* source negate, source abs and saturate, applied to the MOV below */
+ bool expected_result; /* value validate() is expected to return */
+ } move[] = {
+ { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UB, 0, 0, 0, true },
+ { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_B , 0, 0, 0, true },
+ { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_B , 0, 0, 0, true },
+ { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_UB, 0, 0, 0, true },
+
+ { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UB, 1, 0, 0, false },
+ { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_B , 1, 0, 0, false },
+ { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_B , 1, 0, 0, false },
+ { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_UB, 1, 0, 0, false },
+
+ { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UB, 0, 1, 0, false },
+ { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_B , 0, 1, 0, false },
+ { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_B , 0, 1, 0, false },
+ { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_UB, 0, 1, 0, false },
+
+ { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UB, 0, 0, 1, false },
+ { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_B , 0, 0, 1, false },
+ { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_B , 0, 0, 1, false },
+ { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_UB, 0, 0, 1, false },
+
+ { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UW, 0, 0, 0, false },
+ { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_W , 0, 0, 0, false },
+ { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UD, 0, 0, 0, false },
+ { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_D , 0, 0, 0, false },
+ };
+
+ for (unsigned i = 0; i < sizeof(move) / sizeof(move[0]); i++) {
+ brw_MOV(p, retype(g0, move[i].dst_type), retype(g0, move[i].src_type));
+ brw_inst_set_src0_negate(&devinfo, last_inst, move[i].neg);
+ brw_inst_set_src0_abs(&devinfo, last_inst, move[i].abs);
+ brw_inst_set_saturate(&devinfo, last_inst, move[i].sat);
+
+ EXPECT_EQ(move[i].expected_result, validate(p));
+
+ clear_instructions(p);
+ }
+
+ brw_SEL(p, retype(g0, BRW_REGISTER_TYPE_UB),
+ retype(g0, BRW_REGISTER_TYPE_UB),
+ retype(g0, BRW_REGISTER_TYPE_UB));
+ brw_inst_set_pred_control(&devinfo, last_inst, BRW_PREDICATE_NORMAL);
+
+ EXPECT_FALSE(validate(p));
+
+ clear_instructions(p);
+
+ brw_SEL(p, retype(g0, BRW_REGISTER_TYPE_B),
+ retype(g0, BRW_REGISTER_TYPE_B),
+ retype(g0, BRW_REGISTER_TYPE_B));
+ brw_inst_set_pred_control(&devinfo, last_inst, BRW_PREDICATE_NORMAL);
+
+ EXPECT_FALSE(validate(p));
+}
+
+/* A predicated SEL from W sources to a B destination with HorzStride 2 is
+ * expected to pass everywhere. With the destination moved to subregister
+ * offset 1 it is expected to pass on G45 and on Gen5+, and to fail on
+ * original Gen4.
+ */
+TEST_P(validation_test, byte_destination_relaxed_alignment)
+{
+   brw_SEL(p, retype(g0, BRW_REGISTER_TYPE_B),
+              retype(g0, BRW_REGISTER_TYPE_W),
+              retype(g0, BRW_REGISTER_TYPE_W));
+   brw_inst_set_pred_control(&devinfo, last_inst, BRW_PREDICATE_NORMAL);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_TRUE(validate(p));
+
+   clear_instructions(p);
+
+   brw_SEL(p, retype(g0, BRW_REGISTER_TYPE_B),
+              retype(g0, BRW_REGISTER_TYPE_W),
+              retype(g0, BRW_REGISTER_TYPE_W));
+   brw_inst_set_pred_control(&devinfo, last_inst, BRW_PREDICATE_NORMAL);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 1);
+
+   const bool odd_offset_allowed = devinfo.gen > 4 || devinfo.is_g4x;
+   EXPECT_EQ(odd_offset_allowed, validate(p));
+}
diff --git a/src/intel/compiler/test_fs_cmod_propagation.cpp b/src/intel/compiler/test_fs_cmod_propagation.cpp
new file mode 100644
index 00000000000..a97e374f74e
--- /dev/null
+++ b/src/intel/compiler/test_fs_cmod_propagation.cpp
@@ -0,0 +1,556 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "program/program.h"
+
+using namespace brw;
+
+class cmod_propagation_test : public ::testing::Test {
+ virtual void SetUp();
+
+public:
+ struct brw_compiler *compiler;
+ struct gen_device_info *devinfo;
+ struct gl_context *ctx;
+ struct brw_wm_prog_data *prog_data;
+ struct gl_shader_program *shader_prog;
+ fs_visitor *v;
+};
+
+class cmod_propagation_fs_visitor : public fs_visitor
+{
+public:
+ cmod_propagation_fs_visitor(struct brw_compiler *compiler,
+ struct brw_wm_prog_data *prog_data,
+ nir_shader *shader)
+ : fs_visitor(compiler, NULL, NULL, NULL,
+ &prog_data->base, (struct gl_program *) NULL,
+ shader, 8, -1) {}
+};
+
+
+void cmod_propagation_test::SetUp()
+{
+ ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+ compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+ devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo));
+ compiler->devinfo = devinfo;
+
+ prog_data = ralloc(NULL, struct brw_wm_prog_data);
+ nir_shader *shader =
+ nir_shader_create(NULL, MESA_SHADER_FRAGMENT, NULL, NULL);
+
+ v = new cmod_propagation_fs_visitor(compiler, prog_data, shader);
+
+ devinfo->gen = 4;
+}
+
+static fs_inst *
+instruction(bblock_t *block, int num)
+{
+ fs_inst *inst = (fs_inst *)block->start();
+ for (int i = 0; i < num; i++) {
+ inst = (fs_inst *)inst->next;
+ }
+ return inst;
+}
+
+static bool
+cmod_propagation(fs_visitor *v)
+{
+ const bool print = getenv("TEST_DEBUG");
+
+ if (print) {
+ fprintf(stderr, "= Before =\n");
+ v->cfg->dump(v);
+ }
+
+ bool ret = v->opt_cmod_propagation();
+
+ if (print) {
+ fprintf(stderr, "\n= After =\n");
+ v->cfg->dump(v);
+ }
+
+ return ret;
+}
+
+TEST_F(cmod_propagation_test, basic)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dest = v->vgrf(glsl_type::float_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg src1 = v->vgrf(glsl_type::float_type);
+ fs_reg zero(brw_imm_f(0.0f));
+ bld.ADD(dest, src0, src1);
+ bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
+
+ /* = Before =
+ *
+ * 0: add(8) dest src0 src1
+ * 1: cmp.ge.f0(8) null dest 0.0f
+ *
+ * = After =
+ * 0: add.ge.f0(8) dest src0 src1
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_TRUE(cmod_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(0, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, cmp_nonzero)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dest = v->vgrf(glsl_type::float_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg src1 = v->vgrf(glsl_type::float_type);
+ fs_reg nonzero(brw_imm_f(1.0f));
+ bld.ADD(dest, src0, src1);
+ bld.CMP(bld.null_reg_f(), dest, nonzero, BRW_CONDITIONAL_GE);
+
+ /* = Before =
+ *
+ * 0: add(8) dest src0 src1
+ * 1: cmp.ge.f0(8) null dest 1.0f
+ *
+ * = After =
+ * (no changes)
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_FALSE(cmod_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+ EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, non_cmod_instruction)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dest = v->vgrf(glsl_type::uint_type);
+ fs_reg src0 = v->vgrf(glsl_type::uint_type);
+ fs_reg zero(brw_imm_ud(0u));
+ bld.FBL(dest, src0);
+ bld.CMP(bld.null_reg_ud(), dest, zero, BRW_CONDITIONAL_GE);
+
+ /* = Before =
+ *
+ * 0: fbl(8) dest src0
+ * 1: cmp.ge.f0(8) null dest 0u
+ *
+ * = After =
+ * (no changes)
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_FALSE(cmod_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_FBL, instruction(block0, 0)->opcode);
+ EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_flag_write)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dest = v->vgrf(glsl_type::float_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg src1 = v->vgrf(glsl_type::float_type);
+ fs_reg src2 = v->vgrf(glsl_type::float_type);
+ fs_reg zero(brw_imm_f(0.0f));
+ bld.ADD(dest, src0, src1);
+ bld.CMP(bld.null_reg_f(), src2, zero, BRW_CONDITIONAL_GE);
+ bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
+
+ /* = Before =
+ *
+ * 0: add(8) dest src0 src1
+ * 1: cmp.ge.f0(8) null src2 0.0f
+ * 2: cmp.ge.f0(8) null dest 0.0f
+ *
+ * = After =
+ * (no changes)
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(2, block0->end_ip);
+
+ EXPECT_FALSE(cmod_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(2, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+ EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+ EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_flag_read)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dest0 = v->vgrf(glsl_type::float_type);
+ fs_reg dest1 = v->vgrf(glsl_type::float_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg src1 = v->vgrf(glsl_type::float_type);
+ fs_reg src2 = v->vgrf(glsl_type::float_type);
+ fs_reg zero(brw_imm_f(0.0f));
+ bld.ADD(dest0, src0, src1);
+ set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero));
+ bld.CMP(bld.null_reg_f(), dest0, zero, BRW_CONDITIONAL_GE);
+
+ /* = Before =
+ *
+ * 0: add(8) dest0 src0 src1
+ * 1: (+f0) sel(8) dest1 src2 0.0f
+ * 2: cmp.ge.f0(8) null dest0 0.0f
+ *
+ * = After =
+ * (no changes)
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(2, block0->end_ip);
+
+ EXPECT_FALSE(cmod_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(2, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+ EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode);
+ EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate);
+ EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_dest_write)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dest = v->vgrf(glsl_type::vec4_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg src1 = v->vgrf(glsl_type::float_type);
+ fs_reg src2 = v->vgrf(glsl_type::vec2_type);
+ fs_reg zero(brw_imm_f(0.0f));
+ bld.ADD(offset(dest, bld, 2), src0, src1);
+ bld.emit(SHADER_OPCODE_TEX, dest, src2)
+ ->size_written = 4 * REG_SIZE;
+ bld.CMP(bld.null_reg_f(), offset(dest, bld, 2), zero, BRW_CONDITIONAL_GE);
+
+ /* = Before =
+ *
+ * 0: add(8) dest+2 src0 src1
+ * 1: tex(8) rlen 4 dest+0 src2
+ * 2: cmp.ge.f0(8) null dest+2 0.0f
+ *
+ * = After =
+ * (no changes; the tex at 1 writes dest between the add and the cmp)
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(2, block0->end_ip);
+
+ EXPECT_FALSE(cmod_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(2, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+ EXPECT_EQ(SHADER_OPCODE_TEX, instruction(block0, 1)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 1)->conditional_mod); /* check the tex, not the add again */
+ EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_flag_read_same_value)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dest0 = v->vgrf(glsl_type::float_type);
+ fs_reg dest1 = v->vgrf(glsl_type::float_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg src1 = v->vgrf(glsl_type::float_type);
+ fs_reg src2 = v->vgrf(glsl_type::float_type);
+ fs_reg zero(brw_imm_f(0.0f));
+ set_condmod(BRW_CONDITIONAL_GE, bld.ADD(dest0, src0, src1));
+ set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero));
+ bld.CMP(bld.null_reg_f(), dest0, zero, BRW_CONDITIONAL_GE);
+
+ /* = Before =
+ *
+ * 0: add.ge.f0(8) dest0 src0 src1
+ * 1: (+f0) sel(8) dest1 src2 0.0f
+ * 2: cmp.ge.f0(8) null dest0 0.0f
+ *
+ * = After =
+ * 0: add.ge.f0(8) dest0 src0 src1
+ * 1: (+f0) sel(8) dest1 src2 0.0f
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(2, block0->end_ip);
+
+ EXPECT_TRUE(cmod_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+ EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode);
+ EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate);
+}
+
+TEST_F(cmod_propagation_test, negate)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dest = v->vgrf(glsl_type::float_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg src1 = v->vgrf(glsl_type::float_type);
+ fs_reg zero(brw_imm_f(0.0f));
+ bld.ADD(dest, src0, src1);
+ dest.negate = true;
+ bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
+
+ /* = Before =
+ *
+ * 0: add(8) dest src0 src1
+ * 1: cmp.ge.f0(8) null -dest 0.0f
+ *
+ * = After =
+ * 0: add.le.f0(8) dest src0 src1
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_TRUE(cmod_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(0, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_LE, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, movnz)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dest = v->vgrf(glsl_type::float_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg src1 = v->vgrf(glsl_type::float_type);
+ bld.CMP(dest, src0, src1, BRW_CONDITIONAL_GE);
+ set_condmod(BRW_CONDITIONAL_NZ,
+ bld.MOV(bld.null_reg_f(), dest));
+
+ /* = Before =
+ *
+ * 0: cmp.ge.f0(8) dest src0 src1
+ * 1: mov.nz.f0(8) null dest
+ *
+ * = After =
+ * 0: cmp.ge.f0(8) dest src0 src1
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_TRUE(cmod_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(0, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, different_types_cmod_with_zero)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dest = v->vgrf(glsl_type::int_type);
+ fs_reg src0 = v->vgrf(glsl_type::int_type);
+ fs_reg src1 = v->vgrf(glsl_type::int_type);
+ fs_reg zero(brw_imm_f(0.0f));
+ bld.ADD(dest, src0, src1);
+ bld.CMP(bld.null_reg_f(), retype(dest, BRW_REGISTER_TYPE_F), zero,
+ BRW_CONDITIONAL_GE);
+
+ /* = Before =
+ *
+ * 0: add(8) dest:D src0:D src1:D
+ * 1: cmp.ge.f0(8) null:F dest:F 0.0f
+ *
+ * = After =
+ * (no changes)
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_FALSE(cmod_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+ EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, andnz_one)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dest = v->vgrf(glsl_type::int_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg zero(brw_imm_f(0.0f));
+ fs_reg one(brw_imm_d(1));
+
+ bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+ set_condmod(BRW_CONDITIONAL_NZ,
+ bld.AND(bld.null_reg_d(), dest, one));
+
+ /* = Before =
+ * 0: cmp.l.f0(8) dest:F src0:F 0F
+ * 1: and.nz.f0(8) null:D dest:D 1D
+ *
+ * = After =
+ * 0: cmp.l.f0(8) dest:F src0:F 0F
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_TRUE(cmod_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(0, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+ EXPECT_TRUE(retype(dest, BRW_REGISTER_TYPE_F)
+ .equals(instruction(block0, 0)->dst));
+}
+
+TEST_F(cmod_propagation_test, andnz_non_one)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dest = v->vgrf(glsl_type::int_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg zero(brw_imm_f(0.0f));
+ fs_reg nonone(brw_imm_d(38));
+
+ bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+ set_condmod(BRW_CONDITIONAL_NZ,
+ bld.AND(bld.null_reg_d(), dest, nonone));
+
+ /* = Before =
+ * 0: cmp.l.f0(8) dest:F src0:F 0F
+ * 1: and.nz.f0(8) null:D dest:D 38D
+ *
+ * = After =
+ * (no changes)
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_FALSE(cmod_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+ EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, andz_one)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dest = v->vgrf(glsl_type::int_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg zero(brw_imm_f(0.0f));
+ fs_reg one(brw_imm_d(1));
+
+ bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+ set_condmod(BRW_CONDITIONAL_Z,
+ bld.AND(bld.null_reg_d(), dest, one));
+
+ /* = Before =
+ * 0: cmp.l.f0(8) dest:F src0:F 0F
+ * 1: and.z.f0(8) null:D dest:D 1D
+ *
+ * = After =
+ * (no changes)
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_FALSE(cmod_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+ EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_Z, instruction(block0, 1)->conditional_mod); /* same spelling as the set_condmod() above */
+}
diff --git a/src/intel/compiler/test_fs_copy_propagation.cpp b/src/intel/compiler/test_fs_copy_propagation.cpp
new file mode 100644
index 00000000000..37736ec86f4
--- /dev/null
+++ b/src/intel/compiler/test_fs_copy_propagation.cpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "program/program.h"
+
+using namespace brw;
+
+class copy_propagation_test : public ::testing::Test {
+ virtual void SetUp();
+
+public:
+ struct brw_compiler *compiler;
+ struct gen_device_info *devinfo;
+ struct gl_context *ctx;
+ struct brw_wm_prog_data *prog_data;
+ struct gl_shader_program *shader_prog;
+ fs_visitor *v;
+};
+
+class copy_propagation_fs_visitor : public fs_visitor
+{
+public:
+ copy_propagation_fs_visitor(struct brw_compiler *compiler,
+ struct brw_wm_prog_data *prog_data,
+ nir_shader *shader)
+ : fs_visitor(compiler, NULL, NULL, NULL,
+ &prog_data->base, (struct gl_program *) NULL,
+ shader, 8, -1) {}
+};
+
+
+void copy_propagation_test::SetUp()
+{
+ ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+ compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+ devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo));
+ compiler->devinfo = devinfo;
+
+ prog_data = ralloc(NULL, struct brw_wm_prog_data);
+ nir_shader *shader =
+ nir_shader_create(NULL, MESA_SHADER_FRAGMENT, NULL, NULL);
+
+ v = new copy_propagation_fs_visitor(compiler, prog_data, shader);
+
+ devinfo->gen = 4;
+}
+
+static fs_inst *
+instruction(bblock_t *block, int num)
+{
+ fs_inst *inst = (fs_inst *)block->start();
+ for (int i = 0; i < num; i++) {
+ inst = (fs_inst *)inst->next;
+ }
+ return inst;
+}
+
+static bool
+copy_propagation(fs_visitor *v)
+{
+ const bool print = getenv("TEST_DEBUG");
+
+ if (print) {
+ fprintf(stderr, "= Before =\n");
+ v->cfg->dump(v);
+ }
+
+ bool ret = v->opt_copy_propagation();
+
+ if (print) {
+ fprintf(stderr, "\n= After =\n");
+ v->cfg->dump(v);
+ }
+
+ return ret;
+}
+
+TEST_F(copy_propagation_test, basic)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg vgrf0 = v->vgrf(glsl_type::float_type);
+ fs_reg vgrf1 = v->vgrf(glsl_type::float_type);
+ fs_reg vgrf2 = v->vgrf(glsl_type::float_type);
+ fs_reg vgrf3 = v->vgrf(glsl_type::float_type);
+ bld.MOV(vgrf0, vgrf2);
+ bld.ADD(vgrf1, vgrf0, vgrf3);
+
+ /* = Before =
+ *
+ * 0: mov(8) vgrf0 vgrf2
+ * 1: add(8) vgrf1 vgrf0 vgrf3
+ *
+ * = After =
+ * 0: mov(8) vgrf0 vgrf2
+ * 1: add(8) vgrf1 vgrf2 vgrf3
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_TRUE(copy_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ fs_inst *mov = instruction(block0, 0);
+ EXPECT_EQ(BRW_OPCODE_MOV, mov->opcode);
+ EXPECT_TRUE(mov->dst.equals(vgrf0));
+ EXPECT_TRUE(mov->src[0].equals(vgrf2));
+
+ fs_inst *add = instruction(block0, 1);
+ EXPECT_EQ(BRW_OPCODE_ADD, add->opcode);
+ EXPECT_TRUE(add->dst.equals(vgrf1));
+ EXPECT_TRUE(add->src[0].equals(vgrf2));
+ EXPECT_TRUE(add->src[1].equals(vgrf3));
+}
+
+TEST_F(copy_propagation_test, maxmax_sat_imm)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg vgrf0 = v->vgrf(glsl_type::float_type);
+ fs_reg vgrf1 = v->vgrf(glsl_type::float_type);
+ fs_reg vgrf2 = v->vgrf(glsl_type::float_type);
+
+ static const struct {
+ enum brw_conditional_mod conditional_mod;
+ float immediate;
+ bool expected_result;
+ } test[] = {
+ /* conditional mod, imm, expected_result */
+ { BRW_CONDITIONAL_GE , 0.1f, true },
+ { BRW_CONDITIONAL_L , 0.1f, true },
+ { BRW_CONDITIONAL_GE , 0.5f, true },
+ { BRW_CONDITIONAL_L , 0.5f, true },
+ { BRW_CONDITIONAL_GE , 0.9f, true },
+ { BRW_CONDITIONAL_L , 0.9f, true },
+ { BRW_CONDITIONAL_GE , -1.5f, false },
+ { BRW_CONDITIONAL_L , -1.5f, false },
+ { BRW_CONDITIONAL_GE , 1.5f, false },
+ { BRW_CONDITIONAL_L , 1.5f, false },
+
+ { BRW_CONDITIONAL_NONE, 0.5f, false },
+ { BRW_CONDITIONAL_Z , 0.5f, false },
+ { BRW_CONDITIONAL_NZ , 0.5f, false },
+ { BRW_CONDITIONAL_G , 0.5f, false },
+ { BRW_CONDITIONAL_LE , 0.5f, false },
+ { BRW_CONDITIONAL_R , 0.5f, false },
+ { BRW_CONDITIONAL_O , 0.5f, false },
+ { BRW_CONDITIONAL_U , 0.5f, false },
+ };
+
+ for (unsigned i = 0; i < sizeof(test) / sizeof(test[0]); i++) {
+ fs_inst *mov = set_saturate(true, bld.MOV(vgrf0, vgrf1));
+ fs_inst *sel = set_condmod(test[i].conditional_mod,
+ bld.SEL(vgrf2, vgrf0,
+ brw_imm_f(test[i].immediate)));
+
+ v->calculate_cfg();
+
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_EQ(test[i].expected_result, copy_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_EQ(BRW_OPCODE_MOV, mov->opcode);
+ EXPECT_TRUE(mov->saturate);
+ EXPECT_TRUE(mov->dst.equals(vgrf0));
+ EXPECT_TRUE(mov->src[0].equals(vgrf1));
+
+ EXPECT_EQ(BRW_OPCODE_SEL, sel->opcode);
+ EXPECT_EQ(test[i].conditional_mod, sel->conditional_mod);
+ EXPECT_EQ(test[i].expected_result, sel->saturate);
+ EXPECT_TRUE(sel->dst.equals(vgrf2));
+ if (test[i].expected_result) {
+ EXPECT_TRUE(sel->src[0].equals(vgrf1));
+ } else {
+ EXPECT_TRUE(sel->src[0].equals(vgrf0));
+ }
+ EXPECT_TRUE(sel->src[1].equals(brw_imm_f(test[i].immediate)));
+
+ delete v->cfg;
+ v->cfg = NULL;
+ }
+}
diff --git a/src/intel/compiler/test_fs_saturate_propagation.cpp b/src/intel/compiler/test_fs_saturate_propagation.cpp
new file mode 100644
index 00000000000..db472143994
--- /dev/null
+++ b/src/intel/compiler/test_fs_saturate_propagation.cpp
@@ -0,0 +1,600 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "program/program.h"
+
+using namespace brw;
+
+class saturate_propagation_test : public ::testing::Test {
+ virtual void SetUp();
+
+public:
+ struct brw_compiler *compiler;
+ struct gen_device_info *devinfo;
+ struct gl_context *ctx;
+ struct brw_wm_prog_data *prog_data;
+ struct gl_shader_program *shader_prog;
+ fs_visitor *v;
+};
+
+class saturate_propagation_fs_visitor : public fs_visitor
+{
+public:
+ saturate_propagation_fs_visitor(struct brw_compiler *compiler,
+ struct brw_wm_prog_data *prog_data,
+ nir_shader *shader)
+ : fs_visitor(compiler, NULL, NULL, NULL,
+ &prog_data->base, (struct gl_program *) NULL,
+ shader, 8, -1) {}
+};
+
+
+void saturate_propagation_test::SetUp()
+{
+ ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+ compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+ devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo));
+ compiler->devinfo = devinfo;
+
+ prog_data = ralloc(NULL, struct brw_wm_prog_data);
+ nir_shader *shader =
+ nir_shader_create(NULL, MESA_SHADER_FRAGMENT, NULL, NULL);
+
+ v = new saturate_propagation_fs_visitor(compiler, prog_data, shader);
+
+ devinfo->gen = 4;
+}
+
+static fs_inst *
+instruction(bblock_t *block, int num)
+{
+ fs_inst *inst = (fs_inst *)block->start();
+ for (int i = 0; i < num; i++) {
+ inst = (fs_inst *)inst->next;
+ }
+ return inst;
+}
+
+static bool
+saturate_propagation(fs_visitor *v)
+{
+ const bool print = getenv("TEST_DEBUG"); /* set TEST_DEBUG to dump the CFG around the pass */
+
+ if (print) {
+ fprintf(stderr, "= Before =\n");
+ v->cfg->dump(v);
+ }
+
+ bool ret = v->opt_saturate_propagation();
+
+ if (print) {
+ fprintf(stderr, "\n= After =\n");
+ v->cfg->dump(v);
+ }
+
+ return ret;
+}
+
+TEST_F(saturate_propagation_test, basic)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dst0 = v->vgrf(glsl_type::float_type);
+ fs_reg dst1 = v->vgrf(glsl_type::float_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg src1 = v->vgrf(glsl_type::float_type);
+ bld.ADD(dst0, src0, src1);
+ set_saturate(true, bld.MOV(dst1, dst0));
+
+ /* = Before =
+ *
+ * 0: add(8) dst0 src0 src1
+ * 1: mov.sat(8) dst1 dst0
+ *
+ * = After =
+ * 0: add.sat(8) dst0 src0 src1
+ * 1: mov(8) dst1 dst0
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_TRUE(saturate_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+ EXPECT_TRUE(instruction(block0, 0)->saturate);
+ EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+ EXPECT_FALSE(instruction(block0, 1)->saturate);
+}
+
+TEST_F(saturate_propagation_test, other_non_saturated_use)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dst0 = v->vgrf(glsl_type::float_type);
+ fs_reg dst1 = v->vgrf(glsl_type::float_type);
+ fs_reg dst2 = v->vgrf(glsl_type::float_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg src1 = v->vgrf(glsl_type::float_type);
+ bld.ADD(dst0, src0, src1);
+ set_saturate(true, bld.MOV(dst1, dst0));
+ bld.ADD(dst2, dst0, src0);
+
+ /* = Before =
+ *
+ * 0: add(8) dst0 src0 src1
+ * 1: mov.sat(8) dst1 dst0
+ * 2: add(8) dst2 dst0 src0
+ *
+ * = After =
+ * (no changes)
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(2, block0->end_ip);
+
+ EXPECT_FALSE(saturate_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(2, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+ EXPECT_FALSE(instruction(block0, 0)->saturate);
+ EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+ EXPECT_TRUE(instruction(block0, 1)->saturate);
+ EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 2)->opcode);
+}
+
+TEST_F(saturate_propagation_test, predicated_instruction)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dst0 = v->vgrf(glsl_type::float_type);
+ fs_reg dst1 = v->vgrf(glsl_type::float_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg src1 = v->vgrf(glsl_type::float_type);
+ bld.ADD(dst0, src0, src1)
+ ->predicate = BRW_PREDICATE_NORMAL;
+ set_saturate(true, bld.MOV(dst1, dst0));
+
+ /* = Before =
+ *
+ * 0: (+f0) add(8) dst0 src0 src1
+ * 1: mov.sat(8) dst1 dst0
+ *
+ * = After =
+ * (no changes)
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_FALSE(saturate_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+ EXPECT_FALSE(instruction(block0, 0)->saturate);
+ EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+ EXPECT_TRUE(instruction(block0, 1)->saturate);
+}
+
+TEST_F(saturate_propagation_test, neg_mov_sat)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dst0 = v->vgrf(glsl_type::float_type);
+ fs_reg dst1 = v->vgrf(glsl_type::float_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ bld.RNDU(dst0, src0);
+ dst0.negate = true;
+ set_saturate(true, bld.MOV(dst1, dst0));
+
+ /* = Before =
+ *
+ * 0: rndu(8) dst0 src0
+ * 1: mov.sat(8) dst1 -dst0
+ *
+ * = After =
+ * (no changes)
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_FALSE(saturate_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_RNDU, instruction(block0, 0)->opcode);
+ EXPECT_FALSE(instruction(block0, 0)->saturate);
+ EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+ EXPECT_TRUE(instruction(block0, 1)->saturate);
+}
+
+TEST_F(saturate_propagation_test, add_neg_mov_sat)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dst0 = v->vgrf(glsl_type::float_type);
+ fs_reg dst1 = v->vgrf(glsl_type::float_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg src1 = v->vgrf(glsl_type::float_type);
+ bld.ADD(dst0, src0, src1);
+ dst0.negate = true;
+ set_saturate(true, bld.MOV(dst1, dst0));
+
+ /* = Before =
+ *
+ * 0: add(8) dst0 src0 src1
+ * 1: mov.sat(8) dst1 -dst0
+ *
+ * = After =
+ * 0: add.sat(8) dst0 -src0 -src1
+ * 1: mov(8) dst1 dst0
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_TRUE(saturate_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+ EXPECT_TRUE(instruction(block0, 0)->saturate);
+ EXPECT_TRUE(instruction(block0, 0)->src[0].negate);
+ EXPECT_TRUE(instruction(block0, 0)->src[1].negate);
+ EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+ EXPECT_FALSE(instruction(block0, 1)->saturate);
+}
+
+TEST_F(saturate_propagation_test, mul_neg_mov_sat)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dst0 = v->vgrf(glsl_type::float_type);
+ fs_reg dst1 = v->vgrf(glsl_type::float_type);
+ fs_reg src0 = v->vgrf(glsl_type::float_type);
+ fs_reg src1 = v->vgrf(glsl_type::float_type);
+ bld.MUL(dst0, src0, src1);
+ dst0.negate = true;
+ set_saturate(true, bld.MOV(dst1, dst0));
+
+ /* = Before =
+ *
+ * 0: mul(8) dst0 src0 src1
+ * 1: mov.sat(8) dst1 -dst0
+ *
+ * = After =
+ * 0: mul.sat(8) dst0 -src0 src1
+ * 1: mov(8) dst1 dst0
+ */
+
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_TRUE(saturate_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+ EXPECT_TRUE(instruction(block0, 0)->saturate);
+ EXPECT_TRUE(instruction(block0, 0)->src[0].negate);
+ EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+ EXPECT_FALSE(instruction(block0, 1)->saturate);
+ EXPECT_FALSE(instruction(block0, 1)->src[0].negate);
+}
+
+TEST_F(saturate_propagation_test, mul_mov_sat_neg_mov_sat)
+{
+   const fs_builder &builder = v->bld;
+   fs_reg dst0(v->vgrf(glsl_type::float_type));
+   fs_reg dst1(v->vgrf(glsl_type::float_type));
+   fs_reg dst2(v->vgrf(glsl_type::float_type));
+   fs_reg src0(v->vgrf(glsl_type::float_type));
+   fs_reg src1(v->vgrf(glsl_type::float_type));
+   builder.MUL(dst0, src0, src1);
+   set_saturate(true, builder.MOV(dst1, dst0));
+   dst0.negate = true;
+   set_saturate(true, builder.MOV(dst2, dst0));
+
+   /* Program handed to the pass:
+    *
+    *   0: mul(8)      dst0  src0  src1
+    *   1: mov.sat(8)  dst1  dst0
+    *   2: mov.sat(8)  dst2  -dst0
+    *
+    * Expected result: untouched.  The two saturating MOVs read dst0 with
+    * opposite negation.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_FALSE(instruction(block0, 0)->saturate);
+   EXPECT_FALSE(instruction(block0, 0)->src[1].negate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_TRUE(instruction(block0, 1)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 2)->opcode);
+   EXPECT_TRUE(instruction(block0, 2)->src[0].negate);
+   EXPECT_TRUE(instruction(block0, 2)->saturate);
+}
+
+TEST_F(saturate_propagation_test, mul_neg_mov_sat_neg_mov_sat)
+{
+   const fs_builder &builder = v->bld;
+   fs_reg dst0(v->vgrf(glsl_type::float_type));
+   fs_reg dst1(v->vgrf(glsl_type::float_type));
+   fs_reg dst2(v->vgrf(glsl_type::float_type));
+   fs_reg src0(v->vgrf(glsl_type::float_type));
+   fs_reg src1(v->vgrf(glsl_type::float_type));
+   builder.MUL(dst0, src0, src1);
+   dst0.negate = true;
+   set_saturate(true, builder.MOV(dst1, dst0));
+   set_saturate(true, builder.MOV(dst2, dst0));
+
+   /* Program handed to the pass:
+    *
+    *   0: mul(8)      dst0  src0  src1
+    *   1: mov.sat(8)  dst1  -dst0
+    *   2: mov.sat(8)  dst2  -dst0
+    *
+    * Expected result: untouched, even though both saturating MOVs read
+    * dst0 with the same negation.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_FALSE(instruction(block0, 0)->saturate);
+   EXPECT_FALSE(instruction(block0, 0)->src[1].negate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_TRUE(instruction(block0, 1)->src[0].negate);
+   EXPECT_TRUE(instruction(block0, 1)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 2)->opcode);
+   EXPECT_TRUE(instruction(block0, 2)->src[0].negate);
+   EXPECT_TRUE(instruction(block0, 2)->saturate);
+}
+
+TEST_F(saturate_propagation_test, abs_mov_sat)
+{
+   const fs_builder &builder = v->bld;
+   fs_reg dst0(v->vgrf(glsl_type::float_type));
+   fs_reg dst1(v->vgrf(glsl_type::float_type));
+   fs_reg src0(v->vgrf(glsl_type::float_type));
+   fs_reg src1(v->vgrf(glsl_type::float_type));
+   builder.ADD(dst0, src0, src1);
+   dst0.abs = true;
+   set_saturate(true, builder.MOV(dst1, dst0));
+
+   /* Program handed to the pass:
+    *
+    *   0: add(8)      dst0  src0  src1
+    *   1: mov.sat(8)  dst1  (abs)dst0
+    *
+    * Expected result: untouched.  The saturating MOV reads dst0 through
+    * an abs source modifier.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_FALSE(instruction(block0, 0)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_TRUE(instruction(block0, 1)->saturate);
+}
+
+TEST_F(saturate_propagation_test, producer_saturates)
+{
+   const fs_builder &builder = v->bld;
+   fs_reg dst0(v->vgrf(glsl_type::float_type));
+   fs_reg dst1(v->vgrf(glsl_type::float_type));
+   fs_reg dst2(v->vgrf(glsl_type::float_type));
+   fs_reg src0(v->vgrf(glsl_type::float_type));
+   fs_reg src1(v->vgrf(glsl_type::float_type));
+   set_saturate(true, builder.ADD(dst0, src0, src1));
+   set_saturate(true, builder.MOV(dst1, dst0));
+   builder.MOV(dst2, dst0);
+
+   /* Program handed to the pass:
+    *
+    *   0: add.sat(8)  dst0  src0  src1
+    *   1: mov.sat(8)  dst1  dst0
+    *   2: mov(8)      dst2  dst0
+    *
+    * Expected result (the ADD already saturates; the first MOV drops .sat):
+    *   0: add.sat(8)  dst0  src0  src1
+    *   1: mov(8)      dst1  dst0
+    *   2: mov(8)      dst2  dst0
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_TRUE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_TRUE(instruction(block0, 0)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_FALSE(instruction(block0, 1)->saturate);
+}
+
+TEST_F(saturate_propagation_test, intervening_saturating_copy)
+{
+   const fs_builder &builder = v->bld;
+   fs_reg dst0(v->vgrf(glsl_type::float_type));
+   fs_reg dst1(v->vgrf(glsl_type::float_type));
+   fs_reg dst2(v->vgrf(glsl_type::float_type));
+   fs_reg src0(v->vgrf(glsl_type::float_type));
+   fs_reg src1(v->vgrf(glsl_type::float_type));
+   builder.ADD(dst0, src0, src1);
+   set_saturate(true, builder.MOV(dst1, dst0));
+   set_saturate(true, builder.MOV(dst2, dst0));
+
+   /* Program handed to the pass:
+    *
+    *   0: add(8)      dst0  src0  src1
+    *   1: mov.sat(8)  dst1  dst0
+    *   2: mov.sat(8)  dst2  dst0
+    *
+    * Expected result (.sat moves from both MOVs onto the ADD):
+    *   0: add.sat(8)  dst0  src0  src1
+    *   1: mov(8)      dst1  dst0
+    *   2: mov(8)      dst2  dst0
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_TRUE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_TRUE(instruction(block0, 0)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_FALSE(instruction(block0, 1)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 2)->opcode);
+   EXPECT_FALSE(instruction(block0, 2)->saturate);
+}
+
+TEST_F(saturate_propagation_test, intervening_dest_write)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dst0 = v->vgrf(glsl_type::vec4_type);
+   fs_reg dst1 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   fs_reg src2 = v->vgrf(glsl_type::vec2_type);
+   bld.ADD(offset(dst0, bld, 2), src0, src1);
+   bld.emit(SHADER_OPCODE_TEX, dst0, src2)
+      ->size_written = 4 * REG_SIZE;
+   set_saturate(true, bld.MOV(dst1, offset(dst0, bld, 2)));
+
+   /* Program handed to the pass:
+    *
+    *   0: add(8)         dst0+2  src0  src1
+    *   1: tex(8) rlen 4  dst0+0  src2
+    *   2: mov.sat(8)     dst1    dst0+2
+    *
+    * Expected result: untouched.  The TEX (size_written = 4 registers
+    * from dst0) sits between the ADD and the saturating MOV.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_FALSE(instruction(block0, 0)->saturate);
+   EXPECT_EQ(SHADER_OPCODE_TEX, instruction(block0, 1)->opcode);
+   EXPECT_FALSE(instruction(block0, 1)->saturate); /* the TEX itself, not instruction 0 again */
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 2)->opcode);
+   EXPECT_TRUE(instruction(block0, 2)->saturate);
+}
+
+TEST_F(saturate_propagation_test, mul_neg_mov_sat_mov_sat)
+{
+   const fs_builder &builder = v->bld;
+   fs_reg dst0(v->vgrf(glsl_type::float_type));
+   fs_reg dst1(v->vgrf(glsl_type::float_type));
+   fs_reg dst2(v->vgrf(glsl_type::float_type));
+   fs_reg src0(v->vgrf(glsl_type::float_type));
+   fs_reg src1(v->vgrf(glsl_type::float_type));
+   builder.MUL(dst0, src0, src1);
+   dst0.negate = true;
+   set_saturate(true, builder.MOV(dst1, dst0));
+   dst0.negate = false;
+   set_saturate(true, builder.MOV(dst2, dst0));
+
+   /* Program handed to the pass:
+    *
+    *   0: mul(8)      dst0  src0  src1
+    *   1: mov.sat(8)  dst1  -dst0
+    *   2: mov.sat(8)  dst2  dst0
+    *
+    * Expected result: untouched.  The two saturating MOVs read dst0 with
+    * opposite negation.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_FALSE(instruction(block0, 0)->saturate);
+   EXPECT_FALSE(instruction(block0, 0)->src[1].negate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_TRUE(instruction(block0, 1)->saturate);
+   EXPECT_TRUE(instruction(block0, 1)->src[0].negate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 2)->opcode);
+   EXPECT_TRUE(instruction(block0, 2)->saturate);
+}
diff --git a/src/intel/compiler/test_vec4_cmod_propagation.cpp b/src/intel/compiler/test_vec4_cmod_propagation.cpp
new file mode 100644
index 00000000000..7d9792b4a55
--- /dev/null
+++ b/src/intel/compiler/test_vec4_cmod_propagation.cpp
@@ -0,0 +1,823 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Based on test_fs_cmod_propagation.cpp
+ */
+
+#include <gtest/gtest.h>
+#include "brw_vec4.h"
+#include "brw_vec4_builder.h"
+#include "brw_cfg.h"
+#include "program/program.h"
+
+using namespace brw;
+
+class cmod_propagation_test : public ::testing::Test { /* gtest fixture for the vec4 cmod-propagation tests */
+ virtual void SetUp(); /* allocates the members below and builds `v` */
+
+public:
+ struct brw_compiler *compiler; /* calloc'ed in SetUp(); its devinfo points at `devinfo` */
+ struct gen_device_info *devinfo; /* calloc'ed in SetUp(); gen is set to 4 there */
+ struct gl_context *ctx; /* calloc'ed in SetUp() */
+ struct gl_shader_program *shader_prog; /* NOTE(review): never assigned in SetUp() -- confirm it is unused */
+ struct brw_vue_prog_data *prog_data; /* calloc'ed in SetUp() and handed to the visitor */
+ vec4_visitor *v; /* visitor the tests emit into and run the pass on */
+};
+
+class cmod_propagation_vec4_visitor : public vec4_visitor
+{ /* Concrete vec4_visitor subclass so the fixture can instantiate a visitor. */
+public:
+ cmod_propagation_vec4_visitor(struct brw_compiler *compiler,
+ nir_shader *shader,
+ struct brw_vue_prog_data *prog_data)
+ : vec4_visitor(compiler, NULL, NULL, prog_data, shader, NULL,
+ false /* presumably no_spills, as annotated in test_vec4_copy_propagation.cpp */, -1)
+ {
+ prog_data->dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; /* set on the caller's prog_data */
+ }
+
+protected:
+ /* Stubs for the pure virtual methods; each one only calls unreachable(). */
+ virtual dst_reg *make_reg_for_system_value(int location)
+ {
+ unreachable("Not reached");
+ }
+
+ virtual void setup_payload()
+ {
+ unreachable("Not reached");
+ }
+
+ virtual void emit_prolog()
+ {
+ unreachable("Not reached");
+ }
+
+ virtual void emit_program_code()
+ {
+ unreachable("Not reached");
+ }
+
+ virtual void emit_thread_end()
+ {
+ unreachable("Not reached");
+ }
+
+ virtual void emit_urb_write_header(int mrf)
+ {
+ unreachable("Not reached");
+ }
+
+ virtual vec4_instruction *emit_urb_write_opcode(bool complete)
+ {
+ unreachable("Not reached");
+ }
+};
+
+
+void cmod_propagation_test::SetUp()
+{
+   ctx = static_cast<gl_context *>(calloc(1, sizeof(*ctx)));
+   compiler = static_cast<brw_compiler *>(calloc(1, sizeof(*compiler)));
+   devinfo = static_cast<gen_device_info *>(calloc(1, sizeof(*devinfo)));
+   prog_data = static_cast<brw_vue_prog_data *>(calloc(1, sizeof(*prog_data)));
+   compiler->devinfo = devinfo;
+
+   /* Fresh MESA_SHADER_VERTEX NIR shader handed to the visitor below. */
+   nir_shader *shader = nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL, NULL);
+
+   v = new cmod_propagation_vec4_visitor(compiler, shader, prog_data);
+
+   devinfo->gen = 4;
+}
+
+static vec4_instruction *
+instruction(bblock_t *block, int num)
+{
+   /* Follow `num` next-links starting from the block's first instruction. */
+   vec4_instruction *cur = (vec4_instruction *)block->start();
+   while (num-- > 0)
+      cur = (vec4_instruction *)cur->next;
+   return cur;
+}
+
+static bool
+cmod_propagation(vec4_visitor *v)
+{
+   /* With TEST_DEBUG set in the environment, dump the IR around the pass. */
+   const bool verbose = getenv("TEST_DEBUG") != NULL;
+   if (verbose) {
+      fprintf(stderr, "= Before =\n");
+      v->dump_instructions();
+   }
+
+   const bool progress = v->opt_cmod_propagation();
+
+   if (verbose) {
+      fprintf(stderr, "\n= After =\n");
+      v->dump_instructions();
+   }
+
+   return progress;
+}
+
+TEST_F(cmod_propagation_test, basic)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest(v, glsl_type::float_type);
+   src_reg src0(v, glsl_type::float_type);
+   src_reg src1(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   dst_reg dest_null = builder.null_reg_f();
+   dest_null.writemask = WRITEMASK_X;
+
+   builder.ADD(dest, src0, src1);
+   builder.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_GE);
+
+   /* Program handed to the pass:
+    *
+    *   0: add        dest.x  src0.xxxx  src1.xxxx
+    *   1: cmp.ge.f0  null.x  dest.xxxx  0.0f
+    *
+    * Expected result (CMP folded into the ADD):
+    *   0: add.ge.f0  dest.x  src0.xxxx  src1.xxxx
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, basic_different_dst_writemask)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest(v, glsl_type::float_type);
+   src_reg src0(v, glsl_type::float_type);
+   src_reg src1(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   dst_reg dest_null = builder.null_reg_f();
+
+   builder.ADD(dest, src0, src1);
+   builder.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_GE);
+
+   /* Program handed to the pass:
+    *
+    *   0: add        dest.x     src0  src1
+    *   1: cmp.ge.f0  null.xyzw  dest  0.0f
+    *
+    * Expected result: untouched.  The ADD writes only .x while the CMP's
+    * null destination keeps its full .xyzw writemask.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, andz_one)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest(v, glsl_type::int_type);
+   src_reg src0(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   src_reg one(brw_imm_d(1));
+
+   builder.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_Z,
+               builder.AND(builder.null_reg_d(), src_reg(dest), one));
+
+   /* Program handed to the pass:
+    *   0: cmp.l.f0  dest:F  src0:F  0F
+    *   1: and.z.f0  null:D  dest:D  1D
+    *
+    * Expected result: untouched.  (The check below spells the AND's
+    * condition as BRW_CONDITIONAL_EQ rather than _Z.)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_EQ, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, non_cmod_instruction)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest(v, glsl_type::uint_type);
+   src_reg src0(v, glsl_type::uint_type);
+   src_reg zero(brw_imm_ud(0u));
+   builder.FBL(dest, src0);
+   builder.CMP(builder.null_reg_ud(), src_reg(dest), zero, BRW_CONDITIONAL_GE);
+
+   /* Program handed to the pass:
+    *
+    *   0: fbl        dest  src0
+    *   1: cmp.ge.f0  null  dest  0u
+    *
+    * Expected result: untouched; both instructions are still present
+    * afterwards and the CMP keeps its .ge.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_FBL, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_flag_write)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest(v, glsl_type::float_type);
+   src_reg src0(v, glsl_type::float_type);
+   src_reg src1(v, glsl_type::float_type);
+   src_reg src2(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   builder.ADD(dest, src0, src1);
+   builder.CMP(builder.null_reg_f(), src2, zero, BRW_CONDITIONAL_GE);
+   builder.CMP(builder.null_reg_f(), src_reg(dest), zero, BRW_CONDITIONAL_GE);
+
+   /* Program handed to the pass:
+    *
+    *   0: add        dest  src0  src1
+    *   1: cmp.ge.f0  null  src2  0.0f
+    *   2: cmp.ge.f0  null  dest  0.0f
+    *
+    * Expected result: untouched.  Instruction 1 writes the flag between
+    * the ADD and the CMP that reads the ADD's result.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_flag_read)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest0(v, glsl_type::float_type);
+   dst_reg dest1(v, glsl_type::float_type);
+   src_reg src0(v, glsl_type::float_type);
+   src_reg src1(v, glsl_type::float_type);
+   src_reg src2(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   builder.ADD(dest0, src0, src1);
+   set_predicate(BRW_PREDICATE_NORMAL, builder.SEL(dest1, src2, zero));
+   builder.CMP(builder.null_reg_f(), src_reg(dest0), zero, BRW_CONDITIONAL_GE);
+
+   /* Program handed to the pass:
+    *
+    *   0: add        dest0  src0   src1
+    *   1: (+f0) sel  dest1  src2   0.0f
+    *   2: cmp.ge.f0  null   dest0  0.0f
+    *
+    * Expected result: untouched.  The predicated SEL reads the flag
+    * between the ADD and the CMP.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_dest_write)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::vec4_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   src_reg src2 = src_reg(v, glsl_type::vec2_type);
+   src_reg zero(brw_imm_f(0.0f));
+   bld.ADD(offset(dest, 8, 2), src0, src1);
+   bld.emit(SHADER_OPCODE_TEX, dest, src2)
+      ->size_written = 4 * REG_SIZE;
+   bld.CMP(bld.null_reg_f(), offset(src_reg(dest), 8, 2), zero, BRW_CONDITIONAL_GE);
+
+   /* Program handed to the pass:
+    *
+    *   0: add         dest+2  src0    src1
+    *   1: tex rlen 4  dest+0  src2
+    *   2: cmp.ge.f0   null    dest+2  0.0f
+    *
+    * Expected result: untouched.  The TEX (size_written = 4 registers
+    * from dest) sits between the ADD and the CMP.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(SHADER_OPCODE_TEX, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 1)->conditional_mod); /* the TEX, not instruction 0 again */
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_flag_read_same_value)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest0(v, glsl_type::float_type);
+   dst_reg dest1(v, glsl_type::float_type);
+   src_reg src0(v, glsl_type::float_type);
+   src_reg src1(v, glsl_type::float_type);
+   src_reg src2(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   dst_reg dest_null = builder.null_reg_f();
+   dest_null.writemask = WRITEMASK_X;
+
+   set_condmod(BRW_CONDITIONAL_GE, builder.ADD(dest0, src0, src1));
+   set_predicate(BRW_PREDICATE_NORMAL, builder.SEL(dest1, src2, zero));
+   builder.CMP(dest_null, src_reg(dest0), zero, BRW_CONDITIONAL_GE);
+
+   /* Program handed to the pass:
+    *
+    *   0: add.ge.f0  dest0   src0   src1
+    *   1: (+f0) sel  dest1   src2   0.0f
+    *   2: cmp.ge.f0  null.x  dest0  0.0f
+    *
+    * Expected result (the CMP is dropped, the rest is unchanged):
+    *   0: add.ge.f0  dest0   src0   src1
+    *   1: (+f0) sel  dest1   src2   0.0f
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate);
+}
+
+TEST_F(cmod_propagation_test, negate)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest(v, glsl_type::float_type);
+   src_reg src0(v, glsl_type::float_type);
+   src_reg src1(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   builder.ADD(dest, src0, src1);
+   src_reg tmp_src(dest);
+   tmp_src.negate = true;
+   dst_reg dest_null = builder.null_reg_f();
+   dest_null.writemask = WRITEMASK_X;
+   builder.CMP(dest_null, tmp_src, zero, BRW_CONDITIONAL_GE);
+
+   /* Program handed to the pass:
+    *
+    *   0: add        dest    src0   src1
+    *   1: cmp.ge.f0  null.x  -dest  0.0f
+    *
+    * Expected result (GE on -dest becomes LE on the ADD):
+    *   0: add.le.f0  dest    src0   src1
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_LE, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, movnz)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest(v, glsl_type::float_type);
+   src_reg src0(v, glsl_type::float_type);
+   src_reg src1(v, glsl_type::float_type);
+   dst_reg dest_null = builder.null_reg_f();
+   dest_null.writemask = WRITEMASK_X;
+
+   builder.CMP(dest, src0, src1, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               builder.MOV(dest_null, src_reg(dest)));
+
+   /* Program handed to the pass:
+    *
+    *   0: cmp.l.f0   dest:F  src0:F  src1:F
+    *   1: mov.nz.f0  null.x  dest:F
+    *
+    * Expected result (the MOV is dropped, the CMP keeps its .l):
+    *   0: cmp.l.f0   dest    src0:F  src1:F
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, different_types_cmod_with_zero)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest(v, glsl_type::int_type);
+   src_reg src0(v, glsl_type::int_type);
+   src_reg src1(v, glsl_type::int_type);
+   src_reg zero(brw_imm_f(0.0f));
+   builder.ADD(dest, src0, src1);
+   builder.CMP(builder.null_reg_f(), retype(src_reg(dest), BRW_REGISTER_TYPE_F),
+               zero, BRW_CONDITIONAL_GE);
+
+   /* Program handed to the pass:
+    *
+    *   0: add        dest:D  src0:D  src1:D
+    *   1: cmp.ge.f0  null:F  dest:F  0.0f
+    *
+    * Expected result: untouched.  The CMP reads dest retyped to F while
+    * the ADD wrote it as D.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, andnz_non_one)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest(v, glsl_type::int_type);
+   src_reg src0(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   src_reg nonone(brw_imm_d(38));
+
+   builder.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               builder.AND(builder.null_reg_d(), src_reg(dest), nonone));
+
+   /* Program handed to the pass:
+    *   0: cmp.l.f0   dest:F  src0:F  0F
+    *   1: and.nz.f0  null:D  dest:D  38D
+    *
+    * Expected result: untouched.  The AND's immediate operand is 38,
+    * not 1.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod);
+}
+
+/* Note that "basic" uses glsl_type::float_type registers, while this test
+ * uses glsl_type::vec4_type. */
+TEST_F(cmod_propagation_test, basic_vec4)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest(v, glsl_type::vec4_type);
+   src_reg src0(v, glsl_type::vec4_type);
+   src_reg src1(v, glsl_type::vec4_type);
+   src_reg zero(brw_imm_f(0.0f));
+
+   builder.MUL(dest, src0, src1);
+   builder.CMP(builder.null_reg_f(), src_reg(dest), zero, BRW_CONDITIONAL_NZ);
+
+   /* Program handed to the pass:
+    *   0: mul          dest.xyzw  src0.xyzw  src1.xyzw
+    *   1: cmp.nz.f0.0  null.xyzw  dest.xyzw  0.0f
+    *
+    * Expected result (CMP folded into the MUL):
+    *   0: mul.nz.f0.0  dest.xyzw  src0.xyzw  src1.xyzw
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, basic_vec4_different_dst_writemask)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest(v, glsl_type::vec4_type);
+   dest.writemask = WRITEMASK_X;
+   src_reg src0(v, glsl_type::vec4_type);
+   src_reg src1(v, glsl_type::vec4_type);
+   src_reg zero(brw_imm_f(0.0f));
+   dst_reg dest_null = builder.null_reg_f();
+
+   builder.MUL(dest, src0, src1);
+   builder.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_NZ);
+
+   /* Program handed to the pass:
+    *   0: mul          dest.x  src0  src1
+    *   1: cmp.nz.f0.0  null    dest  0.0f
+    *
+    * Expected result: untouched.  dest is restricted to WRITEMASK_X but
+    * the null destination's writemask is left as returned by null_reg_f().
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, mad_one_component_vec4)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest(v, glsl_type::vec4_type);
+   dest.writemask = WRITEMASK_X;
+   src_reg src0(v, glsl_type::vec4_type);
+   src_reg src1(v, glsl_type::vec4_type);
+   src_reg src2(v, glsl_type::vec4_type);
+   src0.swizzle = src1.swizzle = src2.swizzle = BRW_SWIZZLE_XXXX;
+   src2.negate = true;
+   src_reg zero(brw_imm_f(0.0f));
+   src_reg tmp(dest);
+   tmp.swizzle = BRW_SWIZZLE_XXXX;
+   dst_reg dest_null = builder.null_reg_f();
+   dest_null.writemask = WRITEMASK_X;
+
+   builder.MAD(dest, src0, src1, src2);
+   builder.CMP(dest_null, tmp, zero, BRW_CONDITIONAL_L);
+
+   /* Program handed to the pass:
+    *
+    *   0: mad         dest.x:F  src0.xxxx:F  src1.xxxx:F  -src2.xxxx:F
+    *   1: cmp.l.f0.0  null.x:F  dest.xxxx:F  0.0f
+    *
+    * Expected result (CMP folded into the single-component MAD):
+    *   0: mad.l.f0    dest.x:F  src0.xxxx:F  src1.xxxx:F  -src2.xxxx:F
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MAD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, mad_more_one_component_vec4)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest(v, glsl_type::vec4_type);
+   dest.writemask = WRITEMASK_XW;
+   src_reg src0(v, glsl_type::vec4_type);
+   src_reg src1(v, glsl_type::vec4_type);
+   src_reg src2(v, glsl_type::vec4_type);
+   src0.swizzle = src1.swizzle = src2.swizzle = BRW_SWIZZLE_XXXX;
+   src2.negate = true;
+   src_reg zero(brw_imm_f(0.0f));
+   src_reg tmp(dest);
+   tmp.swizzle = BRW_SWIZZLE_XXXX;
+   dst_reg dest_null = builder.null_reg_f();
+
+   builder.MAD(dest, src0, src1, src2);
+   builder.CMP(dest_null, tmp, zero, BRW_CONDITIONAL_L);
+
+   /* Program handed to the pass:
+    *
+    *   0: mad         dest.xw:F  src0.xxxx:F  src1.xxxx:F  -src2.xxxx:F
+    *   1: cmp.l.f0.0  null:F     dest.xxxx:F  0.0f
+    *
+    * Expected result: untouched.  The MAD writes two components (.xw)
+    * here.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MAD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, cmp_mov_vec4)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest(v, glsl_type::ivec4_type);
+   dest.writemask = WRITEMASK_X;
+   src_reg src0(v, glsl_type::ivec4_type);
+   src0.swizzle = BRW_SWIZZLE_XXXX;
+   src0.file = UNIFORM;
+   src_reg nonone = retype(brw_imm_d(16), BRW_REGISTER_TYPE_D);
+   src_reg mov_src(dest);
+   mov_src.swizzle = BRW_SWIZZLE_XXXX;
+   dst_reg dest_null = builder.null_reg_d();
+   dest_null.writemask = WRITEMASK_X;
+
+   builder.CMP(dest, src0, nonone, BRW_CONDITIONAL_GE);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               builder.MOV(dest_null, mov_src));
+
+   /* Program handed to the pass:
+    *
+    *   0: cmp.ge.f0  dest.x:D  u.xxxx:D  16D
+    *   1: mov.nz.f0  null.x:D  dest.xxxx:D
+    *
+    * Expected result (the MOV is dropped, the CMP keeps its .ge):
+    *   0: cmp.ge.f0  dest.x:D  u.xxxx:D  16D
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, mul_cmp_different_channels_vec4)
+{
+   const vec4_builder builder = vec4_builder(v).at_end();
+   dst_reg dest(v, glsl_type::vec4_type);
+   src_reg src0(v, glsl_type::vec4_type);
+   src_reg src1(v, glsl_type::vec4_type);
+   src_reg zero(brw_imm_f(0.0f));
+   src_reg cmp_src(dest);
+   cmp_src.swizzle = BRW_SWIZZLE4(0,1,3,2);
+
+   builder.MUL(dest, src0, src1);
+   builder.CMP(builder.null_reg_f(), cmp_src, zero, BRW_CONDITIONAL_NZ);
+
+   /* Program handed to the pass:
+    *   0: mul          dest  src0       src1
+    *   1: cmp.nz.f0.0  null  dest.xywz  0.0f
+    *
+    * Expected result: untouched.  The CMP reads dest through the
+    * BRW_SWIZZLE4(0,1,3,2) swizzle.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod);
+}
diff --git a/src/intel/compiler/test_vec4_copy_propagation.cpp b/src/intel/compiler/test_vec4_copy_propagation.cpp
new file mode 100644
index 00000000000..f4f91d8c8c7
--- /dev/null
+++ b/src/intel/compiler/test_vec4_copy_propagation.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+#include "brw_vec4.h"
+#include "program/program.h"
+
+using namespace brw;
+
+ int ret = 0; /* NOTE(review): not referenced anywhere else in this file;
+ * confirm nothing external needs it before removing. */
+ /**
+ * Fixture for the vec4 copy-propagation tests. SetUp() allocates zeroed
+ * compiler state and creates the visitor `v` that each test emits
+ * instructions into.
+ */
+ class copy_propagation_test : public ::testing::Test {
+ virtual void SetUp();
+
+ public:
+ struct brw_compiler *compiler;
+ struct gen_device_info *devinfo;
+ struct gl_context *ctx;
+ struct gl_shader_program *shader_prog; /* not assigned by SetUp() */
+ struct brw_vue_prog_data *prog_data;
+ vec4_visitor *v; /* visitor under test, created in SetUp() */
+ };
+ /**
+ * Concrete vec4_visitor used only by these tests. The constructor forwards
+ * the compiler, shader and prog_data to vec4_visitor (NULL for the other
+ * pointer arguments) and selects DISPATCH_MODE_4X2_DUAL_OBJECT. Every
+ * virtual hook below is stubbed with unreachable(), so a test that ends up
+ * calling one of them hits "Not reached".
+ */
+ class copy_propagation_vec4_visitor : public vec4_visitor
+ {
+ public:
+ copy_propagation_vec4_visitor(struct brw_compiler *compiler,
+ nir_shader *shader,
+ struct brw_vue_prog_data *prog_data)
+ : vec4_visitor(compiler, NULL, NULL, prog_data, shader, NULL,
+ false /* no_spills */, -1)
+ {
+ prog_data->dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
+ }
+
+ protected:
+ virtual dst_reg *make_reg_for_system_value(int location)
+ {
+ unreachable("Not reached");
+ }
+
+ virtual void setup_payload()
+ {
+ unreachable("Not reached");
+ }
+
+ virtual void emit_prolog()
+ {
+ unreachable("Not reached");
+ }
+
+ virtual void emit_thread_end()
+ {
+ unreachable("Not reached");
+ }
+
+ virtual void emit_urb_write_header(int mrf)
+ {
+ unreachable("Not reached");
+ }
+
+ virtual vec4_instruction *emit_urb_write_opcode(bool complete)
+ {
+ unreachable("Not reached");
+ }
+ };
+
+ /**
+ * Allocates zero-filled context, compiler, device-info and prog-data
+ * structures, creates a MESA_SHADER_VERTEX NIR shader and constructs the
+ * visitor under test.
+ *
+ * NOTE(review): nothing allocated here is released anywhere in this file
+ * (the fixture has no TearDown()); confirm the leak is acceptable for tests.
+ */
+ void copy_propagation_test::SetUp()
+ {
+ ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+ compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+ devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo));
+ prog_data = (struct brw_vue_prog_data *)calloc(1, sizeof(*prog_data));
+ compiler->devinfo = devinfo;
+
+ nir_shader *shader =
+ nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL, NULL);
+
+ v = new copy_propagation_vec4_visitor(compiler, shader, prog_data);
+
+ devinfo->gen = 4; /* NOTE(review): set only after the visitor has been
+ * constructed; confirm the constructor does not
+ * depend on devinfo->gen. */
+ }
+ /**
+ * Runs the pass under test on \p v: computes the CFG and then calls
+ * opt_copy_propagation(). Change `print` to true to write a banner to
+ * stderr and call dump_instructions() before and after the pass.
+ */
+ static void
+ copy_propagation(vec4_visitor *v)
+ {
+ bool print = false; /* debugging aid, off by default */
+
+ if (print) {
+ fprintf(stderr, "instructions before:\n");
+ v->dump_instructions();
+ }
+
+ v->calculate_cfg();
+ v->opt_copy_propagation();
+
+ if (print) {
+ fprintf(stderr, "instructions after:\n");
+ v->dump_instructions();
+ }
+ }
+ /**
+ * Two chained swizzled copies: b = a.yzwx, then c = b.yzwx. After the pass
+ * the second MOV is expected to read `a` directly with the composed swizzle
+ * .zwxy (c.x = b.y = a.z, c.y = b.z = a.w, c.z = b.w = a.x, c.w = b.x = a.y).
+ */
+ TEST_F(copy_propagation_test, test_swizzle_swizzle)
+ {
+ dst_reg a = dst_reg(v, glsl_type::vec4_type);
+ dst_reg b = dst_reg(v, glsl_type::vec4_type);
+ dst_reg c = dst_reg(v, glsl_type::vec4_type);
+
+ v->emit(v->ADD(a, src_reg(a), src_reg(a)));
+
+ v->emit(v->MOV(b, swizzle(src_reg(a), BRW_SWIZZLE4(SWIZZLE_Y,
+ SWIZZLE_Z,
+ SWIZZLE_W,
+ SWIZZLE_X))));
+
+ vec4_instruction *test_mov =
+ v->MOV(c, swizzle(src_reg(b), BRW_SWIZZLE4(SWIZZLE_Y,
+ SWIZZLE_Z,
+ SWIZZLE_W,
+ SWIZZLE_X)));
+ v->emit(test_mov);
+
+ copy_propagation(v);
+
+ EXPECT_EQ(test_mov->src[0].nr, a.nr); /* source rewritten from b to a */
+ EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(SWIZZLE_Z,
+ SWIZZLE_W,
+ SWIZZLE_X,
+ SWIZZLE_Y));
+ }
+ /**
+ * b = a.xyxz, then a.xyz is overwritten with 1.0f, then c = b.wwww.
+ * b.w was copied from a.z, which the second MOV overwrites, so the last MOV
+ * has to keep reading b: the test expects its source register and swizzle
+ * to be untouched by the pass.
+ */
+ TEST_F(copy_propagation_test, test_swizzle_writemask)
+ {
+ dst_reg a = dst_reg(v, glsl_type::vec4_type);
+ dst_reg b = dst_reg(v, glsl_type::vec4_type);
+ dst_reg c = dst_reg(v, glsl_type::vec4_type);
+
+ v->emit(v->MOV(b, swizzle(src_reg(a), BRW_SWIZZLE4(SWIZZLE_X,
+ SWIZZLE_Y,
+ SWIZZLE_X,
+ SWIZZLE_Z))));
+
+ v->emit(v->MOV(writemask(a, WRITEMASK_XYZ), brw_imm_f(1.0f)));
+
+ vec4_instruction *test_mov =
+ v->MOV(c, swizzle(src_reg(b), BRW_SWIZZLE4(SWIZZLE_W,
+ SWIZZLE_W,
+ SWIZZLE_W,
+ SWIZZLE_W)));
+ v->emit(test_mov);
+
+ copy_propagation(v);
+
+ /* should not copy propagate */
+ EXPECT_EQ(test_mov->src[0].nr, b.nr);
+ EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(SWIZZLE_W,
+ SWIZZLE_W,
+ SWIZZLE_W,
+ SWIZZLE_W));
+ }
diff --git a/src/intel/compiler/test_vec4_register_coalesce.cpp b/src/intel/compiler/test_vec4_register_coalesce.cpp
new file mode 100644
index 00000000000..a3dbb0a72e4
--- /dev/null
+++ b/src/intel/compiler/test_vec4_register_coalesce.cpp
@@ -0,0 +1,242 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+#include "brw_vec4.h"
+#include "program/program.h"
+
+using namespace brw;
+
+ int ret = 0; /* NOTE(review): not referenced anywhere else in this file;
+ * confirm nothing external needs it before removing. */
+ /**
+ * Convenience wrapper that calls _register_coalesce() and passes the
+ * caller's __func__ along for the optional debug output.
+ *
+ * NOTE(review): `_register_coalesce` is declared at global scope, where
+ * identifiers beginning with an underscore are reserved in C++; consider
+ * renaming the helper and this macro's expansion together.
+ */
+ #define register_coalesce(v) _register_coalesce(v, __func__)
+ /**
+ * Fixture for the vec4 register-coalescing tests. SetUp() allocates zeroed
+ * compiler state and creates the visitor `v` that each test emits
+ * instructions into.
+ */
+ class register_coalesce_test : public ::testing::Test {
+ virtual void SetUp();
+
+ public:
+ struct brw_compiler *compiler;
+ struct gen_device_info *devinfo;
+ struct gl_context *ctx;
+ struct gl_shader_program *shader_prog; /* not assigned by SetUp() */
+ struct brw_vue_prog_data *prog_data;
+ vec4_visitor *v; /* visitor under test, created in SetUp() */
+ };
+
+ /**
+ * Concrete vec4_visitor used only by these tests. The constructor forwards
+ * the compiler, shader and prog_data to vec4_visitor (NULL for the other
+ * pointer arguments) and selects DISPATCH_MODE_4X2_DUAL_OBJECT. Every
+ * virtual hook below is stubbed with unreachable(), so a test that ends up
+ * calling one of them hits "Not reached".
+ */
+ class register_coalesce_vec4_visitor : public vec4_visitor
+ {
+ public:
+ register_coalesce_vec4_visitor(struct brw_compiler *compiler,
+ nir_shader *shader,
+ struct brw_vue_prog_data *prog_data)
+ : vec4_visitor(compiler, NULL, NULL, prog_data, shader, NULL,
+ false /* no_spills */, -1)
+ {
+ prog_data->dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
+ }
+
+ protected:
+ virtual dst_reg *make_reg_for_system_value(int location)
+ {
+ unreachable("Not reached");
+ }
+
+ virtual void setup_payload()
+ {
+ unreachable("Not reached");
+ }
+
+ virtual void emit_prolog()
+ {
+ unreachable("Not reached");
+ }
+
+ virtual void emit_thread_end()
+ {
+ unreachable("Not reached");
+ }
+
+ virtual void emit_urb_write_header(int mrf)
+ {
+ unreachable("Not reached");
+ }
+
+ virtual vec4_instruction *emit_urb_write_opcode(bool complete)
+ {
+ unreachable("Not reached");
+ }
+ };
+
+ /**
+ * Allocates zero-filled context, compiler, device-info and prog-data
+ * structures, creates a MESA_SHADER_VERTEX NIR shader and constructs the
+ * visitor under test.
+ *
+ * NOTE(review): nothing allocated here is released anywhere in this file
+ * (the fixture has no TearDown()); confirm the leak is acceptable for tests.
+ */
+ void register_coalesce_test::SetUp()
+ {
+ ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+ compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+ devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo));
+ prog_data = (struct brw_vue_prog_data *)calloc(1, sizeof(*prog_data));
+ compiler->devinfo = devinfo;
+
+ nir_shader *shader =
+ nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL, NULL);
+
+ v = new register_coalesce_vec4_visitor(compiler, shader, prog_data);
+
+ devinfo->gen = 4; /* NOTE(review): set only after the visitor has been
+ * constructed; confirm the constructor does not
+ * depend on devinfo->gen. */
+ }
+ /**
+ * Runs the pass under test on \p v: computes the CFG and then calls
+ * opt_register_coalesce(). Change `print` to true to printf() a banner
+ * prefixed with \p func and call dump_instructions() before and after the
+ * pass.
+ */
+ static void
+ _register_coalesce(vec4_visitor *v, const char *func)
+ {
+ bool print = false; /* debugging aid, off by default */
+
+ if (print) {
+ printf("%s: instructions before:\n", func);
+ v->dump_instructions();
+ }
+
+ v->calculate_cfg();
+ v->opt_register_coalesce();
+
+ if (print) {
+ printf("%s: instructions after:\n", func);
+ v->dump_instructions();
+ }
+ }
+
+ /**
+ * A scalar MUL whose result is only read by a MOV into m0.x. After the pass
+ * the MUL itself is expected to have an MRF destination.
+ */
+ TEST_F(register_coalesce_test, test_compute_to_mrf)
+ {
+ src_reg something = src_reg(v, glsl_type::float_type);
+ dst_reg temp = dst_reg(v, glsl_type::float_type);
+
+ dst_reg m0 = dst_reg(MRF, 0);
+ m0.writemask = WRITEMASK_X;
+ m0.type = BRW_REGISTER_TYPE_F;
+
+ vec4_instruction *mul = v->emit(v->MUL(temp, something, brw_imm_f(1.0f)));
+ v->emit(v->MOV(m0, src_reg(temp)));
+
+ register_coalesce(v);
+
+ EXPECT_EQ(mul->dst.file, MRF);
+ }
+
+
+ /**
+ * The MUL result is read twice: once as .xxxx into m0.x and once as .xyzw
+ * into m1. With two readers targeting different MRFs the MUL is expected to
+ * keep a non-MRF destination.
+ */
+ TEST_F(register_coalesce_test, test_multiple_use)
+ {
+ src_reg something = src_reg(v, glsl_type::float_type);
+ dst_reg temp = dst_reg(v, glsl_type::vec4_type);
+
+ dst_reg m0 = dst_reg(MRF, 0);
+ m0.writemask = WRITEMASK_X;
+ m0.type = BRW_REGISTER_TYPE_F;
+
+ dst_reg m1 = dst_reg(MRF, 1);
+ m1.writemask = WRITEMASK_XYZW;
+ m1.type = BRW_REGISTER_TYPE_F;
+
+ src_reg src = src_reg(temp);
+ vec4_instruction *mul = v->emit(v->MUL(temp, something, brw_imm_f(1.0f)));
+ src.swizzle = BRW_SWIZZLE_XXXX;
+ v->emit(v->MOV(m0, src));
+ src.swizzle = BRW_SWIZZLE_XYZW;
+ v->emit(v->MOV(m1, src));
+
+ register_coalesce(v);
+
+ EXPECT_NE(mul->dst.file, MRF);
+ }
+
+ /**
+ * A DP4 into a scalar temporary that is then copied to m0.y. After the pass
+ * the DP4 is expected to write the MRF directly, with its writemask
+ * narrowed to Y.
+ */
+ TEST_F(register_coalesce_test, test_dp4_mrf)
+ {
+ src_reg some_src_1 = src_reg(v, glsl_type::vec4_type);
+ src_reg some_src_2 = src_reg(v, glsl_type::vec4_type);
+
+ dst_reg m0 = dst_reg(MRF, 0);
+ m0.writemask = WRITEMASK_Y;
+ m0.type = BRW_REGISTER_TYPE_F;
+
+ dst_reg temp = dst_reg(v, glsl_type::float_type);
+
+ vec4_instruction *dp4 = v->emit(v->DP4(temp, some_src_1, some_src_2));
+ v->emit(v->MOV(m0, src_reg(temp)));
+
+ register_coalesce(v);
+
+ EXPECT_EQ(dp4->dst.file, MRF);
+ EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y);
+ }
+
+ /**
+ * A DP4 into a scalar temporary that is then copied to the Y channel of
+ * another virtual register `to`. After the pass the DP4 is expected to
+ * write `to` directly (same register number) with writemask Y.
+ */
+ TEST_F(register_coalesce_test, test_dp4_grf)
+ {
+ src_reg some_src_1 = src_reg(v, glsl_type::vec4_type);
+ src_reg some_src_2 = src_reg(v, glsl_type::vec4_type);
+
+ dst_reg to = dst_reg(v, glsl_type::vec4_type);
+ dst_reg temp = dst_reg(v, glsl_type::float_type);
+
+ vec4_instruction *dp4 = v->emit(v->DP4(temp, some_src_1, some_src_2));
+ to.writemask = WRITEMASK_Y;
+ v->emit(v->MOV(to, src_reg(temp)));
+
+ /* if we don't do something with the result, the automatic dead code
+ * elimination will remove all our instructions.
+ */
+ src_reg src = src_reg(to);
+ src.negate = true;
+ v->emit(v->MOV(dst_reg(MRF, 0), src));
+
+ register_coalesce(v);
+
+ EXPECT_EQ(dp4->dst.nr, to.nr);
+ EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y);
+ }
+
+ /**
+ * A MUL into a scalar temporary that is then copied to the Y channel of
+ * another virtual register `to`. After the pass the MUL is expected to
+ * write `to` directly (same register number).
+ */
+ TEST_F(register_coalesce_test, test_channel_mul_grf)
+ {
+ src_reg some_src_1 = src_reg(v, glsl_type::vec4_type);
+ src_reg some_src_2 = src_reg(v, glsl_type::vec4_type);
+
+ dst_reg to = dst_reg(v, glsl_type::vec4_type);
+ dst_reg temp = dst_reg(v, glsl_type::float_type);
+
+ vec4_instruction *mul = v->emit(v->MUL(temp, some_src_1, some_src_2));
+ to.writemask = WRITEMASK_Y;
+ v->emit(v->MOV(to, src_reg(temp)));
+
+ /* if we don't do something with the result, the automatic dead code
+ * elimination will remove all our instructions.
+ */
+ src_reg src = src_reg(to);
+ src.negate = true;
+ v->emit(v->MOV(dst_reg(MRF, 0), src));
+
+ register_coalesce(v);
+
+ EXPECT_EQ(mul->dst.nr, to.nr);
+ }
diff --git a/src/intel/compiler/test_vf_float_conversions.cpp b/src/intel/compiler/test_vf_float_conversions.cpp
new file mode 100644
index 00000000000..7af97d0d097
--- /dev/null
+++ b/src/intel/compiler/test_vf_float_conversions.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+ #include <gtest/gtest.h>
+ #include <math.h>
+ #include <string.h>
+ #include "brw_reg.h"
+ /**
+ * Fixture holding a reference decode table for VF values, computed
+ * independently of brw_vf_to_float()/brw_float_to_vf() in SetUp().
+ */
+ class vf_float_conversion_test : public ::testing::Test {
+ virtual void SetUp();
+
+ public:
+ float vf_to_float[128]; /* indexed by the low 7 bits of a VF value; the
+ * tests negate the entry when bit 7 is set */
+ };
+ /**
+ * Fills vf_to_float[] with the expected magnitude for each 7-bit encoding:
+ * bits 6:4 are the exponent (biased by 3) and bits 3:0 the mantissa, giving
+ * (1 + mbits / 16) * 2^(ebits - 3). Entry 0 is the exception and is 0.0.
+ */
+ void vf_float_conversion_test::SetUp() {
+ /* 0 is special cased. */
+ vf_to_float[0] = 0.0;
+
+ for (int vf = 1; vf < 128; vf++) {
+ int ebits = (vf >> 4) & 0x7;
+ int mbits = vf & 0xf;
+
+ float x = 1.0f + mbits / 16.0f; /* implicit leading one + 4-bit fraction */
+ int exp = ebits - 3; /* remove the exponent bias */
+
+ vf_to_float[vf] = ldexpf(x, exp); /* x * 2^exp */
+ }
+ }
+
+ /**
+ * Returns the bit pattern of a float as an unsigned integer so the tests
+ * can compare floats exactly; +0.0f and -0.0f, for example, compare equal as
+ * floats but have different bit patterns.
+ *
+ * The bits are copied with memcpy() rather than read back through a union:
+ * reading a union member other than the one last written is undefined
+ * behaviour in ISO C++. Assumes float and unsigned have the same size.
+ */
+ static unsigned
+ f2u(float f)
+ {
+ unsigned u = 0;
+ memcpy(&u, &f, sizeof(u));
+ return u;
+ }
+
+ /* Decodes all 256 VF values with brw_vf_to_float() and compares each
+ * result, bit for bit, against the reference table (bit 7 is the sign).
+ */
+ TEST_F(vf_float_conversion_test, test_vf_to_float)
+ {
+ for (int vf = 0; vf < 256; vf++) {
+ const float magnitude = vf_to_float[vf & 0x7f];
+ const float expected = (vf & 0x80) ? -magnitude : magnitude;
+
+ EXPECT_EQ(f2u(expected), f2u(brw_vf_to_float(vf)));
+ }
+ }
+
+ /* Encodes the reference float for each of the 256 VF values with
+ * brw_float_to_vf() and expects to get the original VF value back.
+ */
+ TEST_F(vf_float_conversion_test, test_float_to_vf)
+ {
+ for (int vf = 0; vf < 256; vf++) {
+ const float magnitude = vf_to_float[vf & 0x7f];
+ const float f = (vf & 0x80) ? -magnitude : magnitude;
+
+ EXPECT_EQ(vf, brw_float_to_vf(f));
+ }
+ }
+ /**
+ * Covers the zero special case: brw_float_to_vf() is expected to return -1
+ * for +/-0.125f, while +/-0.0f must survive an encode/decode round trip with
+ * the sign preserved (compared as bit patterns via f2u()).
+ */
+ TEST_F(vf_float_conversion_test, test_special_case_0)
+ {
+ /* ±0.0f are special cased to the VFs that would otherwise correspond
+ * to ±0.125f. Make sure we can't convert these values to VF.
+ */
+ EXPECT_EQ(brw_float_to_vf(+0.125f), -1);
+ EXPECT_EQ(brw_float_to_vf(-0.125f), -1);
+
+ EXPECT_EQ(f2u(brw_vf_to_float(brw_float_to_vf(+0.0f))), f2u(+0.0f));
+ EXPECT_EQ(f2u(brw_vf_to_float(brw_float_to_vf(-0.0f))), f2u(-0.0f));
+ }
+ /**
+ * brw_float_to_vf() is expected to return -1 for inputs with no exact VF
+ * encoding. With a 3-bit exponent and 4-bit mantissa, i.e.
+ * (1 + m / 16) * 2^(e - 3), the largest magnitude is 31.0 and neighbouring
+ * values are 1.0 apart in [16, 32), 0.5 apart in [8, 16) and 0.25 apart in
+ * [4, 8).
+ */
+ TEST_F(vf_float_conversion_test, test_nonrepresentable_float_input)
+ {
+ EXPECT_EQ(brw_float_to_vf(+32.0f), -1); /* above the largest magnitude */
+ EXPECT_EQ(brw_float_to_vf(-32.0f), -1);
+
+ EXPECT_EQ(brw_float_to_vf(+16.5f), -1); /* between 16.0 and 17.0 */
+ EXPECT_EQ(brw_float_to_vf(-16.5f), -1);
+
+ EXPECT_EQ(brw_float_to_vf(+8.25f), -1); /* between 8.0 and 8.5 */
+ EXPECT_EQ(brw_float_to_vf(-8.25f), -1);
+
+ EXPECT_EQ(brw_float_to_vf(+4.125f), -1); /* between 4.0 and 4.25 */
+ EXPECT_EQ(brw_float_to_vf(-4.125f), -1);
+ }